From 6aea938f54ca785ce3bea67a8948698225c6ca58 Mon Sep 17 00:00:00 2001
From: Julien Brunel <brunel@diku.dk>
Date: Fri, 10 Oct 2008 12:00:19 -0700
Subject: RDMA/ucma: Test ucma_alloc_multicast() return against NULL, not with
 IS_ERR()

In case of error, the function ucma_alloc_multicast() returns a NULL
pointer, but never returns an ERR pointer.  So after a call to this
function, an IS_ERR test should be replaced by a NULL test.

The semantic match that finds this problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@match bad_is_err_test@
expression x, E;
@@

x = ucma_alloc_multicast(...)
... when != x = E
IS_ERR(x)
// </smpl>

Signed-off-by:  Julien Brunel <brunel@diku.dk>
Signed-off-by:  Julia Lawall <julia@diku.dk>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/core/ucma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 3ddacf39b7b..4346a24568f 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -904,8 +904,8 @@ static ssize_t ucma_join_multicast(struct ucma_file *file,
 
 	mutex_lock(&file->mut);
 	mc = ucma_alloc_multicast(ctx);
-	if (IS_ERR(mc)) {
-		ret = PTR_ERR(mc);
+	if (!mc) {
+		ret = -ENOMEM;
 		goto err1;
 	}
 
-- 
cgit v1.2.3


From a3cdcbfa8fb1fccfe48d359da86e99546610c562 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Fri, 10 Oct 2008 12:01:37 -0700
Subject: mlx4_core: Add QP range reservation support

To allow allocating an aligned range of consecutive QP numbers, add an
interface to reserve an aligned range of QP numbers and have the QP
allocation function always take a QP number.

This will be used for RSS support in the mlx4_en Ethernet driver and
also potentially by IPoIB RSS support.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/qp.c | 21 ++++++++++--
 drivers/net/mlx4/alloc.c        | 74 ++++++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4.h         |  2 ++
 drivers/net/mlx4/qp.c           | 45 +++++++++++++++++--------
 include/linux/mlx4/device.h     |  5 ++-
 5 files changed, 129 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index baa01deb243..39167a797f9 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -451,6 +451,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
 {
+	int qpn;
 	int err;
 
 	mutex_init(&qp->mutex);
@@ -545,9 +546,17 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 		}
 	}
 
-	err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+	if (sqpn) {
+		qpn = sqpn;
+	} else {
+		err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+		if (err)
+			goto err_wrid;
+	}
+
+	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
 	if (err)
-		goto err_wrid;
+		goto err_qpn;
 
 	/*
 	 * Hardware wants QPN written in big-endian order (after
@@ -560,6 +569,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 
 	return 0;
 
+err_qpn:
+	if (!sqpn)
+		mlx4_qp_release_range(dev->dev, qpn, 1);
+
 err_wrid:
 	if (pd->uobject) {
 		if (!init_attr->srq)
@@ -655,6 +668,10 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 	mlx4_ib_unlock_cqs(send_cq, recv_cq);
 
 	mlx4_qp_free(dev->dev, &qp->mqp);
+
+	if (!is_sqp(dev, qp))
+		mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
 	if (is_user) {
diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index 096bca54bcf..e6c0d5bb5dc 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -65,10 +65,82 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
 
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
 {
+	mlx4_bitmap_free_range(bitmap, obj, 1);
+}
+
+static unsigned long find_aligned_range(unsigned long *bitmap,
+					u32 start, u32 nbits,
+					int len, int align)
+{
+	unsigned long end, i;
+
+again:
+	start = ALIGN(start, align);
+
+	while ((start < nbits) && test_bit(start, bitmap))
+		start += align;
+
+	if (start >= nbits)
+		return -1;
+
+	end = start+len;
+	if (end > nbits)
+		return -1;
+
+	for (i = start + 1; i < end; i++) {
+		if (test_bit(i, bitmap)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	return start;
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+{
+	u32 obj, i;
+
+	if (likely(cnt == 1 && align == 1))
+		return mlx4_bitmap_alloc(bitmap);
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_aligned_range(bitmap->table, bitmap->last,
+				 bitmap->max, cnt, align);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+		obj = find_aligned_range(bitmap->table, 0,
+					 bitmap->max,
+					 cnt, align);
+	}
+
+	if (obj < bitmap->max) {
+		for (i = 0; i < cnt; i++)
+			set_bit(obj + i, bitmap->table);
+		if (obj == bitmap->last) {
+			bitmap->last = (obj + cnt);
+			if (bitmap->last >= bitmap->max)
+				bitmap->last = 0;
+		}
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
+{
+	u32 i;
+
 	obj &= bitmap->max - 1;
 
 	spin_lock(&bitmap->lock);
-	clear_bit(obj, bitmap->table);
+	for (i = 0; i < cnt; i++)
+		clear_bit(obj + i, bitmap->table);
 	bitmap->last = min(bitmap->last, obj);
 	bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
 	spin_unlock(&bitmap->lock);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 5337e3ac3e7..b55ddab73f6 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -288,6 +288,8 @@ static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
 
 u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
 int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved);
 void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
 
diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c
index c49a86044bf..98e0c40ba36 100644
--- a/drivers/net/mlx4/qp.c
+++ b/drivers/net/mlx4/qp.c
@@ -147,19 +147,42 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 
-int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp)
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int qpn;
+
+	qpn = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+	if (qpn == -1)
+		return -ENOMEM;
+
+	*base = qpn;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
+
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	if (base_qpn < dev->caps.sqp_start + 8)
+		return;
+
+	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
 	int err;
 
-	if (sqpn)
-		qp->qpn = sqpn;
-	else {
-		qp->qpn = mlx4_bitmap_alloc(&qp_table->bitmap);
-		if (qp->qpn == -1)
-			return -ENOMEM;
-	}
+	if (!qpn)
+		return -EINVAL;
+
+	qp->qpn = qpn;
 
 	err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn);
 	if (err)
@@ -208,9 +231,6 @@ err_put_qp:
 	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
 
 err_out:
-	if (!sqpn)
-		mlx4_bitmap_free(&qp_table->bitmap, qp->qpn);
-
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
@@ -239,9 +259,6 @@ void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
 	mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
 	mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
 	mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
-
-	if (qp->qpn >= dev->caps.sqp_start + 8)
-		mlx4_bitmap_free(&qp_table->bitmap, qp->qpn);
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_free);
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b2f94446831..d21e879f3c9 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -400,7 +400,10 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  int collapsed);
 void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
 
-int mlx4_qp_alloc(struct mlx4_dev *dev, int sqpn, struct mlx4_qp *qp);
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp);
 void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
 
 int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt,
-- 
cgit v1.2.3


From 0540bbbe455e123a1692d26205ad1a29983883b0 Mon Sep 17 00:00:00 2001
From: Hoang-Nam Nguyen <hnguyen@linux.vnet.ibm.com>
Date: Fri, 10 Oct 2008 14:40:39 -0700
Subject: IB/ehca: Don't allow creating UC QP with SRQ

This patch prevents a UC QP to be created attached to an SRQ, since
current firmware does not support this feature.

Signed-off-by: Michael Faath <micfaath@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/ehca/ehca_qp.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 4dbe2870e01..40b578d601c 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -502,6 +502,12 @@ static struct ehca_qp *internal_create_qp(
 	if (init_attr->srq) {
 		my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
 
+		if (qp_type == IB_QPT_UC) {
+			ehca_err(pd->device, "UC with SRQ not supported");
+			atomic_dec(&shca->num_qps);
+			return ERR_PTR(-EINVAL);
+		}
+
 		has_srq = 1;
 		parms.ext_type = EQPT_SRQBASE;
 		parms.srq_qpn = my_srq->real_qp_num;
-- 
cgit v1.2.3


From 2767840a5ca73fde62b25e0209aa9269ec4fa7c7 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Fri, 10 Oct 2008 15:58:52 -0700
Subject: IPoIB: Always initialize poll_timer to avoid crash on unload

ipoib_ib_dev_stop() does del_timer_sync(&priv->poll_timer), but if a
P_key for an interface is not found, poll_timer is not initialized, so
this leads to a crash or hang.  Fix this by moving where poll_timer is
initialized to ipoib_ib_dev_init(), which is always called.

This fixes <https://bugs.openfabrics.org/show_bug.cgi?id=1172>.

Debugged-by: Yosef Etigin <yosefe@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 0e748aeeae9..28eb6f03c58 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -685,10 +685,6 @@ int ipoib_ib_dev_open(struct net_device *dev)
 	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
 			   round_jiffies_relative(HZ));
 
-	init_timer(&priv->poll_timer);
-	priv->poll_timer.function = ipoib_ib_tx_timer_func;
-	priv->poll_timer.data = (unsigned long)dev;
-
 	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
 	return 0;
@@ -906,6 +902,9 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 		return -ENODEV;
 	}
 
+	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
+		    (unsigned long) dev);
+
 	if (dev->flags & IFF_UP) {
 		if (ipoib_ib_dev_open(dev)) {
 			ipoib_transport_dev_cleanup(dev);
-- 
cgit v1.2.3


From 528051746b24dd214883db11bcbb0e667f60447d Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Tue, 14 Oct 2008 14:05:36 -0700
Subject: IB/mad: Use krealloc() to resize snoop table

Use krealloc() instead of kmalloc() followed by memcpy() when resizing
the MAD module's snoop table.

Also put parentheses around the new table size to avoid calculating
the wrong size to allocate, which fixes a bug pointed out by Haven
Hash <haven.hash@isilon.com>.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/core/mad.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 49c45feccd5..5c54fc2350b 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -406,19 +406,15 @@ static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
 
 	if (i == qp_info->snoop_table_size) {
 		/* Grow table. */
-		new_snoop_table = kmalloc(sizeof mad_snoop_priv *
-					  qp_info->snoop_table_size + 1,
-					  GFP_ATOMIC);
+		new_snoop_table = krealloc(qp_info->snoop_table,
+					   sizeof mad_snoop_priv *
+					   (qp_info->snoop_table_size + 1),
+					   GFP_ATOMIC);
 		if (!new_snoop_table) {
 			i = -ENOMEM;
 			goto out;
 		}
-		if (qp_info->snoop_table) {
-			memcpy(new_snoop_table, qp_info->snoop_table,
-			       sizeof mad_snoop_priv *
-			       qp_info->snoop_table_size);
-			kfree(qp_info->snoop_table);
-		}
+
 		qp_info->snoop_table = new_snoop_table;
 		qp_info->snoop_table_size++;
 	}
-- 
cgit v1.2.3


From dc35fac9e936c6cc6ad825fc7e4455468d10adc6 Mon Sep 17 00:00:00 2001
From: Steve Wise <swise@opengridcomputing.com>
Date: Wed, 15 Oct 2008 10:50:34 -0700
Subject: RDMA/cxgb3: Remove cmid reference on tid allocation failures

The error path in iwch_connect() can fail to drop the cmid reference,
which will cause the process to hang when destroying the cmid.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/cxgb3/iwch_cm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index c325c44807e..44e936e48a3 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -1942,6 +1942,7 @@ fail4:
 fail3:
 	cxgb3_free_atid(ep->com.tdev, ep->atid);
 fail2:
+	cm_id->rem_ref(cm_id);
 	put_ep(&ep->com);
 out:
 	return err;
-- 
cgit v1.2.3


From 7b49d9548e59bbb23cd8bd433ca30125eb28987e Mon Sep 17 00:00:00 2001
From: Roland Dreier <rolandd@cisco.com>
Date: Thu, 16 Oct 2008 15:39:14 -0700
Subject: Update NetEffect maintainer emails to Intel emails

Intel acquired NetEffect, so update the nes driver maintainer contacts
to the new @intel.com addresses.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8dae4555f10..a47bff2a436 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2899,9 +2899,9 @@ S:	Maintained
 
 NETEFFECT IWARP RNIC DRIVER (IW_NES)
 P:	Faisal Latif
-M:	flatif@neteffect.com
+M:	faisal.latif@intel.com
 P:	Chien Tung
-M:	ctung@neteffect.com
+M:	chien.tin.tung@intel.com
 L:	general@lists.openfabrics.org
 W:	http://www.neteffect.com
 S:	Supported
-- 
cgit v1.2.3


From 93fc9e1bb6507dde945c2eab68c93e1066ac3691 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 10:25:29 -0700
Subject: mlx4_core: Support multiple pre-reserved QP regions

For ethernet support, we need to reserve QPs for the ethernet and
fibre channel driver.  The QPs are reserved at the end of the QP
table.  (This way we assure that they are aligned to their size)

We need to consider these reserved ranges in bitmap creation, so we
extend the mlx4 bitmap utility functions to allow reserved ranges at
both the bottom and the top of the range.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/alloc.c    | 29 +++++++++++++--------
 drivers/net/mlx4/cq.c       |  2 +-
 drivers/net/mlx4/eq.c       |  2 +-
 drivers/net/mlx4/fw.c       |  5 ++++
 drivers/net/mlx4/fw.h       |  2 ++
 drivers/net/mlx4/main.c     | 62 ++++++++++++++++++++++++++++++++++++++++-----
 drivers/net/mlx4/mcg.c      |  4 +--
 drivers/net/mlx4/mlx4.h     |  4 ++-
 drivers/net/mlx4/mr.c       |  2 +-
 drivers/net/mlx4/pd.c       |  4 +--
 drivers/net/mlx4/qp.c       | 36 ++++++++++++++++++++++++--
 drivers/net/mlx4/srq.c      |  2 +-
 include/linux/mlx4/device.h | 19 +++++++++++++-
 13 files changed, 144 insertions(+), 29 deletions(-)

diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index e6c0d5bb5dc..e2bc7ecf162 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -47,13 +47,16 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
 
 	obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
 	if (obj >= bitmap->max) {
-		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
 		obj = find_first_zero_bit(bitmap->table, bitmap->max);
 	}
 
 	if (obj < bitmap->max) {
 		set_bit(obj, bitmap->table);
-		bitmap->last = (obj + 1) & (bitmap->max - 1);
+		bitmap->last = (obj + 1);
+		if (bitmap->last == bitmap->max)
+			bitmap->last = 0;
 		obj |= bitmap->top;
 	} else
 		obj = -1;
@@ -109,9 +112,9 @@ u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
 	obj = find_aligned_range(bitmap->table, bitmap->last,
 				 bitmap->max, cnt, align);
 	if (obj >= bitmap->max) {
-		bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
-		obj = find_aligned_range(bitmap->table, 0,
-					 bitmap->max,
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
 					 cnt, align);
 	}
 
@@ -136,17 +139,19 @@ void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
 {
 	u32 i;
 
-	obj &= bitmap->max - 1;
+	obj &= bitmap->max + bitmap->reserved_top - 1;
 
 	spin_lock(&bitmap->lock);
 	for (i = 0; i < cnt; i++)
 		clear_bit(obj + i, bitmap->table);
 	bitmap->last = min(bitmap->last, obj);
-	bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+	bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+			& bitmap->mask;
 	spin_unlock(&bitmap->lock);
 }
 
-int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved)
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top)
 {
 	int i;
 
@@ -156,14 +161,16 @@ int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved
 
 	bitmap->last = 0;
 	bitmap->top  = 0;
-	bitmap->max  = num;
+	bitmap->max  = num - reserved_top;
 	bitmap->mask = mask;
+	bitmap->reserved_top = reserved_top;
 	spin_lock_init(&bitmap->lock);
-	bitmap->table = kzalloc(BITS_TO_LONGS(num) * sizeof (long), GFP_KERNEL);
+	bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
+				sizeof (long), GFP_KERNEL);
 	if (!bitmap->table)
 		return -ENOMEM;
 
-	for (i = 0; i < reserved; ++i)
+	for (i = 0; i < reserved_bot; ++i)
 		set_bit(i, bitmap->table);
 
 	return 0;
diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c
index 9bb50e3f897..b7ad2829d67 100644
--- a/drivers/net/mlx4/cq.c
+++ b/drivers/net/mlx4/cq.c
@@ -300,7 +300,7 @@ int mlx4_init_cq_table(struct mlx4_dev *dev)
 	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
 
 	err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
-			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs);
+			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index 8a8b56135a5..de169338cd9 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -558,7 +558,7 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 	int i;
 
 	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
-			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs);
+			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 7e32955da98..40d8142c23b 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -357,6 +357,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_PORT_MTU_OFFSET			0x01
 #define QUERY_PORT_WIDTH_OFFSET			0x06
 #define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
 #define QUERY_PORT_MAX_VL_OFFSET		0x0b
 
 		for (i = 1; i <= dev_cap->num_ports; ++i) {
@@ -374,6 +375,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
 			MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
 			dev_cap->max_vl[i]	   = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+			dev_cap->log_max_macs[i]  = field & 0xf;
+			dev_cap->log_max_vlans[i] = field >> 4;
+
 		}
 	}
 
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index decbb5c2ad4..c34e726d66e 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -102,6 +102,8 @@ struct mlx4_dev_cap {
 	u32 reserved_lkey;
 	u64 max_icm_sz;
 	int max_gso_sz;
+	u8  log_max_macs[MLX4_MAX_PORTS + 1];
+	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_adapter {
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 1252a919de2..560e1962212 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -85,6 +85,19 @@ static struct mlx4_profile default_profile = {
 	.num_mtt	= 1 << 20,
 };
 
+static int log_num_mac = 2;
+module_param_named(log_num_mac, log_num_mac, int, 0444);
+MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
+
+static int log_num_vlan;
+module_param_named(log_num_vlan, log_num_vlan, int, 0444);
+MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)");
+
+static int use_prio;
+module_param_named(use_prio, use_prio, bool, 0444);
+MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
+		  "(0/1, default 0)");
+
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -134,7 +147,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_rq_sg	     = dev_cap->max_rq_sg;
 	dev->caps.max_wqes	     = dev_cap->max_qp_sz;
 	dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
-	dev->caps.reserved_qps	     = dev_cap->reserved_qps;
 	dev->caps.max_srq_wqes	     = dev_cap->max_srq_sz;
 	dev->caps.max_srq_sge	     = dev_cap->max_rq_sg - 1;
 	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
@@ -163,6 +175,39 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 
+	dev->caps.log_num_macs  = log_num_mac;
+	dev->caps.log_num_vlans = log_num_vlan;
+	dev->caps.log_num_prios = use_prio ? 3 : 0;
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
+			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+			mlx4_warn(dev, "Requested number of MACs is too much "
+				  "for port %d, reducing to %d.\n",
+				  i, 1 << dev->caps.log_num_macs);
+		}
+		if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) {
+			dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+			mlx4_warn(dev, "Requested number of VLANs is too much "
+				  "for port %d, reducing to %d.\n",
+				  i, 1 << dev->caps.log_num_vlans);
+		}
+	}
+
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
+		(1 << dev->caps.log_num_macs) *
+		(1 << dev->caps.log_num_vlans) *
+		(1 << dev->caps.log_num_prios) *
+		dev->caps.num_ports;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH];
+
 	return 0;
 }
 
@@ -211,7 +256,8 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
 				  ((u64) (MLX4_CMPT_TYPE_QP *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
 				  cmpt_entry_sz, dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err)
 		goto err;
 
@@ -336,7 +382,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->qpc_base,
 				  dev_cap->qpc_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
 		goto err_unmap_dmpt;
@@ -346,7 +393,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->auxc_base,
 				  dev_cap->aux_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
 		goto err_unmap_qp;
@@ -356,7 +404,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->altc_base,
 				  dev_cap->altc_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
 		goto err_unmap_auxc;
@@ -366,7 +415,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 				  init_hca->rdmarc_base,
 				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0, 0);
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
 		goto err_unmap_altc;
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
index c83f88ce073..592c01ae2c5 100644
--- a/drivers/net/mlx4/mcg.c
+++ b/drivers/net/mlx4/mcg.c
@@ -368,8 +368,8 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
 
-	err = mlx4_bitmap_init(&priv->mcg_table.bitmap,
-			       dev->caps.num_amgms, dev->caps.num_amgms - 1, 0);
+	err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
+			       dev->caps.num_amgms - 1, 0, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index b55ddab73f6..9e2f44c3181 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -111,6 +111,7 @@ struct mlx4_bitmap {
 	u32			last;
 	u32			top;
 	u32			max;
+	u32                     reserved_top;
 	u32			mask;
 	spinlock_t		lock;
 	unsigned long	       *table;
@@ -290,7 +291,8 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
 u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
 void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
-int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 resetrved_top);
 void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
 
 int mlx4_reset(struct mlx4_dev *dev);
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index d1dd5b48dbd..0caf74cae8b 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -461,7 +461,7 @@ int mlx4_init_mr_table(struct mlx4_dev *dev)
 	int err;
 
 	err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
-			       ~0, dev->caps.reserved_mrws);
+			       ~0, dev->caps.reserved_mrws, 0);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c
index aa616892d09..26d1a7a9e37 100644
--- a/drivers/net/mlx4/pd.c
+++ b/drivers/net/mlx4/pd.c
@@ -62,7 +62,7 @@ int mlx4_init_pd_table(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
-				(1 << 24) - 1, dev->caps.reserved_pds);
+				(1 << 24) - 1, dev->caps.reserved_pds, 0);
 }
 
 void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
@@ -100,7 +100,7 @@ int mlx4_init_uar_table(struct mlx4_dev *dev)
 
 	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
 				dev->caps.num_uars, dev->caps.num_uars - 1,
-				max(128, dev->caps.reserved_uars));
+				max(128, dev->caps.reserved_uars), 0);
 }
 
 void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c
index 98e0c40ba36..1c565ef8d17 100644
--- a/drivers/net/mlx4/qp.c
+++ b/drivers/net/mlx4/qp.c
@@ -272,6 +272,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
 	int err;
+	int reserved_from_top = 0;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
@@ -281,9 +282,40 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	 * block of special QPs must be aligned to a multiple of 8, so
 	 * round up.
 	 */
-	dev->caps.sqp_start = ALIGN(dev->caps.reserved_qps, 8);
+	dev->caps.sqp_start =
+		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+
+	{
+		int sort[MLX4_NUM_QP_REGION];
+		int i, j, tmp;
+		int last_base = dev->caps.num_qps;
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
+			sort[i] = i;
+
+		for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
+			for (j = 2; j < i; ++j) {
+				if (dev->caps.reserved_qps_cnt[sort[j]] >
+				    dev->caps.reserved_qps_cnt[sort[j - 1]]) {
+					tmp             = sort[j];
+					sort[j]         = sort[j - 1];
+					sort[j - 1]     = tmp;
+				}
+			}
+		}
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
+			dev->caps.reserved_qps_base[sort[i]] = last_base;
+			reserved_from_top +=
+				dev->caps.reserved_qps_cnt[sort[i]];
+		}
+
+	}
+
 	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
-			       (1 << 24) - 1, dev->caps.sqp_start + 8);
+			       (1 << 23) - 1, dev->caps.sqp_start + 8,
+			       reserved_from_top);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c
index 533eb6db24b..fe9f218691f 100644
--- a/drivers/net/mlx4/srq.c
+++ b/drivers/net/mlx4/srq.c
@@ -245,7 +245,7 @@ int mlx4_init_srq_table(struct mlx4_dev *dev)
 	INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC);
 
 	err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
-			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs);
+			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0);
 	if (err)
 		return err;
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index d21e879f3c9..693f93cd29e 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -145,6 +145,18 @@ enum {
 	MLX4_MTT_FLAG_PRESENT		= 1
 };
 
+enum mlx4_qp_region {
+	MLX4_QP_REGION_FW = 0,
+	MLX4_QP_REGION_ETH_ADDR,
+	MLX4_QP_REGION_FC_ADDR,
+	MLX4_QP_REGION_FC_EXCH,
+	MLX4_NUM_QP_REGION
+};
+
+enum {
+	MLX4_NUM_FEXCH          = 64 * 1024,
+};
+
 static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
 {
 	return (major << 32) | (minor << 16) | subminor;
@@ -169,7 +181,6 @@ struct mlx4_caps {
 	int			max_rq_desc_sz;
 	int			max_qp_init_rdma;
 	int			max_qp_dest_rdma;
-	int			reserved_qps;
 	int			sqp_start;
 	int			num_srqs;
 	int			max_srq_wqes;
@@ -201,6 +212,12 @@ struct mlx4_caps {
 	u16			stat_rate_support;
 	u8			port_width_cap[MLX4_MAX_PORTS + 1];
 	int			max_gso_sz;
+	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
+	int			reserved_qps;
+	int                     reserved_qps_base[MLX4_NUM_QP_REGION];
+	int                     log_num_macs;
+	int                     log_num_vlans;
+	int                     log_num_prios;
 };
 
 struct mlx4_buf_list {
-- 
cgit v1.2.3


From b79acb49de6c2ab9ff0245f0f2b573d48b9a2d93 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 10:56:48 -0700
Subject: mlx4_core: Get ethernet MTU and default address from firmware

Get maximum ethernet MTU and default MAC address from the firmware
QUERY_DEV_CAP command.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/fw.c       | 13 ++++++++-----
 drivers/net/mlx4/fw.h       |  4 +++-
 drivers/net/mlx4/main.c     |  4 +++-
 include/linux/mlx4/device.h |  4 +++-
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 40d8142c23b..8d402db9a03 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -346,7 +346,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
 			dev_cap->max_vl[i]	   = field >> 4;
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
-			dev_cap->max_mtu[i]	   = field >> 4;
+			dev_cap->ib_mtu[i]	   = field >> 4;
 			dev_cap->max_port_width[i] = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
 			dev_cap->max_gids[i]	   = 1 << (field & 0xf);
@@ -355,8 +355,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		}
 	} else {
 #define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
 #define QUERY_PORT_WIDTH_OFFSET			0x06
 #define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAC_OFFSET			0x08
 #define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
 #define QUERY_PORT_MAX_VL_OFFSET		0x0b
 
@@ -367,7 +369,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 				goto out;
 
 			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
-			dev_cap->max_mtu[i]	   = field & 0xf;
+			dev_cap->ib_mtu[i]	   = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
 			dev_cap->max_port_width[i] = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
@@ -378,7 +380,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
 			dev_cap->log_max_macs[i]  = field & 0xf;
 			dev_cap->log_max_vlans[i] = field >> 4;
-
+			MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
+			MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
 		}
 	}
 
@@ -412,7 +415,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
 		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
 	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
-		 dev_cap->local_ca_ack_delay, 128 << dev_cap->max_mtu[1],
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
 		 dev_cap->max_port_width[1]);
 	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
 		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
@@ -824,7 +827,7 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
 		flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
 		MLX4_PUT(inbox, flags,		  INIT_PORT_FLAGS_OFFSET);
 
-		field = 128 << dev->caps.mtu_cap[port];
+		field = 128 << dev->caps.ib_mtu_cap[port];
 		MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET);
 		field = dev->caps.gid_table_len[port];
 		MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index c34e726d66e..d0913d4d262 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -66,11 +66,13 @@ struct mlx4_dev_cap {
 	int local_ca_ack_delay;
 	int num_ports;
 	u32 max_msg_sz;
-	int max_mtu[MLX4_MAX_PORTS + 1];
+	int ib_mtu[MLX4_MAX_PORTS + 1];
 	int max_port_width[MLX4_MAX_PORTS + 1];
 	int max_vl[MLX4_MAX_PORTS + 1];
 	int max_gids[MLX4_MAX_PORTS + 1];
 	int max_pkeys[MLX4_MAX_PORTS + 1];
+	u64 def_mac[MLX4_MAX_PORTS + 1];
+	u16 eth_mtu[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
 	u32 flags;
 	int reserved_uars;
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 560e1962212..28f36b88de3 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -133,10 +133,12 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.num_ports	     = dev_cap->num_ports;
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
 		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
-		dev->caps.mtu_cap[i]	    = dev_cap->max_mtu[i];
+		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
 		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
 		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
 		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
+		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
+		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
 	}
 
 	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 693f93cd29e..f9e73cfc540 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -166,7 +166,9 @@ struct mlx4_caps {
 	u64			fw_ver;
 	int			num_ports;
 	int			vl_cap[MLX4_MAX_PORTS + 1];
-	int			mtu_cap[MLX4_MAX_PORTS + 1];
+	int			ib_mtu_cap[MLX4_MAX_PORTS + 1];
+	u64			def_mac[MLX4_MAX_PORTS + 1];
+	int			eth_mtu_cap[MLX4_MAX_PORTS + 1];
 	int			gid_table_len[MLX4_MAX_PORTS + 1];
 	int			pkey_table_len[MLX4_MAX_PORTS + 1];
 	int			local_ca_ack_delay;
-- 
cgit v1.2.3


From 2a2336f8228292b8197f4187e54b0748903e6645 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 11:44:46 -0700
Subject: mlx4_core: Ethernet MAC/VLAN management

Add support for managing MAC and VLAN filters for each port.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Oren Duer <oren@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/Makefile   |   2 +-
 drivers/net/mlx4/main.c     |  14 +++
 drivers/net/mlx4/mlx4.h     |  33 ++++++
 drivers/net/mlx4/port.c     | 259 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mlx4/cmd.h    |   9 ++
 include/linux/mlx4/device.h |  12 ++
 6 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/mlx4/port.c

diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 0952a6528f5..9f493666e27 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
 
 mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
-		mr.o pd.o profile.o qp.o reset.o srq.o
+		mr.o pd.o port.o profile.o qp.o reset.o srq.o
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 28f36b88de3..0a5c8bfb3f1 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -780,11 +780,22 @@ no_msi:
 		priv->eq_table.eq[i].irq = dev->pdev->irq;
 }
 
+static void mlx4_init_port_info(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+
+	info->dev = dev;
+	info->port = port;
+	mlx4_init_mac_table(dev, &info->mac_table);
+	mlx4_init_vlan_table(dev, &info->vlan_table);
+}
+
 static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct mlx4_priv *priv;
 	struct mlx4_dev *dev;
 	int err;
+	int port;
 
 	printk(KERN_INFO PFX "Initializing %s\n",
 	       pci_name(pdev));
@@ -894,6 +905,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_close;
 
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		mlx4_init_port_info(dev, port);
+
 	err = mlx4_register_device(dev);
 	if (err)
 		goto err_cleanup;
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 9e2f44c3181..23309f381ee 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -252,6 +252,35 @@ struct mlx4_catas_err {
 	struct list_head	list;
 };
 
+#define MLX4_MAX_MAC_NUM	128
+#define MLX4_MAC_TABLE_SIZE	(MLX4_MAX_MAC_NUM << 3)
+
+struct mlx4_mac_table {
+	__be64			entries[MLX4_MAX_MAC_NUM];
+	int			refs[MLX4_MAX_MAC_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define MLX4_MAX_VLAN_NUM	128
+#define MLX4_VLAN_TABLE_SIZE	(MLX4_MAX_VLAN_NUM << 2)
+
+struct mlx4_vlan_table {
+	__be32			entries[MLX4_MAX_VLAN_NUM];
+	int			refs[MLX4_MAX_VLAN_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+struct mlx4_port_info {
+	struct mlx4_dev	       *dev;
+	int			port;
+	struct mlx4_mac_table	mac_table;
+	struct mlx4_vlan_table	vlan_table;
+};
+
 struct mlx4_priv {
 	struct mlx4_dev		dev;
 
@@ -280,6 +309,7 @@ struct mlx4_priv {
 
 	struct mlx4_uar		driver_uar;
 	void __iomem	       *kar;
+	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
 };
 
 static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
@@ -350,4 +380,7 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
 
 void mlx4_handle_catas_err(struct mlx4_dev *dev);
 
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+
 #endif /* MLX4_H */
diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c
new file mode 100644
index 00000000000..8644f3d978e
--- /dev/null
+++ b/drivers/net/mlx4/port.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+#define MLX4_MAC_MASK		0xffffffffffffULL
+
+#define MLX4_VLAN_VALID		(1u << 31)
+#define MLX4_VLAN_MASK		0xfff
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_macs;
+	table->total = 0;
+}
+
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_vlans;
+	table->total = 0;
+}
+
+static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
+				   __be64 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
+
+	in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index)
+{
+	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+	int i, err = 0;
+	int free = -1;
+
+	mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac);
+	mutex_lock(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) {
+		if (free < 0 && !table->refs[i]) {
+			free = i;
+			continue;
+		}
+
+		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			/* MAC already registered, increase refernce count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+	mlx4_dbg(dev, "Free MAC index is %d\n", free);
+
+	if (table->total == table->max) {
+		/* No free mac entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new MAC */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index)
+{
+	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+
+	mutex_lock(&table->mutex);
+	if (!table->refs[index]) {
+		mlx4_warn(dev, "No MAC entry for index %d\n", index);
+		goto out;
+	}
+	if (--table->refs[index]) {
+		mlx4_warn(dev, "Have more references for index %d,"
+			  "no need to modify MAC table\n", index);
+		goto out;
+	}
+	table->entries[index] = 0;
+	mlx4_set_port_mac_table(dev, port, table->entries);
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
+
+static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
+				    __be32 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
+	in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i, err = 0;
+	int free = -1;
+
+	mutex_lock(&table->mutex);
+	for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (free < 0 && (table->refs[i] == 0)) {
+			free = i;
+			continue;
+		}
+
+		if (table->refs[i] &&
+		    (vlan == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* Vlan already registered, increase refernce count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+
+	if (table->total == table->max) {
+		/* No free vlan entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new MAC */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+	err = mlx4_set_port_vlan_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_warn(dev, "Failed adding vlan: %u\n", vlan);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_vlan);
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+
+	if (index < MLX4_VLAN_REGULAR) {
+		mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
+		return;
+	}
+
+	mutex_lock(&table->mutex);
+	if (!table->refs[index]) {
+		mlx4_warn(dev, "No vlan entry for index %d\n", index);
+		goto out;
+	}
+	if (--table->refs[index]) {
+		mlx4_dbg(dev, "Have more references for index %d,"
+			 "no need to modify vlan table\n", index);
+		goto out;
+	}
+	table->entries[index] = 0;
+	mlx4_set_port_vlan_table(dev, port, table->entries);
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 77323a72dd3..cf9c679ab38 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -132,6 +132,15 @@ enum {
 	MLX4_MAILBOX_SIZE	=  4096
 };
 
+enum {
+	/* set port opcode modifiers */
+	MLX4_SET_PORT_GENERAL   = 0x0,
+	MLX4_SET_PORT_RQP_CALC  = 0x1,
+	MLX4_SET_PORT_MAC_TABLE = 0x2,
+	MLX4_SET_PORT_VLAN_TABLE = 0x3,
+	MLX4_SET_PORT_PRIO_MAP  = 0x4,
+};
+
 struct mlx4_dev;
 
 struct mlx4_cmd_mailbox {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index f9e73cfc540..1951fe70a25 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -153,6 +153,12 @@ enum mlx4_qp_region {
 	MLX4_NUM_QP_REGION
 };
 
+enum mlx4_special_vlan_idx {
+	MLX4_NO_VLAN_IDX        = 0,
+	MLX4_VLAN_MISS_IDX,
+	MLX4_VLAN_REGULAR
+};
+
 enum {
 	MLX4_NUM_FEXCH          = 64 * 1024,
 };
@@ -438,6 +444,12 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 			  int block_mcast_loopback);
 int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]);
 
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index);
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index);
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index);
+
 int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
 		      int npages, u64 iova, u32 *lkey, u32 *rkey);
 int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
-- 
cgit v1.2.3


From 7ff93f8b7ecbc36e7ffc5c11a61643821c1bfee5 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 15:38:42 -0700
Subject: mlx4_core: Multiple port type support

Multi-protocol adapters support different port types.  Each consumer
of mlx4_core queries for supported port types; in particular mlx4_ib
can no longer assume that all physical ports belong to it.  Port type
is configured through a sysfs interface.  When the type of a port is
changed, all mlx4 interfaces are unregistered, and then registered
again with the new port types.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/mad.c     |   6 +-
 drivers/infiniband/hw/mlx4/main.c    |  11 +-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   1 +
 drivers/net/mlx4/fw.c                |   4 +
 drivers/net/mlx4/fw.h                |   1 +
 drivers/net/mlx4/main.c              | 211 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4.h              |   6 +
 drivers/net/mlx4/port.c              |  23 ++++
 include/linux/mlx4/device.h          |  14 +++
 9 files changed, 266 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index cdca3a511e1..606f1e2ef28 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -298,7 +298,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
 	int p, q;
 	int ret;
 
-	for (p = 0; p < dev->dev->caps.num_ports; ++p)
+	for (p = 0; p < dev->num_ports; ++p)
 		for (q = 0; q <= 1; ++q) {
 			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
 						      q ? IB_QPT_GSI : IB_QPT_SMI,
@@ -314,7 +314,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
 	return 0;
 
 err:
-	for (p = 0; p < dev->dev->caps.num_ports; ++p)
+	for (p = 0; p < dev->num_ports; ++p)
 		for (q = 0; q <= 1; ++q)
 			if (dev->send_agent[p][q])
 				ib_unregister_mad_agent(dev->send_agent[p][q]);
@@ -327,7 +327,7 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
 	struct ib_mad_agent *agent;
 	int p, q;
 
-	for (p = 0; p < dev->dev->caps.num_ports; ++p) {
+	for (p = 0; p < dev->num_ports; ++p) {
 		for (q = 0; q <= 1; ++q) {
 			agent = dev->send_agent[p][q];
 			dev->send_agent[p][q] = NULL;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index a3c2851c054..2e80f8f47b0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -574,7 +574,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.owner		= THIS_MODULE;
 	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
-	ibdev->ib_dev.phys_port_cnt	= dev->caps.num_ports;
+	ibdev->num_ports = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		ibdev->num_ports++;
+	ibdev->ib_dev.phys_port_cnt     = ibdev->num_ports;
 	ibdev->ib_dev.num_comp_vectors	= 1;
 	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
 
@@ -691,7 +694,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 	struct mlx4_ib_dev *ibdev = ibdev_ptr;
 	int p;
 
-	for (p = 1; p <= dev->caps.num_ports; ++p)
+	for (p = 1; p <= ibdev->num_ports; ++p)
 		mlx4_CLOSE_PORT(dev, p);
 
 	mlx4_ib_mad_cleanup(ibdev);
@@ -706,6 +709,10 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
 			  enum mlx4_dev_event event, int port)
 {
 	struct ib_event ibev;
+	struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+
+	if (port > ibdev->num_ports)
+		return;
 
 	switch (event) {
 	case MLX4_DEV_EVENT_PORT_UP:
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 6e2b0dc21b6..9974e886b8d 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -162,6 +162,7 @@ struct mlx4_ib_ah {
 struct mlx4_ib_dev {
 	struct ib_device	ib_dev;
 	struct mlx4_dev	       *dev;
+	int			num_ports;
 	void __iomem	       *uar_map;
 
 	struct mlx4_uar		priv_uar;
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 8d402db9a03..be09fdb79cb 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -88,6 +88,7 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u32 flags)
 		[ 8] = "P_Key violation counter",
 		[ 9] = "Q_Key violation counter",
 		[10] = "VMM",
+		[12] = "DPDP",
 		[16] = "MW support",
 		[17] = "APM support",
 		[18] = "Atomic ops support",
@@ -354,6 +355,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
 		}
 	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
 #define QUERY_PORT_MTU_OFFSET			0x01
 #define QUERY_PORT_ETH_MTU_OFFSET		0x02
 #define QUERY_PORT_WIDTH_OFFSET			0x06
@@ -368,6 +370,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 			if (err)
 				goto out;
 
+			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+			dev_cap->supported_port_types[i] = field & 3;
 			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
 			dev_cap->ib_mtu[i]	   = field & 0xf;
 			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index d0913d4d262..526d7f30c04 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -104,6 +104,7 @@ struct mlx4_dev_cap {
 	u32 reserved_lkey;
 	u64 max_icm_sz;
 	int max_gso_sz;
+	u8  supported_port_types[MLX4_MAX_PORTS + 1];
 	u8  log_max_macs[MLX4_MAX_PORTS + 1];
 	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
 };
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 0a5c8bfb3f1..c1d447873bf 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -98,6 +98,44 @@ module_param_named(use_prio, use_prio, bool, 0444);
 MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
 		  "(0/1, default 0)");
 
+static int mlx4_check_port_params(struct mlx4_dev *dev,
+				  enum mlx4_port_type *port_type)
+{
+	int i;
+
+	for (i = 0; i < dev->caps.num_ports - 1; i++) {
+		if (port_type[i] != port_type[i+1] &&
+		    !(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+			mlx4_err(dev, "Only same port types supported "
+				 "on this HCA, aborting.\n");
+			return -EINVAL;
+		}
+	}
+	if ((port_type[0] == MLX4_PORT_TYPE_ETH) &&
+	    (port_type[1] == MLX4_PORT_TYPE_IB)) {
+		mlx4_err(dev, "eth-ib configuration is not supported.\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (!(port_type[i] & dev->caps.supported_type[i+1])) {
+			mlx4_err(dev, "Requested port type for port %d is not "
+				      "supported on this HCA\n", i + 1);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static void mlx4_set_port_mask(struct mlx4_dev *dev)
+{
+	int i;
+
+	dev->caps.port_mask = 0;
+	for (i = 1; i <= dev->caps.num_ports; ++i)
+		if (dev->caps.port_type[i] == MLX4_PORT_TYPE_IB)
+			dev->caps.port_mask |= 1 << (i - 1);
+}
 static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
 	int err;
@@ -139,6 +177,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
 		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
 		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
+		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
 	}
 
 	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
@@ -182,6 +221,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.log_num_prios = use_prio ? 3 : 0;
 
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH)
+			dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
+		else
+			dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+
 		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
 			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
 			mlx4_warn(dev, "Requested number of MACs is too much "
@@ -196,6 +240,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		}
 	}
 
+	mlx4_set_port_mask(dev);
+
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
@@ -213,6 +259,95 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	return 0;
 }
 
+/*
+ * Change the port configuration of the device.
+ * Every user of this function must hold the port mutex.
+ */
+static int mlx4_change_port_types(struct mlx4_dev *dev,
+				  enum mlx4_port_type *port_types)
+{
+	int err = 0;
+	int change = 0;
+	int port;
+
+	for (port = 0; port <  dev->caps.num_ports; port++) {
+		if (port_types[port] != dev->caps.port_type[port + 1]) {
+			change = 1;
+			dev->caps.port_type[port + 1] = port_types[port];
+		}
+	}
+	if (change) {
+		mlx4_unregister_device(dev);
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			mlx4_CLOSE_PORT(dev, port);
+			err = mlx4_SET_PORT(dev, port);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d, "
+					      "aborting\n", port);
+				goto out;
+			}
+		}
+		mlx4_set_port_mask(dev);
+		err = mlx4_register_device(dev);
+	}
+
+out:
+	return err;
+}
+
+static ssize_t show_port_type(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	return sprintf(buf, "%s\n",
+		       mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB ?
+		       "ib" : "eth");
+}
+
+static ssize_t set_port_type(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	enum mlx4_port_type types[MLX4_MAX_PORTS];
+	int i;
+	int err = 0;
+
+	if (!strcmp(buf, "ib\n"))
+		info->tmp_type = MLX4_PORT_TYPE_IB;
+	else if (!strcmp(buf, "eth\n"))
+		info->tmp_type = MLX4_PORT_TYPE_ETH;
+	else {
+		mlx4_err(mdev, "%s is not supported port type\n", buf);
+		return -EINVAL;
+	}
+
+	mutex_lock(&priv->port_mutex);
+	for (i = 0; i < mdev->caps.num_ports; i++)
+		types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
+					mdev->caps.port_type[i+1];
+
+	err = mlx4_check_port_params(mdev, types);
+	if (err)
+		goto out;
+
+	for (i = 1; i <= mdev->caps.num_ports; i++)
+		priv->port[i].tmp_type = 0;
+
+	err = mlx4_change_port_types(mdev, types);
+
+out:
+	mutex_unlock(&priv->port_mutex);
+	return err ? err : count;
+}
+
 static int mlx4_load_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -617,6 +752,7 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
+	int port;
 
 	err = mlx4_init_uar_table(dev);
 	if (err) {
@@ -715,8 +851,20 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 		goto err_qp_table_free;
 	}
 
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_SET_PORT(dev, port);
+		if (err) {
+			mlx4_err(dev, "Failed to set port %d, aborting\n",
+				port);
+			goto err_mcg_table_free;
+		}
+	}
+
 	return 0;
 
+err_mcg_table_free:
+	mlx4_cleanup_mcg_table(dev);
+
 err_qp_table_free:
 	mlx4_cleanup_qp_table(dev);
 
@@ -780,14 +928,37 @@ no_msi:
 		priv->eq_table.eq[i].irq = dev->pdev->irq;
 }
 
-static void mlx4_init_port_info(struct mlx4_dev *dev, int port)
+static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 {
 	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	int err = 0;
 
 	info->dev = dev;
 	info->port = port;
 	mlx4_init_mac_table(dev, &info->mac_table);
 	mlx4_init_vlan_table(dev, &info->vlan_table);
+
+	sprintf(info->dev_name, "mlx4_port%d", port);
+	info->port_attr.attr.name = info->dev_name;
+	info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+	info->port_attr.show      = show_port_type;
+	info->port_attr.store     = set_port_type;
+
+	err = device_create_file(&dev->pdev->dev, &info->port_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create file for port %d\n", port);
+		info->port = -1;
+	}
+
+	return err;
+}
+
+static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
+{
+	if (info->port < 0)
+		return;
+
+	device_remove_file(&info->dev->pdev->dev, &info->port_attr);
 }
 
 static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
@@ -870,6 +1041,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
 
+	mutex_init(&priv->port_mutex);
+
 	INIT_LIST_HEAD(&priv->pgdir_list);
 	mutex_init(&priv->pgdir_mutex);
 
@@ -905,18 +1078,24 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_close;
 
-	for (port = 1; port <= dev->caps.num_ports; port++)
-		mlx4_init_port_info(dev, port);
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_init_port_info(dev, port);
+		if (err)
+			goto err_port;
+	}
 
 	err = mlx4_register_device(dev);
 	if (err)
-		goto err_cleanup;
+		goto err_port;
 
 	pci_set_drvdata(pdev, dev);
 
 	return 0;
 
-err_cleanup:
+err_port:
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		mlx4_cleanup_port_info(&priv->port[port]);
+
 	mlx4_cleanup_mcg_table(dev);
 	mlx4_cleanup_qp_table(dev);
 	mlx4_cleanup_srq_table(dev);
@@ -973,8 +1152,10 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 	if (dev) {
 		mlx4_unregister_device(dev);
 
-		for (p = 1; p <= dev->caps.num_ports; ++p)
+		for (p = 1; p <= dev->caps.num_ports; p++) {
+			mlx4_cleanup_port_info(&priv->port[p]);
 			mlx4_CLOSE_PORT(dev, p);
+		}
 
 		mlx4_cleanup_mcg_table(dev);
 		mlx4_cleanup_qp_table(dev);
@@ -1026,10 +1207,28 @@ static struct pci_driver mlx4_driver = {
 	.remove		= __devexit_p(mlx4_remove_one)
 };
 
+static int __init mlx4_verify_params(void)
+{
+	if ((log_num_mac < 0) || (log_num_mac > 7)) {
+		printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac);
+		return -1;
+	}
+
+	if ((log_num_vlan < 0) || (log_num_vlan > 7)) {
+		printk(KERN_WARNING "mlx4_core: bad num_vlan: %d\n", log_num_vlan);
+		return -1;
+	}
+
+	return 0;
+}
+
 static int __init mlx4_init(void)
 {
 	int ret;
 
+	if (mlx4_verify_params())
+		return -EINVAL;
+
 	ret = mlx4_catas_init();
 	if (ret)
 		return ret;
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 23309f381ee..fa431fad0ee 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -277,6 +277,9 @@ struct mlx4_vlan_table {
 struct mlx4_port_info {
 	struct mlx4_dev	       *dev;
 	int			port;
+	char			dev_name[16];
+	struct device_attribute port_attr;
+	enum mlx4_port_type	tmp_type;
 	struct mlx4_mac_table	mac_table;
 	struct mlx4_vlan_table	vlan_table;
 };
@@ -310,6 +313,7 @@ struct mlx4_priv {
 	struct mlx4_uar		driver_uar;
 	void __iomem	       *kar;
 	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
+	struct mutex		port_mutex;
 };
 
 static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
@@ -383,4 +387,6 @@ void mlx4_handle_catas_err(struct mlx4_dev *dev);
 void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
 void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
 
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
+
 #endif /* MLX4_H */
diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c
index 8644f3d978e..e2fdab42c4c 100644
--- a/drivers/net/mlx4/port.c
+++ b/drivers/net/mlx4/port.c
@@ -257,3 +257,26 @@ out:
 	mutex_unlock(&table->mutex);
 }
 EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	u8 is_eth = dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, 256);
+	if (is_eth) {
+		((u8 *) mailbox->buf)[3] = 6;
+		((__be16 *) mailbox->buf)[4] = cpu_to_be16(1 << 15);
+		((__be16 *) mailbox->buf)[6] = cpu_to_be16(1 << 15);
+	}
+	err = mlx4_cmd(dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 1951fe70a25..bd9977b8949 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -60,6 +60,7 @@ enum {
 	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1 <<  7,
 	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1 <<  8,
 	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1 <<  9,
+	MLX4_DEV_CAP_FLAG_DPDP		= 1 << 12,
 	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1 << 16,
 	MLX4_DEV_CAP_FLAG_APM		= 1 << 17,
 	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
@@ -153,6 +154,11 @@ enum mlx4_qp_region {
 	MLX4_NUM_QP_REGION
 };
 
+enum mlx4_port_type {
+	MLX4_PORT_TYPE_IB	= 1 << 0,
+	MLX4_PORT_TYPE_ETH	= 1 << 1,
+};
+
 enum mlx4_special_vlan_idx {
 	MLX4_NO_VLAN_IDX        = 0,
 	MLX4_VLAN_MISS_IDX,
@@ -226,6 +232,9 @@ struct mlx4_caps {
 	int                     log_num_macs;
 	int                     log_num_vlans;
 	int                     log_num_prios;
+	enum mlx4_port_type	port_type[MLX4_MAX_PORTS + 1];
+	u8			supported_type[MLX4_MAX_PORTS + 1];
+	u32			port_mask;
 };
 
 struct mlx4_buf_list {
@@ -380,6 +389,11 @@ struct mlx4_init_port_param {
 	u64			si_guid;
 };
 
+#define mlx4_foreach_port(port, dev, type)				\
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
+		if (((type) == MLX4_PORT_TYPE_IB ? (dev)->caps.port_mask : \
+		     ~(dev)->caps.port_mask) & 1 << ((port) - 1))
+
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 		   struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
-- 
cgit v1.2.3


From c27a02cd94d6695fbe5858ca364b1e415af02212 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 15:47:49 -0700
Subject: mlx4_en: Add driver for Mellanox ConnectX 10GbE NIC

The Mellanox ConnectX can operate as an InfiniBand adapter, as an
Ethernet NIC, or as a Fibre Channel (FC) HBA.  The kernel has a
low-level driver, mlx4_core, which handles multiplexing access to the
device, and there is also already an InfiniBad driver, mlx4_ib.

This patch adds a new driver, mlx4_en, which implements a standard
Ethernet NIC driver.

Signed-off-by: Liran Liss <liranl@mellanox.co.il>
Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/Kconfig             |    9 +
 drivers/net/mlx4/Makefile       |    5 +
 drivers/net/mlx4/en_cq.c        |  146 ++++++
 drivers/net/mlx4/en_main.c      |  254 +++++++++
 drivers/net/mlx4/en_netdev.c    | 1088 +++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/en_params.c    |  480 +++++++++++++++++
 drivers/net/mlx4/en_port.c      |  261 ++++++++++
 drivers/net/mlx4/en_port.h      |  570 ++++++++++++++++++++
 drivers/net/mlx4/en_resources.c |   96 ++++
 drivers/net/mlx4/en_rx.c        | 1080 ++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/en_tx.c        |  820 +++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_en.h      |  561 ++++++++++++++++++++
 12 files changed, 5370 insertions(+)
 create mode 100644 drivers/net/mlx4/en_cq.c
 create mode 100644 drivers/net/mlx4/en_main.c
 create mode 100644 drivers/net/mlx4/en_netdev.c
 create mode 100644 drivers/net/mlx4/en_params.c
 create mode 100644 drivers/net/mlx4/en_port.c
 create mode 100644 drivers/net/mlx4/en_port.h
 create mode 100644 drivers/net/mlx4/en_resources.c
 create mode 100644 drivers/net/mlx4/en_rx.c
 create mode 100644 drivers/net/mlx4/en_tx.c
 create mode 100644 drivers/net/mlx4/mlx4_en.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 4a11296a951..69e2c10cb9f 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2465,6 +2465,15 @@ config PASEMI_MAC
 	  This driver supports the on-chip 1/10Gbit Ethernet controller on
 	  PA Semi's PWRficient line of chips.
 
+config MLX4_EN
+	tristate "Mellanox Technologies 10Gbit Ethernet support"
+	depends on PCI && INET
+	select MLX4_CORE
+	select INET_LRO
+	help
+	  This driver supports Mellanox Technologies ConnectX Ethernet
+	  devices.
+
 config MLX4_CORE
 	tristate
 	depends on PCI
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 9f493666e27..a7a97bf998f 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -2,3 +2,8 @@ obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
 
 mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
 		mr.o pd.o port.o profile.o qp.o reset.o srq.o
+
+obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
+
+mlx4_en-y := 	en_main.o en_tx.o en_rx.o en_params.o en_port.o en_cq.o \
+		en_resources.o en_netdev.o
diff --git a/drivers/net/mlx4/en_cq.c b/drivers/net/mlx4/en_cq.c
new file mode 100644
index 00000000000..1368a8010af
--- /dev/null
+++ b/drivers/net/mlx4/en_cq.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event)
+{
+	return;
+}
+
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv,
+		      struct mlx4_en_cq *cq,
+		      int entries, int ring, enum cq_type mode)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	cq->size = entries;
+	if (mode == RX)
+		cq->buf_size = cq->size * sizeof(struct mlx4_cqe);
+	else
+		cq->buf_size = sizeof(struct mlx4_cqe);
+
+	cq->ring = ring;
+	cq->is_tx = mode;
+	spin_lock_init(&cq->lock);
+
+	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
+				cq->buf_size, 2 * PAGE_SIZE);
+	if (err)
+		return err;
+
+	err = mlx4_en_map_buffer(&cq->wqres.buf);
+	if (err)
+		mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+
+	return err;
+}
+
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	cq->dev = mdev->pndev[priv->port];
+	cq->mcq.set_ci_db  = cq->wqres.db.db;
+	cq->mcq.arm_db     = cq->wqres.db.db + 1;
+	*cq->mcq.set_ci_db = 0;
+	*cq->mcq.arm_db    = 0;
+	cq->buf = (struct mlx4_cqe *) cq->wqres.buf.direct.buf;
+	memset(cq->buf, 0, cq->buf_size);
+
+	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar,
+			    cq->wqres.db.dma, &cq->mcq, cq->is_tx);
+	if (err)
+		return err;
+
+	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
+	cq->mcq.event = mlx4_en_cq_event;
+
+	if (cq->is_tx) {
+		init_timer(&cq->timer);
+		cq->timer.function = mlx4_en_poll_tx_cq;
+		cq->timer.data = (unsigned long) cq;
+	} else {
+		netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64);
+		napi_enable(&cq->napi);
+	}
+
+	return 0;
+}
+
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mlx4_en_unmap_buffer(&cq->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+	cq->buf_size = 0;
+	cq->buf = NULL;
+}
+
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (cq->is_tx)
+		del_timer(&cq->timer);
+	else
+		napi_disable(&cq->napi);
+
+	mlx4_cq_free(mdev->dev, &cq->mcq);
+}
+
+/* Set rx cq moderation parameters */
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	return mlx4_cq_modify(priv->mdev->dev, &cq->mcq,
+			      cq->moder_cnt, cq->moder_time);
+}
+
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	cq->armed = 1;
+	mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map,
+		    &priv->mdev->uar_lock);
+
+	return 0;
+}
+
+
diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c
new file mode 100644
index 00000000000..1b0eebf84f7
--- /dev/null
+++ b/drivers/net/mlx4/en_main.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/cpumask.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")");
+
+static const char mlx4_en_version[] =
+	DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr,
+			  enum mlx4_dev_event event, int port)
+{
+	struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr;
+	struct mlx4_en_priv *priv;
+
+	if (!mdev->pndev[port])
+		return;
+
+	priv = netdev_priv(mdev->pndev[port]);
+	switch (event) {
+	case MLX4_DEV_EVENT_PORT_UP:
+	case MLX4_DEV_EVENT_PORT_DOWN:
+		/* To prevent races, we poll the link state in a separate
+		  task rather than changing it here */
+		priv->link_state = event;
+		queue_work(mdev->workqueue, &priv->linkstate_task);
+		break;
+
+	case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+		mlx4_err(mdev, "Internal error detected, restarting device\n");
+		break;
+
+	default:
+		mlx4_warn(mdev, "Unhandled event: %d\n", event);
+	}
+}
+
+static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
+{
+	struct mlx4_en_dev *mdev = endev_ptr;
+	int i;
+
+	mutex_lock(&mdev->state_lock);
+	mdev->device_up = false;
+	mutex_unlock(&mdev->state_lock);
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		if (mdev->pndev[i])
+			mlx4_en_destroy_netdev(mdev->pndev[i]);
+
+	flush_workqueue(mdev->workqueue);
+	destroy_workqueue(mdev->workqueue);
+	mlx4_mr_free(dev, &mdev->mr);
+	mlx4_uar_free(dev, &mdev->priv_uar);
+	mlx4_pd_free(dev, mdev->priv_pdn);
+	kfree(mdev);
+}
+
+static void *mlx4_en_add(struct mlx4_dev *dev)
+{
+	static int mlx4_en_version_printed;
+	struct mlx4_en_dev *mdev;
+	int i;
+	int err;
+
+	if (!mlx4_en_version_printed) {
+		printk(KERN_INFO "%s", mlx4_en_version);
+		mlx4_en_version_printed++;
+	}
+
+	mdev = kzalloc(sizeof *mdev, GFP_KERNEL);
+	if (!mdev) {
+		dev_err(&dev->pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	if (mlx4_pd_alloc(dev, &mdev->priv_pdn))
+		goto err_free_dev;
+
+	if (mlx4_uar_alloc(dev, &mdev->priv_uar))
+		goto err_pd;
+
+	mdev->uar_map = ioremap(mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!mdev->uar_map)
+		goto err_uar;
+	spin_lock_init(&mdev->uar_lock);
+
+	mdev->dev = dev;
+	mdev->dma_device = &(dev->pdev->dev);
+	mdev->pdev = dev->pdev;
+	mdev->device_up = false;
+
+	mdev->LSO_support = !!(dev->caps.flags & (1 << 15));
+	if (!mdev->LSO_support)
+		mlx4_warn(mdev, "LSO not supported, please upgrade to later "
+				"FW version to enable LSO\n");
+
+	if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull,
+			 MLX4_PERM_LOCAL_WRITE |  MLX4_PERM_LOCAL_READ,
+			 0, 0, &mdev->mr)) {
+		mlx4_err(mdev, "Failed allocating memory region\n");
+		goto err_uar;
+	}
+	if (mlx4_mr_enable(mdev->dev, &mdev->mr)) {
+		mlx4_err(mdev, "Failed enabling memory region\n");
+		goto err_mr;
+	}
+
+	/* Build device profile according to supplied module parameters */
+	err = mlx4_en_get_profile(mdev);
+	if (err) {
+		mlx4_err(mdev, "Bad module parameters, aborting.\n");
+		goto err_mr;
+	}
+
+	/* Configure wich ports to start according to module parameters */
+	mdev->port_cnt = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		mdev->port_cnt++;
+
+	/* If we did not receive an explicit number of Rx rings, default to
+	 * the number of completion vectors populated by the mlx4_core */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Using %d tx rings for port:%d\n",
+			  mdev->profile.prof[i].tx_ring_num, i);
+		if (!mdev->profile.prof[i].rx_ring_num) {
+			mdev->profile.prof[i].rx_ring_num = 1;
+			mlx4_info(mdev, "Defaulting to %d rx rings for port:%d\n",
+				  1, i);
+		} else
+			mlx4_info(mdev, "Using %d rx rings for port:%d\n",
+				  mdev->profile.prof[i].rx_ring_num, i);
+	}
+
+	/* Create our own workqueue for reset/multicast tasks
+	 * Note: we cannot use the shared workqueue because of deadlocks caused
+	 *       by the rtnl lock */
+	mdev->workqueue = create_singlethread_workqueue("mlx4_en");
+	if (!mdev->workqueue) {
+		err = -ENOMEM;
+		goto err_close_nic;
+	}
+
+	/* At this stage all non-port specific tasks are complete:
+	 * mark the card state as up */
+	mutex_init(&mdev->state_lock);
+	mdev->device_up = true;
+
+	/* Setup ports */
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) {
+			mdev->pndev[i] = NULL;
+			goto err_free_netdev;
+		}
+	}
+	return mdev;
+
+
+err_free_netdev:
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		if (mdev->pndev[i])
+			mlx4_en_destroy_netdev(mdev->pndev[i]);
+	}
+
+	mutex_lock(&mdev->state_lock);
+	mdev->device_up = false;
+	mutex_unlock(&mdev->state_lock);
+	flush_workqueue(mdev->workqueue);
+
+	/* Stop event queue before we drop down to release shared SW state */
+
+err_close_nic:
+	destroy_workqueue(mdev->workqueue);
+err_mr:
+	mlx4_mr_free(dev, &mdev->mr);
+err_uar:
+	mlx4_uar_free(dev, &mdev->priv_uar);
+err_pd:
+	mlx4_pd_free(dev, mdev->priv_pdn);
+err_free_dev:
+	kfree(mdev);
+err_free_res:
+	return NULL;
+}
+
+static struct mlx4_interface mlx4_en_interface = {
+	.add	= mlx4_en_add,
+	.remove	= mlx4_en_remove,
+	.event	= mlx4_en_event,
+};
+
+static int __init mlx4_en_init(void)
+{
+	return mlx4_register_interface(&mlx4_en_interface);
+}
+
+static void __exit mlx4_en_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_en_interface);
+}
+
+module_init(mlx4_en_init);
+module_exit(mlx4_en_cleanup);
+
diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c
new file mode 100644
index 00000000000..a339afbeed3
--- /dev/null
+++ b/drivers/net/mlx4/en_netdev.c
@@ -0,0 +1,1088 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <linux/delay.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+
+static void mlx4_en_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	mlx4_dbg(HW, priv, "Registering VLAN group:%p\n", grp);
+	priv->vlgrp = grp;
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, grp);
+		if (err)
+			mlx4_err(mdev, "Failed configuring VLAN filter\n");
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	if (!priv->vlgrp)
+		return;
+
+	mlx4_dbg(HW, priv, "adding VLAN:%d (vlgrp entry:%p)\n",
+		 vid, vlan_group_get_device(priv->vlgrp, vid));
+
+	/* Add VID to port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp);
+		if (err)
+			mlx4_err(mdev, "Failed configuring VLAN filter\n");
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	if (!priv->vlgrp)
+		return;
+
+	mlx4_dbg(HW, priv, "Killing VID:%d (vlgrp:%p vlgrp "
+		 "entry:%p)\n", vid, priv->vlgrp,
+		 vlan_group_get_device(priv->vlgrp, vid));
+	vlan_group_set_device(priv->vlgrp, vid, NULL);
+
+	/* Remove VID from port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp);
+		if (err)
+			mlx4_err(mdev, "Failed configuring VLAN filter\n");
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static u64 mlx4_en_mac_to_u64(u8 *addr)
+{
+	u64 mac = 0;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac <<= 8;
+		mac |= addr[i];
+	}
+	return mac;
+}
+
+static int mlx4_en_set_mac(struct net_device *dev, void *addr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct sockaddr *saddr = addr;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN);
+	priv->mac = mlx4_en_mac_to_u64(dev->dev_addr);
+	queue_work(mdev->workqueue, &priv->mac_task);
+	return 0;
+}
+
+static void mlx4_en_do_set_mac(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 mac_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		/* Remove old MAC and insert the new one */
+		mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+		err = mlx4_register_mac(mdev->dev, priv->port,
+					priv->mac, &priv->mac_index);
+		if (err)
+			mlx4_err(mdev, "Failed changing HW MAC address\n");
+	} else
+		mlx4_dbg(HW, priv, "Port is down, exiting...\n");
+
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_clear_list(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct dev_mc_list *plist = priv->mc_list;
+	struct dev_mc_list *next;
+
+	while (plist) {
+		next = plist->next;
+		kfree(plist);
+		plist = next;
+	}
+	priv->mc_list = NULL;
+}
+
+static void mlx4_en_cache_mclist(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct dev_mc_list *mclist;
+	struct dev_mc_list *tmp;
+	struct dev_mc_list *plist = NULL;
+
+	for (mclist = dev->mc_list; mclist; mclist = mclist->next) {
+		tmp = kmalloc(sizeof(struct dev_mc_list), GFP_ATOMIC);
+		if (!tmp) {
+			mlx4_err(mdev, "failed to allocate multicast list\n");
+			mlx4_en_clear_list(dev);
+			return;
+		}
+		memcpy(tmp, mclist, sizeof(struct dev_mc_list));
+		tmp->next = NULL;
+		if (plist)
+			plist->next = tmp;
+		else
+			priv->mc_list = tmp;
+		plist = tmp;
+	}
+}
+
+
+static void mlx4_en_set_multicast(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (!priv->port_up)
+		return;
+
+	queue_work(priv->mdev->workqueue, &priv->mcast_task);
+}
+
+static void mlx4_en_do_set_multicast(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 mcast_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+	struct dev_mc_list *mclist;
+	u64 mcast_addr = 0;
+	int err;
+
+	mutex_lock(&mdev->state_lock);
+	if (!mdev->device_up) {
+		mlx4_dbg(HW, priv, "Card is not up, ignoring "
+				   "multicast change.\n");
+		goto out;
+	}
+	if (!priv->port_up) {
+		mlx4_dbg(HW, priv, "Port is down, ignoring "
+				   "multicast change.\n");
+		goto out;
+	}
+
+	/*
+	 * Promsicuous mode: disable all filters
+	 */
+
+	if (dev->flags & IFF_PROMISC) {
+		if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
+			if (netif_msg_rx_status(priv))
+				mlx4_warn(mdev, "Port:%d entering promiscuous mode\n",
+					  priv->port);
+			priv->flags |= MLX4_EN_FLAG_PROMISC;
+
+			/* Enable promiscouos mode */
+			err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
+						     priv->base_qpn, 1);
+			if (err)
+				mlx4_err(mdev, "Failed enabling "
+					 "promiscous mode\n");
+
+			/* Disable port multicast filter (unconditionally) */
+			err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+						  0, MLX4_MCAST_DISABLE);
+			if (err)
+				mlx4_err(mdev, "Failed disabling "
+					 "multicast filter\n");
+
+			/* Disable port VLAN filter */
+			err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL);
+			if (err)
+				mlx4_err(mdev, "Failed disabling "
+					 "VLAN filter\n");
+		}
+		goto out;
+	}
+
+	/*
+	 * Not in promiscous mode
+	 */
+
+	if (priv->flags & MLX4_EN_FLAG_PROMISC) {
+		if (netif_msg_rx_status(priv))
+			mlx4_warn(mdev, "Port:%d leaving promiscuous mode\n",
+				  priv->port);
+		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+		/* Disable promiscouos mode */
+		err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
+					     priv->base_qpn, 0);
+		if (err)
+			mlx4_err(mdev, "Failed disabling promiscous mode\n");
+
+		/* Enable port VLAN filter */
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp);
+		if (err)
+			mlx4_err(mdev, "Failed enabling VLAN filter\n");
+	}
+
+	/* Enable/disable the multicast filter according to IFF_ALLMULTI */
+	if (dev->flags & IFF_ALLMULTI) {
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			mlx4_err(mdev, "Failed disabling multicast filter\n");
+	} else {
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			mlx4_err(mdev, "Failed disabling multicast filter\n");
+
+		/* Flush mcast filter and init it with broadcast address */
+		mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST,
+				    1, MLX4_MCAST_CONFIG);
+
+		/* Update multicast list - we cache all addresses so they won't
+		 * change while HW is updated holding the command semaphor */
+		netif_tx_lock_bh(dev);
+		mlx4_en_cache_mclist(dev);
+		netif_tx_unlock_bh(dev);
+		for (mclist = priv->mc_list; mclist; mclist = mclist->next) {
+			mcast_addr = mlx4_en_mac_to_u64(mclist->dmi_addr);
+			mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
+					    mcast_addr, 0, MLX4_MCAST_CONFIG);
+		}
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_ENABLE);
+		if (err)
+			mlx4_err(mdev, "Failed enabling multicast filter\n");
+
+		mlx4_en_clear_list(dev);
+	}
+out:
+	mutex_unlock(&mdev->state_lock);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void mlx4_en_netpoll(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_cq *cq;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = &priv->rx_cq[i];
+		spin_lock_irqsave(&cq->lock, flags);
+		napi_synchronize(&cq->napi);
+		mlx4_en_process_rx_cq(dev, cq, 0);
+		spin_unlock_irqrestore(&cq->lock, flags);
+	}
+}
+#endif
+
+static void mlx4_en_tx_timeout(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (netif_msg_timer(priv))
+		mlx4_warn(mdev, "Tx timeout called on port:%d\n", priv->port);
+
+	if (netif_carrier_ok(dev)) {
+		priv->port_stats.tx_timeout++;
+		mlx4_dbg(DRV, priv, "Scheduling watchdog\n");
+		queue_work(mdev->workqueue, &priv->watchdog_task);
+	}
+}
+
+
+static struct net_device_stats *mlx4_en_get_stats(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	spin_lock_bh(&priv->stats_lock);
+	memcpy(&priv->ret_stats, &priv->stats, sizeof(priv->stats));
+	spin_unlock_bh(&priv->stats_lock);
+
+	return &priv->ret_stats;
+}
+
+static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	int i;
+
+	/* If we haven't received a specific coalescing setting
+	 * (module param), we set the moderation paramters as follows:
+	 * - moder_cnt is set to the number of mtu sized packets to
+	 *   satisfy our coelsing target.
+	 * - moder_time is set to a fixed value.
+	 */
+	priv->rx_frames = (mdev->profile.rx_moder_cnt ==
+			   MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TARGET /
+				priv->dev->mtu + 1 :
+				mdev->profile.rx_moder_cnt;
+	priv->rx_usecs = (mdev->profile.rx_moder_time ==
+			  MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TIME :
+				mdev->profile.rx_moder_time;
+	mlx4_dbg(INTR, priv, "Default coalesing params for mtu:%d - "
+			     "rx_frames:%d rx_usecs:%d\n",
+		 priv->dev->mtu, priv->rx_frames, priv->rx_usecs);
+
+	/* Setup cq moderation params */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = &priv->rx_cq[i];
+		cq->moder_cnt = priv->rx_frames;
+		cq->moder_time = priv->rx_usecs;
+	}
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		cq = &priv->tx_cq[i];
+		cq->moder_cnt = MLX4_EN_TX_COAL_PKTS;
+		cq->moder_time = MLX4_EN_TX_COAL_TIME;
+	}
+
+	/* Reset auto-moderation params */
+	priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW;
+	priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW;
+	priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH;
+	priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH;
+	priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL;
+	priv->adaptive_rx_coal = mdev->profile.auto_moder;
+	priv->last_moder_time = MLX4_EN_AUTO_CONF;
+	priv->last_moder_jiffies = 0;
+	priv->last_moder_packets = 0;
+	priv->last_moder_tx_packets = 0;
+	priv->last_moder_bytes = 0;
+}
+
+static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv)
+{
+	unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	unsigned long packets;
+	unsigned long rate;
+	unsigned long avg_pkt_size;
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long tx_packets;
+	unsigned long tx_pkt_diff;
+	unsigned long rx_pkt_diff;
+	int moder_time;
+	int i, err;
+
+	if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ)
+		return;
+
+	spin_lock_bh(&priv->stats_lock);
+	rx_packets = priv->stats.rx_packets;
+	rx_bytes = priv->stats.rx_bytes;
+	tx_packets = priv->stats.tx_packets;
+	spin_unlock_bh(&priv->stats_lock);
+
+	if (!priv->last_moder_jiffies || !period)
+		goto out;
+
+	tx_pkt_diff = ((unsigned long) (tx_packets -
+					priv->last_moder_tx_packets));
+	rx_pkt_diff = ((unsigned long) (rx_packets -
+					priv->last_moder_packets));
+	packets = max(tx_pkt_diff, rx_pkt_diff);
+	rate = packets * HZ / period;
+	avg_pkt_size = packets ? ((unsigned long) (rx_bytes -
+				 priv->last_moder_bytes)) / packets : 0;
+
+	/* Apply auto-moderation only when packet rate exceeds a rate that
+	 * it matters */
+	if (rate > MLX4_EN_RX_RATE_THRESH) {
+		/* If tx and rx packet rates are not balanced, assume that
+		 * traffic is mainly BW bound and apply maximum moderation.
+		 * Otherwise, moderate according to packet rate */
+		if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+		    2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+			moder_time = priv->rx_usecs_high;
+		} else {
+			if (rate < priv->pkt_rate_low)
+				moder_time = priv->rx_usecs_low;
+			else if (rate > priv->pkt_rate_high)
+				moder_time = priv->rx_usecs_high;
+			else
+				moder_time = (rate - priv->pkt_rate_low) *
+					(priv->rx_usecs_high - priv->rx_usecs_low) /
+					(priv->pkt_rate_high - priv->pkt_rate_low) +
+					priv->rx_usecs_low;
+		}
+	} else {
+		/* When packet rate is low, use default moderation rather than
+		 * 0 to prevent interrupt storms if traffic suddenly increases */
+		moder_time = priv->rx_usecs;
+	}
+
+	mlx4_dbg(INTR, priv, "tx rate:%lu rx_rate:%lu\n",
+		 tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
+
+	mlx4_dbg(INTR, priv, "Rx moder_time changed from:%d to %d period:%lu "
+		 "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
+		 priv->last_moder_time, moder_time, period, packets,
+		 avg_pkt_size, rate);
+
+	if (moder_time != priv->last_moder_time) {
+		priv->last_moder_time = moder_time;
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			cq = &priv->rx_cq[i];
+			cq->moder_time = moder_time;
+			err = mlx4_en_set_cq_moder(priv, cq);
+			if (err) {
+				mlx4_err(mdev, "Failed modifying moderation for cq:%d "
+					 "on port:%d\n", i, priv->port);
+				break;
+			}
+		}
+	}
+
+out:
+	priv->last_moder_packets = rx_packets;
+	priv->last_moder_tx_packets = tx_packets;
+	priv->last_moder_bytes = rx_bytes;
+	priv->last_moder_jiffies = jiffies;
+}
+
+static void mlx4_en_do_get_stats(struct work_struct *work)
+{
+	struct delayed_work *delay = container_of(work, struct delayed_work, work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 stats_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
+	if (err)
+		mlx4_dbg(HW, priv, "Could not update stats for "
+				   "port:%d\n", priv->port);
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		if (priv->port_up)
+			mlx4_en_auto_moderation(priv);
+
+		queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_linkstate(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 linkstate_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int linkstate = priv->link_state;
+
+	mutex_lock(&mdev->state_lock);
+	/* If observable port state changed set carrier state and
+	 * report to system log */
+	if (priv->last_link_state != linkstate) {
+		if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) {
+			if (netif_msg_link(priv))
+				mlx4_info(mdev, "Port %d - link down\n", priv->port);
+			netif_carrier_off(priv->dev);
+		} else {
+			if (netif_msg_link(priv))
+				mlx4_info(mdev, "Port %d - link up\n", priv->port);
+			netif_carrier_on(priv->dev);
+		}
+	}
+	priv->last_link_state = linkstate;
+	mutex_unlock(&mdev->state_lock);
+}
+
+
+static int mlx4_en_start_port(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	struct mlx4_en_tx_ring *tx_ring;
+	struct mlx4_en_rx_ring *rx_ring;
+	int rx_index = 0;
+	int tx_index = 0;
+	u16 stride;
+	int err = 0;
+	int i;
+	int j;
+
+	if (priv->port_up) {
+		mlx4_dbg(DRV, priv, "start port called while port already up\n");
+		return 0;
+	}
+
+	/* Calculate Rx buf size */
+	dev->mtu = min(dev->mtu, priv->max_mtu);
+	mlx4_en_calc_rx_buf(dev);
+	mlx4_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_skb_size);
+	stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+				    DS_SIZE * priv->num_frags);
+	/* Configure rx cq's and rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = &priv->rx_cq[i];
+		rx_ring = &priv->rx_ring[i];
+
+		err = mlx4_en_activate_cq(priv, cq);
+		if (err) {
+			mlx4_err(mdev, "Failed activating Rx CQ\n");
+			goto rx_err;
+		}
+		for (j = 0; j < cq->size; j++)
+			cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK;
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			mlx4_err(mdev, "Failed setting cq moderation parameters");
+			mlx4_en_deactivate_cq(priv, cq);
+			goto cq_err;
+		}
+		mlx4_en_arm_cq(priv, cq);
+
+		++rx_index;
+	}
+
+	err = mlx4_en_activate_rx_rings(priv);
+	if (err) {
+		mlx4_err(mdev, "Failed to activate RX rings\n");
+		goto cq_err;
+	}
+
+	err = mlx4_en_config_rss_steer(priv);
+	if (err) {
+		mlx4_err(mdev, "Failed configuring rss steering\n");
+		goto rx_err;
+	}
+
+	/* Configure tx cq's and rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		/* Configure cq */
+		cq = &priv->tx_cq[i];
+		err = mlx4_en_activate_cq(priv, cq);
+		if (err) {
+			mlx4_err(mdev, "Failed allocating Tx CQ\n");
+			goto tx_err;
+		}
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			mlx4_err(mdev, "Failed setting cq moderation parameters");
+			mlx4_en_deactivate_cq(priv, cq);
+			goto tx_err;
+		}
+		mlx4_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i);
+		cq->buf->wqe_index = cpu_to_be16(0xffff);
+
+		/* Configure ring */
+		tx_ring = &priv->tx_ring[i];
+		err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn,
+					       priv->rx_ring[0].srq.srqn);
+		if (err) {
+			mlx4_err(mdev, "Failed allocating Tx ring\n");
+			mlx4_en_deactivate_cq(priv, cq);
+			goto tx_err;
+		}
+		/* Set initial ownership of all Tx TXBBs to SW (1) */
+		for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE)
+			*((u32 *) (tx_ring->buf + j)) = 0xffffffff;
+		++tx_index;
+	}
+
+	/* Configure port */
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    mdev->profile.tx_pause,
+				    mdev->profile.tx_ppp,
+				    mdev->profile.rx_pause,
+				    mdev->profile.rx_ppp);
+	if (err) {
+		mlx4_err(mdev, "Failed setting port general configurations"
+			       " for port %d, with error %d\n", priv->port, err);
+		goto tx_err;
+	}
+	/* Set default qp number */
+	err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0);
+	if (err) {
+		mlx4_err(mdev, "Failed setting default qp numbers\n");
+		goto tx_err;
+	}
+	/* Set port mac number */
+	mlx4_dbg(DRV, priv, "Setting mac for port %d\n", priv->port);
+	err = mlx4_register_mac(mdev->dev, priv->port,
+				priv->mac, &priv->mac_index);
+	if (err) {
+		mlx4_err(mdev, "Failed setting port mac\n");
+		goto tx_err;
+	}
+
+	/* Init port */
+	mlx4_dbg(HW, priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		mlx4_err(mdev, "Failed Initializing port\n");
+		goto mac_err;
+	}
+
+	/* Schedule multicast task to populate multicast list */
+	queue_work(mdev->workqueue, &priv->mcast_task);
+
+	priv->port_up = true;
+	netif_start_queue(dev);
+	return 0;
+
+mac_err:
+	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+tx_err:
+	while (tx_index--) {
+		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]);
+		mlx4_en_deactivate_cq(priv, &priv->tx_cq[tx_index]);
+	}
+
+	mlx4_en_release_rss_steer(priv);
+rx_err:
+	for (i = 0; i < priv->rx_ring_num; i++)
+		mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[rx_index]);
+cq_err:
+	while (rx_index--)
+		mlx4_en_deactivate_cq(priv, &priv->rx_cq[rx_index]);
+
+	return err; /* need to close devices */
+}
+
+
+static void mlx4_en_stop_port(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i;
+
+	if (!priv->port_up) {
+		mlx4_dbg(DRV, priv, "stop port (%d) called while port already down\n",
+			 priv->port);
+		return;
+	}
+	netif_stop_queue(dev);
+
+	/* Synchronize with tx routine */
+	netif_tx_lock_bh(dev);
+	priv->port_up = false;
+	netif_tx_unlock_bh(dev);
+
+	/* close port*/
+	mlx4_CLOSE_PORT(mdev->dev, priv->port);
+
+	/* Unregister Mac address for the port */
+	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+
+	/* Free TX Rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[i]);
+		mlx4_en_deactivate_cq(priv, &priv->tx_cq[i]);
+	}
+	msleep(10);
+
+	for (i = 0; i < priv->tx_ring_num; i++)
+		mlx4_en_free_tx_buf(dev, &priv->tx_ring[i]);
+
+	/* Free RSS qps */
+	mlx4_en_release_rss_steer(priv);
+
+	/* Free RX Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
+		while (test_bit(NAPI_STATE_SCHED, &priv->rx_cq[i].napi.state))
+			msleep(1);
+		mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]);
+	}
+}
+
+static void mlx4_en_restart(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 watchdog_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	mlx4_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port);
+	mlx4_en_stop_port(dev);
+	if (mlx4_en_start_port(dev))
+	    mlx4_err(mdev, "Failed restarting port %d\n", priv->port);
+}
+
+
+static int mlx4_en_open(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i;
+	int err = 0;
+
+	mutex_lock(&mdev->state_lock);
+
+	if (!mdev->device_up) {
+		mlx4_err(mdev, "Cannot open - device down/disabled\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* Reset HW statistics and performance counters */
+	if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
+		mlx4_dbg(HW, priv, "Failed dumping statistics\n");
+
+	memset(&priv->stats, 0, sizeof(priv->stats));
+	memset(&priv->pstats, 0, sizeof(priv->pstats));
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		priv->tx_ring[i].bytes = 0;
+		priv->tx_ring[i].packets = 0;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_ring[i].bytes = 0;
+		priv->rx_ring[i].packets = 0;
+	}
+
+	mlx4_en_set_default_moderation(priv);
+	err = mlx4_en_start_port(dev);
+	if (err)
+		mlx4_err(mdev, "Failed starting port:%d\n", priv->port);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+
+static int mlx4_en_close(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (netif_msg_ifdown(priv))
+		mlx4_info(mdev, "Close called for port:%d\n", priv->port);
+
+	mutex_lock(&mdev->state_lock);
+
+	mlx4_en_stop_port(dev);
+	netif_carrier_off(dev);
+
+	mutex_unlock(&mdev->state_lock);
+	return 0;
+}
+
+static void mlx4_en_free_resources(struct mlx4_en_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (priv->tx_ring[i].tx_info)
+			mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
+		if (priv->tx_cq[i].buf)
+			mlx4_en_destroy_cq(priv, &priv->tx_cq[i]);
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (priv->rx_ring[i].rx_info)
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i]);
+		if (priv->rx_cq[i].buf)
+			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+	}
+}
+
+static int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_port_profile *prof = priv->prof;
+	int i;
+
+	/* Create tx Rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (mlx4_en_create_cq(priv, &priv->tx_cq[i],
+				      prof->tx_ring_size, i, TX))
+			goto err;
+
+		if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
+					   prof->tx_ring_size, TXBB_SIZE))
+			goto err;
+	}
+
+	/* Create rx Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (mlx4_en_create_cq(priv, &priv->rx_cq[i],
+				      prof->rx_ring_size, i, RX))
+			goto err;
+
+		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
+					   prof->rx_ring_size, priv->stride))
+			goto err;
+	}
+
+	return 0;
+
+err:
+	mlx4_err(mdev, "Failed to allocate NIC resources\n");
+	return -ENOMEM;
+}
+
+
+void mlx4_en_destroy_netdev(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mlx4_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
+
+	/* Unregister device - this will close the port if it was up */
+	if (priv->registered)
+		unregister_netdev(dev);
+
+	if (priv->allocated)
+		mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
+
+	cancel_delayed_work(&priv->stats_task);
+	cancel_delayed_work(&priv->refill_task);
+	/* flush any pending task for this netdev */
+	flush_workqueue(mdev->workqueue);
+
+	/* Detach the netdev so tasks would not attempt to access it */
+	mutex_lock(&mdev->state_lock);
+	mdev->pndev[priv->port] = NULL;
+	mutex_unlock(&mdev->state_lock);
+
+	mlx4_en_free_resources(priv);
+	free_netdev(dev);
+}
+
+static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	mlx4_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
+		 dev->mtu, new_mtu);
+
+	if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) {
+		mlx4_err(mdev, "Bad MTU size:%d.\n", new_mtu);
+		return -EPERM;
+	}
+	dev->mtu = new_mtu;
+
+	if (netif_running(dev)) {
+		mutex_lock(&mdev->state_lock);
+		if (!mdev->device_up) {
+			/* NIC is probably restarting - let watchdog task reset
+			 * the port */
+			mlx4_dbg(DRV, priv, "Change MTU called with card down!?\n");
+		} else {
+			mlx4_en_stop_port(dev);
+			mlx4_en_set_default_moderation(priv);
+			err = mlx4_en_start_port(dev);
+			if (err) {
+				mlx4_err(mdev, "Failed restarting port:%d\n",
+					 priv->port);
+				queue_work(mdev->workqueue, &priv->watchdog_task);
+			}
+		}
+		mutex_unlock(&mdev->state_lock);
+	}
+	return 0;
+}
+
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof)
+{
+	struct net_device *dev;
+	struct mlx4_en_priv *priv;
+	int i;
+	int err;
+
+	dev = alloc_etherdev(sizeof(struct mlx4_en_priv));
+	if (dev == NULL) {
+		mlx4_err(mdev, "Net device allocation failed\n");
+		return -ENOMEM;
+	}
+
+	SET_NETDEV_DEV(dev, &mdev->dev->pdev->dev);
+
+	/*
+	 * Initialize driver private data
+	 */
+
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(struct mlx4_en_priv));
+	priv->dev = dev;
+	priv->mdev = mdev;
+	priv->prof = prof;
+	priv->port = port;
+	priv->port_up = false;
+	priv->rx_csum = 1;
+	priv->flags = prof->flags;
+	priv->tx_ring_num = prof->tx_ring_num;
+	priv->rx_ring_num = prof->rx_ring_num;
+	priv->mc_list = NULL;
+	priv->mac_index = -1;
+	priv->msg_enable = MLX4_EN_MSG_LEVEL;
+	spin_lock_init(&priv->stats_lock);
+	INIT_WORK(&priv->mcast_task, mlx4_en_do_set_multicast);
+	INIT_WORK(&priv->mac_task, mlx4_en_do_set_mac);
+	INIT_DELAYED_WORK(&priv->refill_task, mlx4_en_rx_refill);
+	INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
+	INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
+	INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
+
+	/* Query for default mac and max mtu */
+	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
+	priv->mac = mdev->dev->caps.def_mac[priv->port];
+	if (ILLEGAL_MAC(priv->mac)) {
+		mlx4_err(mdev, "Port: %d, invalid mac burned: 0x%llx, quiting\n",
+			 priv->port, priv->mac);
+		err = -EINVAL;
+		goto out;
+	}
+
+	priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					  DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
+	err = mlx4_en_alloc_resources(priv);
+	if (err)
+		goto out;
+
+	/* Populate Rx default RSS mappings */
+	mlx4_en_set_default_rss_map(priv, &priv->rss_map, priv->rx_ring_num *
+						RSS_FACTOR, priv->rx_ring_num);
+	/* Allocate page for receive rings */
+	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
+				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
+	if (err) {
+		mlx4_err(mdev, "Failed to allocate page for rx qps\n");
+		goto out;
+	}
+	priv->allocated = 1;
+
+	/* Populate Tx priority mappings */
+	mlx4_en_set_prio_map(priv, priv->tx_prio_map, prof->tx_ring_num);
+
+	/*
+	 * Initialize netdev entry points
+	 */
+
+	dev->open = &mlx4_en_open;
+	dev->stop = &mlx4_en_close;
+	dev->hard_start_xmit = &mlx4_en_xmit;
+	dev->get_stats = &mlx4_en_get_stats;
+	dev->set_multicast_list = &mlx4_en_set_multicast;
+	dev->set_mac_address = &mlx4_en_set_mac;
+	dev->change_mtu = &mlx4_en_change_mtu;
+	dev->tx_timeout = &mlx4_en_tx_timeout;
+	dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT;
+	dev->vlan_rx_register = mlx4_en_vlan_rx_register;
+	dev->vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid;
+	dev->vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	dev->poll_controller = mlx4_en_netpoll;
+#endif
+	SET_ETHTOOL_OPS(dev, &mlx4_en_ethtool_ops);
+
+	/* Set defualt MAC */
+	dev->addr_len = ETH_ALEN;
+	for (i = 0; i < ETH_ALEN; i++)
+		dev->dev_addr[ETH_ALEN - 1 - i] =
+		(u8) (priv->mac >> (8 * i));
+
+	/*
+	 * Set driver features
+	 */
+	dev->features |= NETIF_F_SG;
+	dev->features |= NETIF_F_HW_CSUM;
+	dev->features |= NETIF_F_HIGHDMA;
+	dev->features |= NETIF_F_HW_VLAN_TX |
+			 NETIF_F_HW_VLAN_RX |
+			 NETIF_F_HW_VLAN_FILTER;
+	if (mdev->profile.num_lro)
+		dev->features |= NETIF_F_LRO;
+	if (mdev->LSO_support) {
+		dev->features |= NETIF_F_TSO;
+		dev->features |= NETIF_F_TSO6;
+	}
+
+	mdev->pndev[port] = dev;
+
+	netif_carrier_off(dev);
+	err = register_netdev(dev);
+	if (err) {
+		mlx4_err(mdev, "Netdev registration failed\n");
+		goto out;
+	}
+	priv->registered = 1;
+	queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+	return 0;
+
+out:
+	mlx4_en_destroy_netdev(dev);
+	return err;
+}
+
diff --git a/drivers/net/mlx4/en_params.c b/drivers/net/mlx4/en_params.c
new file mode 100644
index 00000000000..c2e69b1bcd0
--- /dev/null
+++ b/drivers/net/mlx4/en_params.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+#define MLX4_EN_PARM_INT(X, def_val, desc) \
+	static unsigned int X = def_val;\
+	module_param(X , uint, 0444); \
+	MODULE_PARM_DESC(X, desc);
+
+
+/*
+ * Device scope module parameters
+ */
+
+
+/* Use a XOR rathern than Toeplitz hash function for RSS */
+MLX4_EN_PARM_INT(rss_xor, 0, "Use XOR hash function for RSS");
+
+/* RSS hash type mask - default to <saddr, daddr, sport, dport> */
+MLX4_EN_PARM_INT(rss_mask, 0xf, "RSS hash type bitmask");
+
+/* Number of LRO sessions per Rx ring (rounded up to a power of two) */
+MLX4_EN_PARM_INT(num_lro, MLX4_EN_MAX_LRO_DESCRIPTORS,
+		 "Number of LRO sessions per ring or disabled (0)");
+
+/* Priority pausing */
+MLX4_EN_PARM_INT(pptx, MLX4_EN_DEF_TX_PAUSE,
+		 "Pause policy on TX: 0 never generate pause frames "
+		 "1 generate pause frames according to RX buffer threshold");
+MLX4_EN_PARM_INT(pprx, MLX4_EN_DEF_RX_PAUSE,
+		 "Pause policy on RX: 0 ignore received pause frames "
+		 "1 respect received pause frames");
+MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]."
+			   " Per priority bit mask");
+MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]."
+			   " Per priority bit mask");
+
+/* Interrupt moderation tunning */
+MLX4_EN_PARM_INT(rx_moder_cnt, MLX4_EN_AUTO_CONF,
+	       "Max coalesced descriptors for Rx interrupt moderation");
+MLX4_EN_PARM_INT(rx_moder_time, MLX4_EN_AUTO_CONF,
+	       "Timeout following last packet for Rx interrupt moderation");
+MLX4_EN_PARM_INT(auto_moder, 1, "Enable dynamic interrupt moderation");
+
+MLX4_EN_PARM_INT(rx_ring_num1, 0, "Number or Rx rings for port 1 (0 = #cores)");
+MLX4_EN_PARM_INT(rx_ring_num2, 0, "Number or Rx rings for port 2 (0 = #cores)");
+
+MLX4_EN_PARM_INT(tx_ring_size1, MLX4_EN_AUTO_CONF, "Tx ring size for port 1");
+MLX4_EN_PARM_INT(tx_ring_size2, MLX4_EN_AUTO_CONF, "Tx ring size for port 2");
+MLX4_EN_PARM_INT(rx_ring_size1, MLX4_EN_AUTO_CONF, "Rx ring size for port 1");
+MLX4_EN_PARM_INT(rx_ring_size2, MLX4_EN_AUTO_CONF, "Rx ring size for port 2");
+
+
+int mlx4_en_get_profile(struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_profile *params = &mdev->profile;
+
+	params->rx_moder_cnt = min_t(int, rx_moder_cnt, MLX4_EN_AUTO_CONF);
+	params->rx_moder_time = min_t(int, rx_moder_time, MLX4_EN_AUTO_CONF);
+	params->auto_moder = auto_moder;
+	params->rss_xor = (rss_xor != 0);
+	params->rss_mask = rss_mask & 0x1f;
+	params->num_lro = min_t(int, num_lro , MLX4_EN_MAX_LRO_DESCRIPTORS);
+	params->rx_pause = pprx;
+	params->rx_ppp = pfcrx;
+	params->tx_pause = pptx;
+	params->tx_ppp = pfctx;
+	if (params->rx_ppp || params->tx_ppp) {
+		params->prof[1].tx_ring_num = MLX4_EN_TX_RING_NUM;
+		params->prof[2].tx_ring_num = MLX4_EN_TX_RING_NUM;
+	} else {
+		params->prof[1].tx_ring_num = 1;
+		params->prof[2].tx_ring_num = 1;
+	}
+	params->prof[1].rx_ring_num = min_t(int, rx_ring_num1, MAX_RX_RINGS);
+	params->prof[2].rx_ring_num = min_t(int, rx_ring_num2, MAX_RX_RINGS);
+
+	if (tx_ring_size1 == MLX4_EN_AUTO_CONF)
+		tx_ring_size1 = MLX4_EN_DEF_TX_RING_SIZE;
+	params->prof[1].tx_ring_size =
+		(tx_ring_size1 < MLX4_EN_MIN_TX_SIZE) ?
+		 MLX4_EN_MIN_TX_SIZE : roundup_pow_of_two(tx_ring_size1);
+
+	if (tx_ring_size2 == MLX4_EN_AUTO_CONF)
+		tx_ring_size2 = MLX4_EN_DEF_TX_RING_SIZE;
+	params->prof[2].tx_ring_size =
+		(tx_ring_size2 < MLX4_EN_MIN_TX_SIZE) ?
+		 MLX4_EN_MIN_TX_SIZE : roundup_pow_of_two(tx_ring_size2);
+
+	if (rx_ring_size1 == MLX4_EN_AUTO_CONF)
+		rx_ring_size1 = MLX4_EN_DEF_RX_RING_SIZE;
+	params->prof[1].rx_ring_size =
+		(rx_ring_size1 < MLX4_EN_MIN_RX_SIZE) ?
+		 MLX4_EN_MIN_RX_SIZE : roundup_pow_of_two(rx_ring_size1);
+
+	if (rx_ring_size2 == MLX4_EN_AUTO_CONF)
+		rx_ring_size2 = MLX4_EN_DEF_RX_RING_SIZE;
+	params->prof[2].rx_ring_size =
+		(rx_ring_size2 < MLX4_EN_MIN_RX_SIZE) ?
+		 MLX4_EN_MIN_RX_SIZE : roundup_pow_of_two(rx_ring_size2);
+	return 0;
+}
+
+
+/*
+ * Ethtool support
+ */
+
+static void mlx4_en_update_lro_stats(struct mlx4_en_priv *priv)
+{
+	int i;
+
+	priv->port_stats.lro_aggregated = 0;
+	priv->port_stats.lro_flushed = 0;
+	priv->port_stats.lro_no_desc = 0;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->port_stats.lro_aggregated += priv->rx_ring[i].lro.stats.aggregated;
+		priv->port_stats.lro_flushed += priv->rx_ring[i].lro.stats.flushed;
+		priv->port_stats.lro_no_desc += priv->rx_ring[i].lro.stats.no_desc;
+	}
+}
+
+static void
+mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	sprintf(drvinfo->driver, DRV_NAME " (%s)", mdev->dev->board_id);
+	strncpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")", 32);
+	sprintf(drvinfo->fw_version, "%d.%d.%d",
+		(u16) (mdev->dev->caps.fw_ver >> 32),
+		(u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff),
+		(u16) (mdev->dev->caps.fw_ver & 0xffff));
+	strncpy(drvinfo->bus_info, pci_name(mdev->dev->pdev), 32);
+	drvinfo->n_stats = 0;
+	drvinfo->regdump_len = 0;
+	drvinfo->eedump_len = 0;
+}
+
+static u32 mlx4_en_get_tso(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_TSO) != 0;
+}
+
+static int mlx4_en_set_tso(struct net_device *dev, u32 data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (data) {
+		if (!priv->mdev->LSO_support)
+			return -EPERM;
+		dev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
+	} else
+		dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+	return 0;
+}
+
+static u32 mlx4_en_get_rx_csum(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	return priv->rx_csum;
+}
+
+static int mlx4_en_set_rx_csum(struct net_device *dev, u32 data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	priv->rx_csum = (data != 0);
+	return 0;
+}
+
+static const char main_strings[][ETH_GSTRING_LEN] = {
+	"rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors",
+	"tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions",
+	"rx_length_errors", "rx_over_errors", "rx_crc_errors",
+	"rx_frame_errors", "rx_fifo_errors", "rx_missed_errors",
+	"tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors",
+	"tx_heartbeat_errors", "tx_window_errors",
+
+	/* port statistics */
+	"lro_aggregated", "lro_flushed", "lro_no_desc", "tso_packets",
+	"queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed",
+	"rx_csum_good", "rx_csum_none", "tx_chksum_offload",
+
+	/* packet statistics */
+	"broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3",
+	"rx_prio_4", "rx_prio_5", "rx_prio_6", "rx_prio_7", "tx_prio_0",
+	"tx_prio_1", "tx_prio_2", "tx_prio_3", "tx_prio_4", "tx_prio_5",
+	"tx_prio_6", "tx_prio_7",
+};
+#define NUM_MAIN_STATS	21
+#define NUM_ALL_STATS	(NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS)
+
+static u32 mlx4_en_get_msglevel(struct net_device *dev)
+{
+	return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable;
+}
+
+static void mlx4_en_set_msglevel(struct net_device *dev, u32 val)
+{
+	((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val;
+}
+
+static void mlx4_en_get_wol(struct net_device *netdev,
+			    struct ethtool_wolinfo *wol)
+{
+	wol->supported = 0;
+	wol->wolopts = 0;
+
+	return;
+}
+
+static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (sset != ETH_SS_STATS)
+		return -EOPNOTSUPP;
+
+	return NUM_ALL_STATS + (priv->tx_ring_num + priv->rx_ring_num) * 2;
+}
+
+static void mlx4_en_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, uint64_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i;
+
+	spin_lock_bh(&priv->stats_lock);
+
+	mlx4_en_update_lro_stats(priv);
+
+	for (i = 0; i < NUM_MAIN_STATS; i++)
+		data[index++] = ((unsigned long *) &priv->stats)[i];
+	for (i = 0; i < NUM_PORT_STATS; i++)
+		data[index++] = ((unsigned long *) &priv->port_stats)[i];
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		data[index++] = priv->tx_ring[i].packets;
+		data[index++] = priv->tx_ring[i].bytes;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		data[index++] = priv->rx_ring[i].packets;
+		data[index++] = priv->rx_ring[i].bytes;
+	}
+	for (i = 0; i < NUM_PKT_STATS; i++)
+		data[index++] = ((unsigned long *) &priv->pkstats)[i];
+	spin_unlock_bh(&priv->stats_lock);
+
+}
+
+static void mlx4_en_get_strings(struct net_device *dev,
+				uint32_t stringset, uint8_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i;
+
+	if (stringset != ETH_SS_STATS)
+		return;
+
+	/* Add main counters */
+	for (i = 0; i < NUM_MAIN_STATS; i++)
+		strcpy(data + (index++) * ETH_GSTRING_LEN, main_strings[i]);
+	for (i = 0; i < NUM_PORT_STATS; i++)
+		strcpy(data + (index++) * ETH_GSTRING_LEN,
+			main_strings[i + NUM_MAIN_STATS]);
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		sprintf(data + (index++) * ETH_GSTRING_LEN,
+			"tx%d_packets", i);
+		sprintf(data + (index++) * ETH_GSTRING_LEN,
+			"tx%d_bytes", i);
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		sprintf(data + (index++) * ETH_GSTRING_LEN,
+			"rx%d_packets", i);
+		sprintf(data + (index++) * ETH_GSTRING_LEN,
+			"rx%d_bytes", i);
+	}
+	for (i = 0; i < NUM_PKT_STATS; i++)
+		strcpy(data + (index++) * ETH_GSTRING_LEN,
+			main_strings[i + NUM_MAIN_STATS + NUM_PORT_STATS]);
+}
+
+static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	cmd->autoneg = AUTONEG_DISABLE;
+	cmd->supported = SUPPORTED_10000baseT_Full;
+	cmd->advertising = SUPPORTED_10000baseT_Full;
+	if (netif_carrier_ok(dev)) {
+		cmd->speed = SPEED_10000;
+		cmd->duplex = DUPLEX_FULL;
+	} else {
+		cmd->speed = -1;
+		cmd->duplex = -1;
+	}
+	return 0;
+}
+
+static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	if ((cmd->autoneg == AUTONEG_ENABLE) ||
+	    (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL))
+		return -EINVAL;
+
+	/* Nothing to change */
+	return 0;
+}
+
+static int mlx4_en_get_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	coal->tx_coalesce_usecs = 0;
+	coal->tx_max_coalesced_frames = 0;
+	coal->rx_coalesce_usecs = priv->rx_usecs;
+	coal->rx_max_coalesced_frames = priv->rx_frames;
+
+	coal->pkt_rate_low = priv->pkt_rate_low;
+	coal->rx_coalesce_usecs_low = priv->rx_usecs_low;
+	coal->pkt_rate_high = priv->pkt_rate_high;
+	coal->rx_coalesce_usecs_high = priv->rx_usecs_high;
+	coal->rate_sample_interval = priv->sample_interval;
+	coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal;
+	return 0;
+}
+
+static int mlx4_en_set_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int err, i;
+
+	priv->rx_frames = (coal->rx_max_coalesced_frames ==
+			   MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TARGET /
+				priv->dev->mtu + 1 :
+				coal->rx_max_coalesced_frames;
+	priv->rx_usecs = (coal->rx_coalesce_usecs ==
+			  MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TIME :
+				coal->rx_coalesce_usecs;
+
+	/* Set adaptive coalescing params */
+	priv->pkt_rate_low = coal->pkt_rate_low;
+	priv->rx_usecs_low = coal->rx_coalesce_usecs_low;
+	priv->pkt_rate_high = coal->pkt_rate_high;
+	priv->rx_usecs_high = coal->rx_coalesce_usecs_high;
+	priv->sample_interval = coal->rate_sample_interval;
+	priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+	priv->last_moder_time = MLX4_EN_AUTO_CONF;
+	if (priv->adaptive_rx_coal)
+		return 0;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_cq[i].moder_cnt = priv->rx_frames;
+		priv->rx_cq[i].moder_time = priv->rx_usecs;
+		err = mlx4_en_set_cq_moder(priv, &priv->rx_cq[i]);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int mlx4_en_set_pauseparam(struct net_device *dev,
+				struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	mdev->profile.tx_pause = pause->tx_pause != 0;
+	mdev->profile.rx_pause = pause->rx_pause != 0;
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    mdev->profile.tx_pause,
+				    mdev->profile.tx_ppp,
+				    mdev->profile.rx_pause,
+				    mdev->profile.rx_ppp);
+	if (err)
+		mlx4_err(mdev, "Failed setting pause params to\n");
+
+	return err;
+}
+
+static void mlx4_en_get_pauseparam(struct net_device *dev,
+				 struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	pause->tx_pause = mdev->profile.tx_pause;
+	pause->rx_pause = mdev->profile.rx_pause;
+}
+
+static void mlx4_en_get_ringparam(struct net_device *dev,
+				  struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	memset(param, 0, sizeof(*param));
+	param->rx_max_pending = mdev->dev->caps.max_rq_sg;
+	param->tx_max_pending = mdev->dev->caps.max_sq_sg;
+	param->rx_pending = mdev->profile.prof[priv->port].rx_ring_size;
+	param->tx_pending = mdev->profile.prof[priv->port].tx_ring_size;
+}
+
+const struct ethtool_ops mlx4_en_ethtool_ops = {
+	.get_drvinfo = mlx4_en_get_drvinfo,
+	.get_settings = mlx4_en_get_settings,
+	.set_settings = mlx4_en_set_settings,
+#ifdef NETIF_F_TSO
+	.get_tso = mlx4_en_get_tso,
+	.set_tso = mlx4_en_set_tso,
+#endif
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = ethtool_op_set_sg,
+	.get_link = ethtool_op_get_link,
+	.get_rx_csum = mlx4_en_get_rx_csum,
+	.set_rx_csum = mlx4_en_set_rx_csum,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = ethtool_op_set_tx_ipv6_csum,
+	.get_strings = mlx4_en_get_strings,
+	.get_sset_count = mlx4_en_get_sset_count,
+	.get_ethtool_stats = mlx4_en_get_ethtool_stats,
+	.get_wol = mlx4_en_get_wol,
+	.get_msglevel = mlx4_en_get_msglevel,
+	.set_msglevel = mlx4_en_set_msglevel,
+	.get_coalesce = mlx4_en_get_coalesce,
+	.set_coalesce = mlx4_en_set_coalesce,
+	.get_pauseparam = mlx4_en_get_pauseparam,
+	.set_pauseparam = mlx4_en_set_pauseparam,
+	.get_ringparam = mlx4_en_get_ringparam,
+	.get_flags = ethtool_op_get_flags,
+	.set_flags = ethtool_op_set_flags,
+};
+
+
+
+
+
diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c
new file mode 100644
index 00000000000..c5a4c038975
--- /dev/null
+++ b/drivers/net/mlx4/en_port.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/if_vlan.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "en_port.h"
+#include "mlx4_en.h"
+
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
+			u64 mac, u64 clear, u8 mode)
+{
+	return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
+			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, struct vlan_group *grp)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_vlan_fltr_mbox *filter;
+	int i;
+	int j;
+	int index = 0;
+	u32 entry;
+	int err = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	filter = mailbox->buf;
+	if (grp) {
+		memset(filter, 0, sizeof *filter);
+		for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) {
+			entry = 0;
+			for (j = 0; j < 32; j++)
+				if (vlan_group_get_device(grp, index++))
+					entry |= 1 << j;
+			filter->entry[i] = cpu_to_be32(entry);
+		}
+	} else {
+		/* When no vlans are configured we block all vlans */
+		memset(filter, 0, sizeof(*filter));
+	}
+	err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR,
+		       MLX4_CMD_TIME_CLASS_B);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	int err;
+	u32 in_mod;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	context->flags = SET_PORT_GEN_ALL_VALID;
+	context->mtu = cpu_to_be16(mtu);
+	context->pptx = (pptx * (!pfctx)) << 7;
+	context->pfctx = pfctx;
+	context->pprx = (pprx * (!pfcrx)) << 7;
+	context->pfcrx = pfcrx;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_rqp_calc_context *context;
+	int err;
+	u32 in_mod;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	context->base_qpn = cpu_to_be32(base_qpn);
+	context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | base_qpn);
+	context->mcast = cpu_to_be32(1 << SET_PORT_PROMISC_SHIFT | base_qpn);
+	context->intra_no_vlan = 0;
+	context->no_vlan = MLX4_NO_VLAN_IDX;
+	context->intra_vlan_miss = 0;
+	context->vlan_miss = MLX4_VLAN_MISS_IDX;
+
+	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
+{
+	struct mlx4_en_stat_out_mbox *mlx4_en_stats;
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct net_device_stats *stats = &priv->stats;
+	struct mlx4_cmd_mailbox *mailbox;
+	u64 in_mod = reset << 8 | port;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, sizeof(*mlx4_en_stats));
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
+			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B);
+	if (err)
+		goto out;
+
+	mlx4_en_stats = mailbox->buf;
+
+	spin_lock_bh(&priv->stats_lock);
+
+	stats->rx_packets = be32_to_cpu(mlx4_en_stats->RTOTFRMS) -
+			    be32_to_cpu(mlx4_en_stats->RDROP);
+	stats->tx_packets = be64_to_cpu(mlx4_en_stats->TTOT_prio_0) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_1) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_2) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_3) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_4) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_5) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_6) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_prio_7) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_novlan) +
+			    be64_to_cpu(mlx4_en_stats->TTOT_loopbk);
+	stats->rx_bytes = be64_to_cpu(mlx4_en_stats->ROCT_prio_0) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_1) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_2) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_3) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_4) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_5) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_6) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_prio_7) +
+			  be64_to_cpu(mlx4_en_stats->ROCT_novlan);
+
+	stats->tx_bytes = be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_0) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_1) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_2) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_3) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_4) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_5) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_6) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_prio_7) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_novlan) +
+			  be64_to_cpu(mlx4_en_stats->TTTLOCT_loopbk);
+
+	stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) +
+			   be32_to_cpu(mlx4_en_stats->RdropLength) +
+			   be32_to_cpu(mlx4_en_stats->RJBBR) +
+			   be32_to_cpu(mlx4_en_stats->RCRC) +
+			   be32_to_cpu(mlx4_en_stats->RRUNT);
+	stats->tx_errors = be32_to_cpu(mlx4_en_stats->TDROP);
+	stats->multicast = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_1) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_2) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_3) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_4) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_5) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_6) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_7) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_novlan);
+	stats->collisions = 0;
+	stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
+	stats->rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
+	stats->rx_frame_errors = 0;
+	stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->rx_missed_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->tx_aborted_errors = 0;
+	stats->tx_carrier_errors = 0;
+	stats->tx_fifo_errors = 0;
+	stats->tx_heartbeat_errors = 0;
+	stats->tx_window_errors = 0;
+
+	priv->pkstats.broadcast =
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_novlan);
+	priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
+	priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
+	priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
+	priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
+	priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
+	priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
+	priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
+	priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
+	priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
+	priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
+	priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
+	priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
+	priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
+	priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
+	priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
+	priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+	spin_unlock_bh(&priv->stats_lock);
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
+
diff --git a/drivers/net/mlx4/en_port.h b/drivers/net/mlx4/en_port.h
new file mode 100644
index 00000000000..e6477f12beb
--- /dev/null
+++ b/drivers/net/mlx4/en_port.h
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_PORT_H_
+#define _MLX4_EN_PORT_H_
+
+
+#define SET_PORT_GEN_ALL_VALID	0x7
+#define SET_PORT_PROMISC_SHIFT	31
+
+enum {
+	MLX4_CMD_SET_VLAN_FLTR  = 0x47,
+	MLX4_CMD_SET_MCAST_FLTR = 0x48,
+	MLX4_CMD_DUMP_ETH_STATS = 0x49,
+};
+
+struct mlx4_set_port_general_context {
+	u8 reserved[3];
+	u8 flags;
+	u16 reserved2;
+	__be16 mtu;
+	u8 pptx;
+	u8 pfctx;
+	u16 reserved3;
+	u8 pprx;
+	u8 pfcrx;
+	u16 reserved4;
+};
+
+struct mlx4_set_port_rqp_calc_context {
+	__be32 base_qpn;
+	__be32 flags;
+	u8 reserved[3];
+	u8 mac_miss;
+	u8 intra_no_vlan;
+	u8 no_vlan;
+	u8 intra_vlan_miss;
+	u8 vlan_miss;
+	u8 reserved2[3];
+	u8 no_vlan_prio;
+	__be32 promisc;
+	__be32 mcast;
+};
+
+#define VLAN_FLTR_SIZE	128
+struct mlx4_set_vlan_fltr_mbox {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+
+struct mlx4_en_stat_out_mbox {
+	/* Received frames with a length of 64 octets */
+	__be64 R64_prio_0;
+	__be64 R64_prio_1;
+	__be64 R64_prio_2;
+	__be64 R64_prio_3;
+	__be64 R64_prio_4;
+	__be64 R64_prio_5;
+	__be64 R64_prio_6;
+	__be64 R64_prio_7;
+	__be64 R64_novlan;
+	/* Received frames with a length of 127 octets */
+	__be64 R127_prio_0;
+	__be64 R127_prio_1;
+	__be64 R127_prio_2;
+	__be64 R127_prio_3;
+	__be64 R127_prio_4;
+	__be64 R127_prio_5;
+	__be64 R127_prio_6;
+	__be64 R127_prio_7;
+	__be64 R127_novlan;
+	/* Received frames with a length of 255 octets */
+	__be64 R255_prio_0;
+	__be64 R255_prio_1;
+	__be64 R255_prio_2;
+	__be64 R255_prio_3;
+	__be64 R255_prio_4;
+	__be64 R255_prio_5;
+	__be64 R255_prio_6;
+	__be64 R255_prio_7;
+	__be64 R255_novlan;
+	/* Received frames with a length of 511 octets */
+	__be64 R511_prio_0;
+	__be64 R511_prio_1;
+	__be64 R511_prio_2;
+	__be64 R511_prio_3;
+	__be64 R511_prio_4;
+	__be64 R511_prio_5;
+	__be64 R511_prio_6;
+	__be64 R511_prio_7;
+	__be64 R511_novlan;
+	/* Received frames with a length of 1023 octets */
+	__be64 R1023_prio_0;
+	__be64 R1023_prio_1;
+	__be64 R1023_prio_2;
+	__be64 R1023_prio_3;
+	__be64 R1023_prio_4;
+	__be64 R1023_prio_5;
+	__be64 R1023_prio_6;
+	__be64 R1023_prio_7;
+	__be64 R1023_novlan;
+	/* Received frames with a length of 1518 octets */
+	__be64 R1518_prio_0;
+	__be64 R1518_prio_1;
+	__be64 R1518_prio_2;
+	__be64 R1518_prio_3;
+	__be64 R1518_prio_4;
+	__be64 R1518_prio_5;
+	__be64 R1518_prio_6;
+	__be64 R1518_prio_7;
+	__be64 R1518_novlan;
+	/* Received frames with a length of 1522 octets */
+	__be64 R1522_prio_0;
+	__be64 R1522_prio_1;
+	__be64 R1522_prio_2;
+	__be64 R1522_prio_3;
+	__be64 R1522_prio_4;
+	__be64 R1522_prio_5;
+	__be64 R1522_prio_6;
+	__be64 R1522_prio_7;
+	__be64 R1522_novlan;
+	/* Received frames with a length of 1548 octets */
+	__be64 R1548_prio_0;
+	__be64 R1548_prio_1;
+	__be64 R1548_prio_2;
+	__be64 R1548_prio_3;
+	__be64 R1548_prio_4;
+	__be64 R1548_prio_5;
+	__be64 R1548_prio_6;
+	__be64 R1548_prio_7;
+	__be64 R1548_novlan;
+	/* Received frames with a length of 1548 < octets < MTU */
+	__be64 R2MTU_prio_0;
+	__be64 R2MTU_prio_1;
+	__be64 R2MTU_prio_2;
+	__be64 R2MTU_prio_3;
+	__be64 R2MTU_prio_4;
+	__be64 R2MTU_prio_5;
+	__be64 R2MTU_prio_6;
+	__be64 R2MTU_prio_7;
+	__be64 R2MTU_novlan;
+	/* Received frames with a length of MTU< octets and good CRC */
+	__be64 RGIANT_prio_0;
+	__be64 RGIANT_prio_1;
+	__be64 RGIANT_prio_2;
+	__be64 RGIANT_prio_3;
+	__be64 RGIANT_prio_4;
+	__be64 RGIANT_prio_5;
+	__be64 RGIANT_prio_6;
+	__be64 RGIANT_prio_7;
+	__be64 RGIANT_novlan;
+	/* Received broadcast frames with good CRC */
+	__be64 RBCAST_prio_0;
+	__be64 RBCAST_prio_1;
+	__be64 RBCAST_prio_2;
+	__be64 RBCAST_prio_3;
+	__be64 RBCAST_prio_4;
+	__be64 RBCAST_prio_5;
+	__be64 RBCAST_prio_6;
+	__be64 RBCAST_prio_7;
+	__be64 RBCAST_novlan;
+	/* Received multicast frames with good CRC */
+	__be64 MCAST_prio_0;
+	__be64 MCAST_prio_1;
+	__be64 MCAST_prio_2;
+	__be64 MCAST_prio_3;
+	__be64 MCAST_prio_4;
+	__be64 MCAST_prio_5;
+	__be64 MCAST_prio_6;
+	__be64 MCAST_prio_7;
+	__be64 MCAST_novlan;
+	/* Received unicast not short or GIANT frames with good CRC */
+	__be64 RTOTG_prio_0;
+	__be64 RTOTG_prio_1;
+	__be64 RTOTG_prio_2;
+	__be64 RTOTG_prio_3;
+	__be64 RTOTG_prio_4;
+	__be64 RTOTG_prio_5;
+	__be64 RTOTG_prio_6;
+	__be64 RTOTG_prio_7;
+	__be64 RTOTG_novlan;
+
+	/* Count of total octets of received frames, includes framing characters */
+	__be64 RTTLOCT_prio_0;
+	/* Count of total octets of received frames, not including framing
+	   characters */
+	__be64 RTTLOCT_NOFRM_prio_0;
+	/* Count of Total number of octets received
+	   (only for frames without errors) */
+	__be64 ROCT_prio_0;
+
+	__be64 RTTLOCT_prio_1;
+	__be64 RTTLOCT_NOFRM_prio_1;
+	__be64 ROCT_prio_1;
+
+	__be64 RTTLOCT_prio_2;
+	__be64 RTTLOCT_NOFRM_prio_2;
+	__be64 ROCT_prio_2;
+
+	__be64 RTTLOCT_prio_3;
+	__be64 RTTLOCT_NOFRM_prio_3;
+	__be64 ROCT_prio_3;
+
+	__be64 RTTLOCT_prio_4;
+	__be64 RTTLOCT_NOFRM_prio_4;
+	__be64 ROCT_prio_4;
+
+	__be64 RTTLOCT_prio_5;
+	__be64 RTTLOCT_NOFRM_prio_5;
+	__be64 ROCT_prio_5;
+
+	__be64 RTTLOCT_prio_6;
+	__be64 RTTLOCT_NOFRM_prio_6;
+	__be64 ROCT_prio_6;
+
+	__be64 RTTLOCT_prio_7;
+	__be64 RTTLOCT_NOFRM_prio_7;
+	__be64 ROCT_prio_7;
+
+	__be64 RTTLOCT_novlan;
+	__be64 RTTLOCT_NOFRM_novlan;
+	__be64 ROCT_novlan;
+
+	/* Count of Total received frames including bad frames */
+	__be64 RTOT_prio_0;
+	/* Count of  Total number of received frames with 802.1Q encapsulation */
+	__be64 R1Q_prio_0;
+	__be64 reserved1;
+
+	__be64 RTOT_prio_1;
+	__be64 R1Q_prio_1;
+	__be64 reserved2;
+
+	__be64 RTOT_prio_2;
+	__be64 R1Q_prio_2;
+	__be64 reserved3;
+
+	__be64 RTOT_prio_3;
+	__be64 R1Q_prio_3;
+	__be64 reserved4;
+
+	__be64 RTOT_prio_4;
+	__be64 R1Q_prio_4;
+	__be64 reserved5;
+
+	__be64 RTOT_prio_5;
+	__be64 R1Q_prio_5;
+	__be64 reserved6;
+
+	__be64 RTOT_prio_6;
+	__be64 R1Q_prio_6;
+	__be64 reserved7;
+
+	__be64 RTOT_prio_7;
+	__be64 R1Q_prio_7;
+	__be64 reserved8;
+
+	__be64 RTOT_novlan;
+	__be64 R1Q_novlan;
+	__be64 reserved9;
+
+	/* Total number of Successfully Received Control Frames */
+	__be64 RCNTL;
+	__be64 reserved10;
+	__be64 reserved11;
+	__be64 reserved12;
+	/* Count of received frames with a length/type field  value between 46
+	   (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames),
+	   inclusive */
+	__be64 RInRangeLengthErr;
+	/* Count of received frames with length/type field between 1501 and 1535
+	   decimal, inclusive */
+	__be64 ROutRangeLengthErr;
+	/* Count of received frames that are longer than max allowed size for
+	   802.3 frames (1518/1522) */
+	__be64 RFrmTooLong;
+	/* Count frames received with PCS error */
+	__be64 PCS;
+
+	/* Transmit frames with a length of 64 octets */
+	__be64 T64_prio_0;
+	__be64 T64_prio_1;
+	__be64 T64_prio_2;
+	__be64 T64_prio_3;
+	__be64 T64_prio_4;
+	__be64 T64_prio_5;
+	__be64 T64_prio_6;
+	__be64 T64_prio_7;
+	__be64 T64_novlan;
+	__be64 T64_loopbk;
+	/* Transmit frames with a length of 65 to 127 octets. */
+	__be64 T127_prio_0;
+	__be64 T127_prio_1;
+	__be64 T127_prio_2;
+	__be64 T127_prio_3;
+	__be64 T127_prio_4;
+	__be64 T127_prio_5;
+	__be64 T127_prio_6;
+	__be64 T127_prio_7;
+	__be64 T127_novlan;
+	__be64 T127_loopbk;
+	/* Transmit frames with a length of 128 to 255 octets */
+	__be64 T255_prio_0;
+	__be64 T255_prio_1;
+	__be64 T255_prio_2;
+	__be64 T255_prio_3;
+	__be64 T255_prio_4;
+	__be64 T255_prio_5;
+	__be64 T255_prio_6;
+	__be64 T255_prio_7;
+	__be64 T255_novlan;
+	__be64 T255_loopbk;
+	/* Transmit frames with a length of 256 to 511 octets */
+	__be64 T511_prio_0;
+	__be64 T511_prio_1;
+	__be64 T511_prio_2;
+	__be64 T511_prio_3;
+	__be64 T511_prio_4;
+	__be64 T511_prio_5;
+	__be64 T511_prio_6;
+	__be64 T511_prio_7;
+	__be64 T511_novlan;
+	__be64 T511_loopbk;
+	/* Transmit frames with a length of 512 to 1023 octets */
+	__be64 T1023_prio_0;
+	__be64 T1023_prio_1;
+	__be64 T1023_prio_2;
+	__be64 T1023_prio_3;
+	__be64 T1023_prio_4;
+	__be64 T1023_prio_5;
+	__be64 T1023_prio_6;
+	__be64 T1023_prio_7;
+	__be64 T1023_novlan;
+	__be64 T1023_loopbk;
+	/* Transmit frames with a length of 1024 to 1518 octets */
+	__be64 T1518_prio_0;
+	__be64 T1518_prio_1;
+	__be64 T1518_prio_2;
+	__be64 T1518_prio_3;
+	__be64 T1518_prio_4;
+	__be64 T1518_prio_5;
+	__be64 T1518_prio_6;
+	__be64 T1518_prio_7;
+	__be64 T1518_novlan;
+	__be64 T1518_loopbk;
+	/* Counts transmit frames with a length of 1519 to 1522 bytes */
+	__be64 T1522_prio_0;
+	__be64 T1522_prio_1;
+	__be64 T1522_prio_2;
+	__be64 T1522_prio_3;
+	__be64 T1522_prio_4;
+	__be64 T1522_prio_5;
+	__be64 T1522_prio_6;
+	__be64 T1522_prio_7;
+	__be64 T1522_novlan;
+	__be64 T1522_loopbk;
+	/* Transmit frames with a length of 1523 to 1548 octets */
+	__be64 T1548_prio_0;
+	__be64 T1548_prio_1;
+	__be64 T1548_prio_2;
+	__be64 T1548_prio_3;
+	__be64 T1548_prio_4;
+	__be64 T1548_prio_5;
+	__be64 T1548_prio_6;
+	__be64 T1548_prio_7;
+	__be64 T1548_novlan;
+	__be64 T1548_loopbk;
+	/* Counts transmit frames with a length of 1549 to MTU bytes */
+	__be64 T2MTU_prio_0;
+	__be64 T2MTU_prio_1;
+	__be64 T2MTU_prio_2;
+	__be64 T2MTU_prio_3;
+	__be64 T2MTU_prio_4;
+	__be64 T2MTU_prio_5;
+	__be64 T2MTU_prio_6;
+	__be64 T2MTU_prio_7;
+	__be64 T2MTU_novlan;
+	__be64 T2MTU_loopbk;
+	/* Transmit frames with a length greater than MTU octets and a good CRC. */
+	__be64 TGIANT_prio_0;
+	__be64 TGIANT_prio_1;
+	__be64 TGIANT_prio_2;
+	__be64 TGIANT_prio_3;
+	__be64 TGIANT_prio_4;
+	__be64 TGIANT_prio_5;
+	__be64 TGIANT_prio_6;
+	__be64 TGIANT_prio_7;
+	__be64 TGIANT_novlan;
+	__be64 TGIANT_loopbk;
+	/* Transmit broadcast frames with a good CRC */
+	__be64 TBCAST_prio_0;
+	__be64 TBCAST_prio_1;
+	__be64 TBCAST_prio_2;
+	__be64 TBCAST_prio_3;
+	__be64 TBCAST_prio_4;
+	__be64 TBCAST_prio_5;
+	__be64 TBCAST_prio_6;
+	__be64 TBCAST_prio_7;
+	__be64 TBCAST_novlan;
+	__be64 TBCAST_loopbk;
+	/* Transmit multicast frames with a good CRC */
+	__be64 TMCAST_prio_0;
+	__be64 TMCAST_prio_1;
+	__be64 TMCAST_prio_2;
+	__be64 TMCAST_prio_3;
+	__be64 TMCAST_prio_4;
+	__be64 TMCAST_prio_5;
+	__be64 TMCAST_prio_6;
+	__be64 TMCAST_prio_7;
+	__be64 TMCAST_novlan;
+	__be64 TMCAST_loopbk;
+	/* Transmit good frames that are neither broadcast nor multicast */
+	__be64 TTOTG_prio_0;
+	__be64 TTOTG_prio_1;
+	__be64 TTOTG_prio_2;
+	__be64 TTOTG_prio_3;
+	__be64 TTOTG_prio_4;
+	__be64 TTOTG_prio_5;
+	__be64 TTOTG_prio_6;
+	__be64 TTOTG_prio_7;
+	__be64 TTOTG_novlan;
+	__be64 TTOTG_loopbk;
+
+	/* total octets of transmitted frames, including framing characters */
+	__be64 TTTLOCT_prio_0;
+	/* total octets of transmitted frames, not including framing characters */
+	__be64 TTTLOCT_NOFRM_prio_0;
+	/* ifOutOctets */
+	__be64 TOCT_prio_0;
+
+	__be64 TTTLOCT_prio_1;
+	__be64 TTTLOCT_NOFRM_prio_1;
+	__be64 TOCT_prio_1;
+
+	__be64 TTTLOCT_prio_2;
+	__be64 TTTLOCT_NOFRM_prio_2;
+	__be64 TOCT_prio_2;
+
+	__be64 TTTLOCT_prio_3;
+	__be64 TTTLOCT_NOFRM_prio_3;
+	__be64 TOCT_prio_3;
+
+	__be64 TTTLOCT_prio_4;
+	__be64 TTTLOCT_NOFRM_prio_4;
+	__be64 TOCT_prio_4;
+
+	__be64 TTTLOCT_prio_5;
+	__be64 TTTLOCT_NOFRM_prio_5;
+	__be64 TOCT_prio_5;
+
+	__be64 TTTLOCT_prio_6;
+	__be64 TTTLOCT_NOFRM_prio_6;
+	__be64 TOCT_prio_6;
+
+	__be64 TTTLOCT_prio_7;
+	__be64 TTTLOCT_NOFRM_prio_7;
+	__be64 TOCT_prio_7;
+
+	__be64 TTTLOCT_novlan;
+	__be64 TTTLOCT_NOFRM_novlan;
+	__be64 TOCT_novlan;
+
+	__be64 TTTLOCT_loopbk;
+	__be64 TTTLOCT_NOFRM_loopbk;
+	__be64 TOCT_loopbk;
+
+	/* Total frames transmitted with a good CRC that are not aborted  */
+	__be64 TTOT_prio_0;
+	/* Total number of frames transmitted with 802.1Q encapsulation */
+	__be64 T1Q_prio_0;
+	__be64 reserved13;
+
+	__be64 TTOT_prio_1;
+	__be64 T1Q_prio_1;
+	__be64 reserved14;
+
+	__be64 TTOT_prio_2;
+	__be64 T1Q_prio_2;
+	__be64 reserved15;
+
+	__be64 TTOT_prio_3;
+	__be64 T1Q_prio_3;
+	__be64 reserved16;
+
+	__be64 TTOT_prio_4;
+	__be64 T1Q_prio_4;
+	__be64 reserved17;
+
+	__be64 TTOT_prio_5;
+	__be64 T1Q_prio_5;
+	__be64 reserved18;
+
+	__be64 TTOT_prio_6;
+	__be64 T1Q_prio_6;
+	__be64 reserved19;
+
+	__be64 TTOT_prio_7;
+	__be64 T1Q_prio_7;
+	__be64 reserved20;
+
+	__be64 TTOT_novlan;
+	__be64 T1Q_novlan;
+	__be64 reserved21;
+
+	__be64 TTOT_loopbk;
+	__be64 T1Q_loopbk;
+	__be64 reserved22;
+
+	/* Received frames with a length greater than MTU octets and a bad CRC */
+	__be32 RJBBR;
+	/* Received frames with a bad CRC that are not runts, jabbers,
+	   or alignment errors */
+	__be32 RCRC;
+	/* Received frames with SFD with a length of less than 64 octets and a
+	   bad CRC */
+	__be32 RRUNT;
+	/* Received frames with a length less than 64 octets and a good CRC */
+	__be32 RSHORT;
+	/* Total Number of Received Packets Dropped */
+	__be32 RDROP;
+	/* Drop due to overflow  */
+	__be32 RdropOvflw;
+	/* Drop due to overflow */
+	__be32 RdropLength;
+	/* Total of good frames. Does not include frames received with
+	   frame-too-long, FCS, or length errors */
+	__be32 RTOTFRMS;
+	/* Total dropped Xmited packets */
+	__be32 TDROP;
+};
+
+
+#endif
diff --git a/drivers/net/mlx4/en_resources.c b/drivers/net/mlx4/en_resources.c
new file mode 100644
index 00000000000..a0545209e50
--- /dev/null
+++ b/drivers/net/mlx4/en_resources.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_en.h"
+
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+			     int is_tx, int rss, int qpn, int cqn, int srqn,
+			     struct mlx4_qp_context *context)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	memset(context, 0, sizeof *context);
+	context->flags = cpu_to_be32(7 << 16 | rss << 13);
+	context->pd = cpu_to_be32(mdev->priv_pdn);
+	context->mtu_msgmax = 0xff;
+	context->rq_size_stride = 0;
+	if (is_tx)
+		context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+	else
+		context->sq_size_stride = 1;
+	context->usr_page = cpu_to_be32(mdev->priv_uar.index);
+	context->local_qpn = cpu_to_be32(qpn);
+	context->pri_path.ackto = 1 & 0x07;
+	context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
+	context->pri_path.counter_index = 0xff;
+	context->cqn_send = cpu_to_be32(cqn);
+	context->cqn_recv = cpu_to_be32(cqn);
+	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
+	if (!rss)
+		context->srqn = cpu_to_be32(MLX4_EN_USE_SRQ | srqn);
+}
+
+
+int mlx4_en_map_buffer(struct mlx4_buf *buf)
+{
+	struct page **pages;
+	int i;
+
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+		return 0;
+
+	pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	for (i = 0; i < buf->nbufs; ++i)
+		pages[i] = virt_to_page(buf->page_list[i].buf);
+
+	buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+	kfree(pages);
+	if (!buf->direct.buf)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
+{
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+		return;
+
+	vunmap(buf->direct.buf);
+}
diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c
new file mode 100644
index 00000000000..6232227f56c
--- /dev/null
+++ b/drivers/net/mlx4/en_rx.c
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
+
+#include "mlx4_en.h"
+
+static void *get_wqe(struct mlx4_en_rx_ring *ring, int n)
+{
+	int offset = n << ring->srq.wqe_shift;
+	return ring->buf + offset;
+}
+
+static void mlx4_en_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+	return;
+}
+
+static int mlx4_en_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr,
+				   void **ip_hdr, void **tcpudp_hdr,
+				   u64 *hdr_flags, void *priv)
+{
+	*mac_hdr = page_address(frags->page) + frags->page_offset;
+	*ip_hdr = *mac_hdr + ETH_HLEN;
+	*tcpudp_hdr = (struct tcphdr *)(*ip_hdr + sizeof(struct iphdr));
+	*hdr_flags = LRO_IPV4 | LRO_TCP;
+
+	return 0;
+}
+
+static int mlx4_en_alloc_frag(struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_desc *rx_desc,
+			      struct skb_frag_struct *skb_frags,
+			      struct mlx4_en_rx_alloc *ring_alloc,
+			      int i)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+	struct mlx4_en_rx_alloc *page_alloc = &ring_alloc[i];
+	struct page *page;
+	dma_addr_t dma;
+
+	if (page_alloc->offset == frag_info->last_offset) {
+		/* Allocate new page */
+		page = alloc_pages(GFP_ATOMIC | __GFP_COMP, MLX4_EN_ALLOC_ORDER);
+		if (!page)
+			return -ENOMEM;
+
+		skb_frags[i].page = page_alloc->page;
+		skb_frags[i].page_offset = page_alloc->offset;
+		page_alloc->page = page;
+		page_alloc->offset = frag_info->frag_align;
+	} else {
+		page = page_alloc->page;
+		get_page(page);
+
+		skb_frags[i].page = page;
+		skb_frags[i].page_offset = page_alloc->offset;
+		page_alloc->offset += frag_info->frag_stride;
+	}
+	dma = pci_map_single(mdev->pdev, page_address(skb_frags[i].page) +
+			     skb_frags[i].page_offset, frag_info->frag_size,
+			     PCI_DMA_FROMDEVICE);
+	rx_desc->data[i].addr = cpu_to_be64(dma);
+	return 0;
+}
+
+static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
+				  struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_rx_alloc *page_alloc;
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++) {
+		page_alloc = &ring->page_alloc[i];
+		page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP,
+					       MLX4_EN_ALLOC_ORDER);
+		if (!page_alloc->page)
+			goto out;
+
+		page_alloc->offset = priv->frag_info[i].frag_align;
+		mlx4_dbg(DRV, priv, "Initialized allocator:%d with page:%p\n",
+			 i, page_alloc->page);
+	}
+	return 0;
+
+out:
+	while (i--) {
+		page_alloc = &ring->page_alloc[i];
+		put_page(page_alloc->page);
+		page_alloc->page = NULL;
+	}
+	return -ENOMEM;
+}
+
+static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
+				      struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_rx_alloc *page_alloc;
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++) {
+		page_alloc = &ring->page_alloc[i];
+		mlx4_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
+			 i, page_count(page_alloc->page));
+
+		put_page(page_alloc->page);
+		page_alloc->page = NULL;
+	}
+}
+
+
+static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+	struct skb_frag_struct *skb_frags = ring->rx_info +
+					    (index << priv->log_rx_info);
+	int possible_frags;
+	int i;
+
+	/* Pre-link descriptor */
+	rx_desc->next.next_wqe_index = cpu_to_be16((index + 1) & ring->size_mask);
+
+	/* Set size and memtype fields */
+	for (i = 0; i < priv->num_frags; i++) {
+		skb_frags[i].size = priv->frag_info[i].frag_size;
+		rx_desc->data[i].byte_count =
+			cpu_to_be32(priv->frag_info[i].frag_size);
+		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
+	}
+
+	/* If the number of used fragments does not fill up the ring stride,
+	 * remaining (unused) fragments must be padded with null address/size
+	 * and a special memory key */
+	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
+	for (i = priv->num_frags; i < possible_frags; i++) {
+		rx_desc->data[i].byte_count = 0;
+		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
+		rx_desc->data[i].addr = 0;
+	}
+}
+
+
+static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+				   struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
+	struct skb_frag_struct *skb_frags = ring->rx_info +
+					    (index << priv->log_rx_info);
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++)
+		if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, ring->page_alloc, i))
+			goto err;
+
+	return 0;
+
+err:
+	while (i--)
+		put_page(skb_frags[i].page);
+	return -ENOMEM;
+}
+
+static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+}
+
+static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring;
+	int ring_ind;
+	int buf_ind;
+
+	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
+		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+			ring = &priv->rx_ring[ring_ind];
+
+			if (mlx4_en_prepare_rx_desc(priv, ring,
+						    ring->actual_size)) {
+				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
+					mlx4_err(mdev, "Failed to allocate "
+						       "enough rx buffers\n");
+					return -ENOMEM;
+				} else {
+					if (netif_msg_rx_err(priv))
+						mlx4_warn(mdev,
+							  "Only %d buffers allocated\n",
+							  ring->actual_size);
+					goto out;
+				}
+			}
+			ring->actual_size++;
+			ring->prod++;
+		}
+	}
+out:
+	return 0;
+}
+
+static int mlx4_en_fill_rx_buf(struct net_device *dev,
+			       struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int num = 0;
+	int err;
+
+	while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
+		err = mlx4_en_prepare_rx_desc(priv, ring, ring->prod &
+					      ring->size_mask);
+		if (err) {
+			if (netif_msg_rx_err(priv))
+				mlx4_warn(priv->mdev,
+					  "Failed preparing rx descriptor\n");
+			priv->port_stats.rx_alloc_failed++;
+			break;
+		}
+		++num;
+		++ring->prod;
+	}
+	if ((u32) (ring->prod - ring->cons) == ring->size)
+		ring->full = 1;
+
+	return num;
+}
+
+static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct skb_frag_struct *skb_frags;
+	struct mlx4_en_rx_desc *rx_desc;
+	dma_addr_t dma;
+	int index;
+	int nr;
+
+	mlx4_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
+			ring->cons, ring->prod);
+
+	/* Unmap and free Rx buffers */
+	BUG_ON((u32) (ring->prod - ring->cons) > ring->size);
+	while (ring->cons != ring->prod) {
+		index = ring->cons & ring->size_mask;
+		rx_desc = ring->buf + (index << ring->log_stride);
+		skb_frags = ring->rx_info + (index << priv->log_rx_info);
+		mlx4_dbg(DRV, priv, "Processing descriptor:%d\n", index);
+
+		for (nr = 0; nr < priv->num_frags; nr++) {
+			mlx4_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
+			dma = be64_to_cpu(rx_desc->data[nr].addr);
+
+			mlx4_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma);
+			pci_unmap_single(mdev->pdev, dma, skb_frags[nr].size,
+					 PCI_DMA_FROMDEVICE);
+			put_page(skb_frags[nr].page);
+		}
+		++ring->cons;
+	}
+}
+
+
+void mlx4_en_rx_refill(struct work_struct *work)
+{
+	struct delayed_work *delay = container_of(work, struct delayed_work, work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 refill_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+	struct mlx4_en_rx_ring *ring;
+	int need_refill = 0;
+	int i;
+
+	mutex_lock(&mdev->state_lock);
+	if (!mdev->device_up || !priv->port_up)
+		goto out;
+
+	/* We only get here if there are no receive buffers, so we can't race
+	 * with Rx interrupts while filling buffers */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		ring = &priv->rx_ring[i];
+		if (ring->need_refill) {
+			if (mlx4_en_fill_rx_buf(dev, ring)) {
+				ring->need_refill = 0;
+				mlx4_en_update_rx_prod_db(ring);
+			} else
+				need_refill = 1;
+		}
+	}
+	if (need_refill)
+		queue_delayed_work(mdev->workqueue, &priv->refill_task, HZ);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+}
+
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring *ring, u32 size, u16 stride)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+	int tmp;
+
+	/* Sanity check SRQ size before proceeding */
+	if (size >= mdev->dev->caps.max_srq_wqes)
+		return -EINVAL;
+
+	ring->prod = 0;
+	ring->cons = 0;
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = stride;
+	ring->log_stride = ffs(ring->stride) - 1;
+	ring->buf_size = ring->size * ring->stride;
+
+	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
+					sizeof(struct skb_frag_struct));
+	ring->rx_info = vmalloc(tmp);
+	if (!ring->rx_info) {
+		mlx4_err(mdev, "Failed allocating rx_info ring\n");
+		return -ENOMEM;
+	}
+	mlx4_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n",
+		 ring->rx_info, tmp);
+
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
+				 ring->buf_size, 2 * PAGE_SIZE);
+	if (err)
+		goto err_ring;
+
+	err = mlx4_en_map_buffer(&ring->wqres.buf);
+	if (err) {
+		mlx4_err(mdev, "Failed to map RX buffer\n");
+		goto err_hwq;
+	}
+	ring->buf = ring->wqres.buf.direct.buf;
+
+	/* Configure lro mngr */
+	memset(&ring->lro, 0, sizeof(struct net_lro_mgr));
+	ring->lro.dev = priv->dev;
+	ring->lro.features = LRO_F_NAPI;
+	ring->lro.frag_align_pad = NET_IP_ALIGN;
+	ring->lro.ip_summed = CHECKSUM_UNNECESSARY;
+	ring->lro.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+	ring->lro.max_desc = mdev->profile.num_lro;
+	ring->lro.max_aggr = MAX_SKB_FRAGS;
+	ring->lro.lro_arr = kzalloc(mdev->profile.num_lro *
+				    sizeof(struct net_lro_desc),
+				    GFP_KERNEL);
+	if (!ring->lro.lro_arr) {
+		mlx4_err(mdev, "Failed to allocate lro array\n");
+		goto err_map;
+	}
+	ring->lro.get_frag_header = mlx4_en_get_frag_header;
+
+	return 0;
+
+err_map:
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+err_hwq:
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_ring:
+	vfree(ring->rx_info);
+	ring->rx_info = NULL;
+	return err;
+}
+
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_en_rx_ring *ring;
+	int i;
+	int ring_ind;
+	int err;
+	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					DS_SIZE * priv->num_frags);
+	int max_gs = (stride - sizeof(struct mlx4_wqe_srq_next_seg)) / DS_SIZE;
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = &priv->rx_ring[ring_ind];
+
+		ring->prod = 0;
+		ring->cons = 0;
+		ring->actual_size = 0;
+		ring->cqn = priv->rx_cq[ring_ind].mcq.cqn;
+
+		ring->stride = stride;
+		ring->log_stride = ffs(ring->stride) - 1;
+		ring->buf_size = ring->size * ring->stride;
+
+		memset(ring->buf, 0, ring->buf_size);
+		mlx4_en_update_rx_prod_db(ring);
+
+		/* Initailize all descriptors */
+		for (i = 0; i < ring->size; i++)
+			mlx4_en_init_rx_desc(priv, ring, i);
+
+		/* Initialize page allocators */
+		err = mlx4_en_init_allocator(priv, ring);
+		if (err) {
+			 mlx4_err(mdev, "Failed initializing ring allocator\n");
+			 goto err_allocator;
+		}
+
+		/* Fill Rx buffers */
+		ring->full = 0;
+	}
+	if (mlx4_en_fill_rx_buffers(priv))
+		goto err_buffers;
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = &priv->rx_ring[ring_ind];
+
+		mlx4_en_update_rx_prod_db(ring);
+
+		/* Configure SRQ representing the ring */
+		ring->srq.max    = ring->size;
+		ring->srq.max_gs = max_gs;
+		ring->srq.wqe_shift = ilog2(ring->stride);
+
+		for (i = 0; i < ring->srq.max; ++i) {
+			next = get_wqe(ring, i);
+			next->next_wqe_index =
+			cpu_to_be16((i + 1) & (ring->srq.max - 1));
+		}
+
+		err = mlx4_srq_alloc(mdev->dev, mdev->priv_pdn, &ring->wqres.mtt,
+				     ring->wqres.db.dma, &ring->srq);
+		if (err){
+			mlx4_err(mdev, "Failed to allocate srq\n");
+			goto err_srq;
+		}
+		ring->srq.event = mlx4_en_srq_event;
+	}
+
+	return 0;
+
+err_srq:
+	while (ring_ind >= 0) {
+		ring = &priv->rx_ring[ring_ind];
+		mlx4_srq_free(mdev->dev, &ring->srq);
+		ring_ind--;
+	}
+
+err_buffers:
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
+		mlx4_en_free_rx_buf(priv, &priv->rx_ring[ring_ind]);
+
+	ring_ind = priv->rx_ring_num - 1;
+err_allocator:
+	while (ring_ind >= 0) {
+		mlx4_en_destroy_allocator(priv, &priv->rx_ring[ring_ind]);
+		ring_ind--;
+	}
+	return err;
+}
+
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	kfree(ring->lro.lro_arr);
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	vfree(ring->rx_info);
+	ring->rx_info = NULL;
+}
+
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mlx4_srq_free(mdev->dev, &ring->srq);
+	mlx4_en_free_rx_buf(priv, ring);
+	mlx4_en_destroy_allocator(priv, ring);
+}
+
+
+/* Unmap a completed descriptor and free unused pages */
+static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+				    struct mlx4_en_rx_desc *rx_desc,
+				    struct skb_frag_struct *skb_frags,
+				    struct skb_frag_struct *skb_frags_rx,
+				    struct mlx4_en_rx_alloc *page_alloc,
+				    int length)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_frag_info *frag_info;
+	int nr;
+	dma_addr_t dma;
+
+	/* Collect used fragments while replacing them in the HW descirptors */
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		frag_info = &priv->frag_info[nr];
+		if (length <= frag_info->frag_prefix_size)
+			break;
+
+		/* Save page reference in skb */
+		skb_frags_rx[nr].page = skb_frags[nr].page;
+		skb_frags_rx[nr].size = skb_frags[nr].size;
+		skb_frags_rx[nr].page_offset = skb_frags[nr].page_offset;
+		dma = be64_to_cpu(rx_desc->data[nr].addr);
+
+		/* Allocate a replacement page */
+		if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, page_alloc, nr))
+			goto fail;
+
+		/* Unmap buffer */
+		pci_unmap_single(mdev->pdev, dma, skb_frags[nr].size,
+				 PCI_DMA_FROMDEVICE);
+	}
+	/* Adjust size of last fragment to match actual length */
+	skb_frags_rx[nr - 1].size = length -
+		priv->frag_info[nr - 1].frag_prefix_size;
+	return nr;
+
+fail:
+	/* Drop all accumulated fragments (which have already been replaced in
+	 * the descriptor) of this packet; remaining fragments are reused... */
+	while (nr > 0) {
+		nr--;
+		put_page(skb_frags_rx[nr].page);
+	}
+	return 0;
+}
+
+
+static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
+				      struct mlx4_en_rx_desc *rx_desc,
+				      struct skb_frag_struct *skb_frags,
+				      struct mlx4_en_rx_alloc *page_alloc,
+				      unsigned int length)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct sk_buff *skb;
+	void *va;
+	int used_frags;
+	dma_addr_t dma;
+
+	skb = dev_alloc_skb(SMALL_PACKET_SIZE + NET_IP_ALIGN);
+	if (!skb) {
+		mlx4_dbg(RX_ERR, priv, "Failed allocating skb\n");
+		return NULL;
+	}
+	skb->dev = priv->dev;
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb->len = length;
+	skb->truesize = length + sizeof(struct sk_buff);
+
+	/* Get pointer to first fragment so we could copy the headers into the
+	 * (linear part of the) skb */
+	va = page_address(skb_frags[0].page) + skb_frags[0].page_offset;
+
+	if (length <= SMALL_PACKET_SIZE) {
+		/* We are copying all relevant data to the skb - temporarily
+		 * synch buffers for the copy */
+		dma = be64_to_cpu(rx_desc->data[0].addr);
+		dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0,
+					      length, DMA_FROM_DEVICE);
+		skb_copy_to_linear_data(skb, va, length);
+		dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0,
+						 length, DMA_FROM_DEVICE);
+		skb->tail += length;
+	} else {
+
+		/* Move relevant fragments to skb */
+		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags,
+						      skb_shinfo(skb)->frags,
+						      page_alloc, length);
+		skb_shinfo(skb)->nr_frags = used_frags;
+
+		/* Copy headers into the skb linear buffer */
+		memcpy(skb->data, va, HEADER_COPY_SIZE);
+		skb->tail += HEADER_COPY_SIZE;
+
+		/* Skip headers in first fragment */
+		skb_shinfo(skb)->frags[0].page_offset += HEADER_COPY_SIZE;
+
+		/* Adjust size of first fragment */
+		skb_shinfo(skb)->frags[0].size -= HEADER_COPY_SIZE;
+		skb->data_len = length - HEADER_COPY_SIZE;
+	}
+	return skb;
+}
+
+static void mlx4_en_copy_desc(struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_ring *ring,
+			      int from, int to, int num)
+{
+	struct skb_frag_struct *skb_frags_from;
+	struct skb_frag_struct *skb_frags_to;
+	struct mlx4_en_rx_desc *rx_desc_from;
+	struct mlx4_en_rx_desc *rx_desc_to;
+	int from_index, to_index;
+	int nr, i;
+
+	for (i = 0; i < num; i++) {
+		from_index = (from + i) & ring->size_mask;
+		to_index = (to + i) & ring->size_mask;
+		skb_frags_from = ring->rx_info + (from_index << priv->log_rx_info);
+		skb_frags_to = ring->rx_info + (to_index << priv->log_rx_info);
+		rx_desc_from = ring->buf + (from_index << ring->log_stride);
+		rx_desc_to = ring->buf + (to_index << ring->log_stride);
+
+		for (nr = 0; nr < priv->num_frags; nr++) {
+			skb_frags_to[nr].page = skb_frags_from[nr].page;
+			skb_frags_to[nr].page_offset = skb_frags_from[nr].page_offset;
+			rx_desc_to->data[nr].addr = rx_desc_from->data[nr].addr;
+		}
+	}
+}
+
+
+int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_cqe *cqe;
+	struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
+	struct skb_frag_struct *skb_frags;
+	struct skb_frag_struct lro_frags[MLX4_EN_MAX_RX_FRAGS];
+	struct mlx4_en_rx_desc *rx_desc;
+	struct sk_buff *skb;
+	int index;
+	int nr;
+	unsigned int length;
+	int polled = 0;
+	int ip_summed;
+
+	if (!priv->port_up)
+		return 0;
+
+	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+	 * descriptor offset can be deduced from the CQE index instead of
+	 * reading 'cqe->index' */
+	index = cq->mcq.cons_index & ring->size_mask;
+	cqe = &cq->buf[index];
+
+	/* Process all completed CQEs */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+		    cq->mcq.cons_index & cq->size)) {
+
+		skb_frags = ring->rx_info + (index << priv->log_rx_info);
+		rx_desc = ring->buf + (index << ring->log_stride);
+
+		/*
+		 * make sure we read the CQE after we read the ownership bit
+		 */
+		rmb();
+
+		/* Drop packet on bad receive or bad checksum */
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+						MLX4_CQE_OPCODE_ERROR)) {
+			mlx4_err(mdev, "CQE completed in error - vendor "
+				  "syndrom:%d syndrom:%d\n",
+				  ((struct mlx4_err_cqe *) cqe)->vendor_err_syndrome,
+				  ((struct mlx4_err_cqe *) cqe)->syndrome);
+			goto next;
+		}
+		if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
+			mlx4_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
+			goto next;
+		}
+
+		/*
+		 * Packet is OK - process it.
+		 */
+		length = be32_to_cpu(cqe->byte_cnt);
+		ring->bytes += length;
+		ring->packets++;
+
+		if (likely(priv->rx_csum)) {
+			if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+			    (cqe->checksum == cpu_to_be16(0xffff))) {
+				priv->port_stats.rx_chksum_good++;
+				/* This packet is eligible for LRO if it is:
+				 * - DIX Ethernet (type interpretation)
+				 * - TCP/IP (v4)
+				 * - without IP options
+				 * - not an IP fragment */
+				if (mlx4_en_can_lro(cqe->status) &&
+				    dev->features & NETIF_F_LRO) {
+
+					nr = mlx4_en_complete_rx_desc(
+						priv, rx_desc,
+						skb_frags, lro_frags,
+						ring->page_alloc, length);
+					if (!nr)
+						goto next;
+
+					if (priv->vlgrp && (cqe->vlan_my_qpn &
+							    cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK))) {
+						lro_vlan_hwaccel_receive_frags(
+						       &ring->lro, lro_frags,
+						       length, length,
+						       priv->vlgrp,
+						       be16_to_cpu(cqe->sl_vid),
+						       NULL, 0);
+					} else
+						lro_receive_frags(&ring->lro,
+								  lro_frags,
+								  length,
+								  length,
+								  NULL, 0);
+
+					goto next;
+				}
+
+				/* LRO not possible, complete processing here */
+				ip_summed = CHECKSUM_UNNECESSARY;
+				INC_PERF_COUNTER(priv->pstats.lro_misses);
+			} else {
+				ip_summed = CHECKSUM_NONE;
+				priv->port_stats.rx_chksum_none++;
+			}
+		} else {
+			ip_summed = CHECKSUM_NONE;
+			priv->port_stats.rx_chksum_none++;
+		}
+
+		skb = mlx4_en_rx_skb(priv, rx_desc, skb_frags,
+				     ring->page_alloc, length);
+		if (!skb) {
+			priv->stats.rx_dropped++;
+			goto next;
+		}
+
+		skb->ip_summed = ip_summed;
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Push it up the stack */
+		if (priv->vlgrp && (be32_to_cpu(cqe->vlan_my_qpn) &
+				    MLX4_CQE_VLAN_PRESENT_MASK)) {
+			vlan_hwaccel_receive_skb(skb, priv->vlgrp,
+						be16_to_cpu(cqe->sl_vid));
+		} else
+			netif_receive_skb(skb);
+
+		dev->last_rx = jiffies;
+
+next:
+		++cq->mcq.cons_index;
+		index = (cq->mcq.cons_index) & ring->size_mask;
+		cqe = &cq->buf[index];
+		if (++polled == budget) {
+			/* We are here because we reached the NAPI budget -
+			 * flush only pending LRO sessions */
+			lro_flush_all(&ring->lro);
+			goto out;
+		}
+	}
+
+	/* If CQ is empty flush all LRO sessions unconditionally */
+	lro_flush_all(&ring->lro);
+
+out:
+	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+	mlx4_cq_set_ci(&cq->mcq);
+	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+	ring->cons = cq->mcq.cons_index;
+	ring->prod += polled; /* Polled descriptors were realocated in place */
+	if (unlikely(!ring->full)) {
+		mlx4_en_copy_desc(priv, ring, ring->cons - polled,
+				  ring->prod - polled, polled);
+		mlx4_en_fill_rx_buf(dev, ring);
+	}
+	mlx4_en_update_rx_prod_db(ring);
+	return polled;
+}
+
+
+void mlx4_en_rx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+
+	if (priv->port_up)
+		netif_rx_schedule(cq->dev, &cq->napi);
+	else
+		mlx4_en_arm_cq(priv, cq);
+}
+
+/* Rx CQ polling - called by NAPI */
+int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int done;
+
+	done = mlx4_en_process_rx_cq(dev, cq, budget);
+
+	/* If we used up all the quota - we're probably not done yet... */
+	if (done == budget)
+		INC_PERF_COUNTER(priv->pstats.napi_quota);
+	else {
+		/* Done for now */
+		netif_rx_complete(dev, napi);
+		mlx4_en_arm_cq(priv, cq);
+	}
+	return done;
+}
+
+
+/* Calculate the last offset position that accomodates a full fragment
+ * (assuming fagment size = stride-align) */
+static int mlx4_en_last_alloc_offset(struct mlx4_en_priv *priv, u16 stride, u16 align)
+{
+	u16 res = MLX4_EN_ALLOC_SIZE % stride;
+	u16 offset = MLX4_EN_ALLOC_SIZE - stride - res + align;
+
+	mlx4_dbg(DRV, priv, "Calculated last offset for stride:%d align:%d "
+			    "res:%d offset:%d\n", stride, align, res, offset);
+	return offset;
+}
+
+
+static int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+	FRAG_SZ2,
+	FRAG_SZ3
+};
+
+void mlx4_en_calc_rx_buf(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN + ETH_LLC_SNAP_SIZE;
+	int buf_size = 0;
+	int i = 0;
+
+	while (buf_size < eff_mtu) {
+		priv->frag_info[i].frag_size =
+			(eff_mtu > buf_size + frag_sizes[i]) ?
+				frag_sizes[i] : eff_mtu - buf_size;
+		priv->frag_info[i].frag_prefix_size = buf_size;
+		if (!i)	{
+			priv->frag_info[i].frag_align = NET_IP_ALIGN;
+			priv->frag_info[i].frag_stride =
+				ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES);
+		} else {
+			priv->frag_info[i].frag_align = 0;
+			priv->frag_info[i].frag_stride =
+				ALIGN(frag_sizes[i], SMP_CACHE_BYTES);
+		}
+		priv->frag_info[i].last_offset = mlx4_en_last_alloc_offset(
+						priv, priv->frag_info[i].frag_stride,
+						priv->frag_info[i].frag_align);
+		buf_size += priv->frag_info[i].frag_size;
+		i++;
+	}
+
+	priv->num_frags = i;
+	priv->rx_skb_size = eff_mtu;
+	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct));
+
+	mlx4_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
+		  "num_frags:%d):\n", eff_mtu, priv->num_frags);
+	for (i = 0; i < priv->num_frags; i++) {
+		mlx4_dbg(DRV, priv, "  frag:%d - size:%d prefix:%d align:%d "
+				"stride:%d last_offset:%d\n", i,
+				priv->frag_info[i].frag_size,
+				priv->frag_info[i].frag_prefix_size,
+				priv->frag_info[i].frag_align,
+				priv->frag_info[i].frag_stride,
+				priv->frag_info[i].last_offset);
+	}
+}
+
+/* RSS related functions */
+
+/* Calculate rss size and map each entry in rss table to rx ring */
+void mlx4_en_set_default_rss_map(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rss_map *rss_map,
+				 int num_entries, int num_rings)
+{
+	int i;
+
+	rss_map->size = roundup_pow_of_two(num_entries);
+	mlx4_dbg(DRV, priv, "Setting default RSS map of %d entires\n",
+		 rss_map->size);
+
+	for (i = 0; i < rss_map->size; i++) {
+		rss_map->map[i] = i % num_rings;
+		mlx4_dbg(DRV, priv, "Entry %d ---> ring %d\n", i, rss_map->map[i]);
+	}
+}
+
+static void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
+{
+    return;
+}
+
+
+static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv,
+				 int qpn, int srqn, int cqn,
+				 enum mlx4_qp_state *state,
+				 struct mlx4_qp *qp)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_qp_context *context;
+	int err = 0;
+
+	context = kmalloc(sizeof *context , GFP_KERNEL);
+	if (!context) {
+		mlx4_err(mdev, "Failed to allocate qp context\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_qp_alloc(mdev->dev, qpn, qp);
+	if (err) {
+		mlx4_err(mdev, "Failed to allocate qp #%d\n", qpn);
+		goto out;
+		return err;
+	}
+	qp->event = mlx4_en_sqp_event;
+
+	memset(context, 0, sizeof *context);
+	mlx4_en_fill_qp_context(priv, 0, 0, 0, 0, qpn, cqn, srqn, context);
+
+	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, context, qp, state);
+	if (err) {
+		mlx4_qp_remove(mdev->dev, qp);
+		mlx4_qp_free(mdev->dev, qp);
+	}
+out:
+	kfree(context);
+	return err;
+}
+
+/* Allocate rx qp's and configure them according to rss map */
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	struct mlx4_qp_context context;
+	struct mlx4_en_rss_context *rss_context;
+	void *ptr;
+	int rss_xor = mdev->profile.rss_xor;
+	u8 rss_mask = mdev->profile.rss_mask;
+	int i, srqn, qpn, cqn;
+	int err = 0;
+	int good_qps = 0;
+
+	mlx4_dbg(DRV, priv, "Configuring rss steering for port %u\n", priv->port);
+	err = mlx4_qp_reserve_range(mdev->dev, rss_map->size,
+				    rss_map->size, &rss_map->base_qpn);
+	if (err) {
+		mlx4_err(mdev, "Failed reserving %d qps for port %u\n",
+			 rss_map->size, priv->port);
+		return err;
+	}
+
+	for (i = 0; i < rss_map->size; i++) {
+		cqn = priv->rx_ring[rss_map->map[i]].cqn;
+		srqn = priv->rx_ring[rss_map->map[i]].srq.srqn;
+		qpn = rss_map->base_qpn + i;
+		err = mlx4_en_config_rss_qp(priv, qpn, srqn, cqn,
+					    &rss_map->state[i],
+					    &rss_map->qps[i]);
+		if (err)
+			goto rss_err;
+
+		++good_qps;
+	}
+
+	/* Configure RSS indirection qp */
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn);
+	if (err) {
+		mlx4_err(mdev, "Failed to reserve range for RSS "
+			       "indirection qp\n");
+		goto rss_err;
+	}
+	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp);
+	if (err) {
+		mlx4_err(mdev, "Failed to allocate RSS indirection QP\n");
+		goto reserve_err;
+	}
+	rss_map->indir_qp.event = mlx4_en_sqp_event;
+	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
+				priv->rx_ring[0].cqn, 0, &context);
+
+	ptr = ((void *) &context) + 0x3c;
+	rss_context = (struct mlx4_en_rss_context *) ptr;
+	rss_context->base_qpn = cpu_to_be32(ilog2(rss_map->size) << 24 |
+					    (rss_map->base_qpn));
+	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn);
+	rss_context->hash_fn = rss_xor & 0x3;
+	rss_context->flags = rss_mask << 2;
+
+	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
+			       &rss_map->indir_qp, &rss_map->indir_state);
+	if (err)
+		goto indir_err;
+
+	return 0;
+
+indir_err:
+	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+reserve_err:
+	mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
+rss_err:
+	for (i = 0; i < good_qps; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, rss_map->size);
+	return err;
+}
+
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	int i;
+
+	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+	mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
+
+	for (i = 0; i < rss_map->size; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, rss_map->size);
+}
+
+
+
+
+
diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c
new file mode 100644
index 00000000000..8592f8fb847
--- /dev/null
+++ b/drivers/net/mlx4/en_tx.c
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <asm/page.h>
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
+
+#include "mlx4_en.h"
+
+enum {
+	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
+};
+
+static int inline_thold __read_mostly = MAX_INLINE;
+
+module_param_named(inline_thold, inline_thold, int, 0444);
+MODULE_PARM_DESC(inline_thold, "treshold for using inline data");
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring *ring, u32 size,
+			   u16 stride)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int tmp;
+	int err;
+
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = stride;
+
+	inline_thold = min(inline_thold, MAX_INLINE);
+
+	spin_lock_init(&ring->comp_lock);
+
+	tmp = size * sizeof(struct mlx4_en_tx_info);
+	ring->tx_info = vmalloc(tmp);
+	if (!ring->tx_info) {
+		mlx4_err(mdev, "Failed allocating tx_info ring\n");
+		return -ENOMEM;
+	}
+	mlx4_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
+		 ring->tx_info, tmp);
+
+	ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+	if (!ring->bounce_buf) {
+		mlx4_err(mdev, "Failed allocating bounce buffer\n");
+		err = -ENOMEM;
+		goto err_tx;
+	}
+	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
+
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
+				 2 * PAGE_SIZE);
+	if (err) {
+		mlx4_err(mdev, "Failed allocating hwq resources\n");
+		goto err_bounce;
+	}
+
+	err = mlx4_en_map_buffer(&ring->wqres.buf);
+	if (err) {
+		mlx4_err(mdev, "Failed to map TX buffer\n");
+		goto err_hwq_res;
+	}
+
+	ring->buf = ring->wqres.buf.direct.buf;
+
+	mlx4_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d "
+		 "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size,
+		 ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map);
+
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn);
+	if (err) {
+		mlx4_err(mdev, "Failed reserving qp for tx ring.\n");
+		goto err_map;
+	}
+
+	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp);
+	if (err) {
+		mlx4_err(mdev, "Failed allocating qp %d\n", ring->qpn);
+		goto err_reserve;
+	}
+
+	return 0;
+
+err_reserve:
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+err_map:
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+err_hwq_res:
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_bounce:
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+err_tx:
+	vfree(ring->tx_info);
+	ring->tx_info = NULL;
+	return err;
+}
+
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	mlx4_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
+
+	mlx4_qp_remove(mdev->dev, &ring->qp);
+	mlx4_qp_free(mdev->dev, &ring->qp);
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+	vfree(ring->tx_info);
+	ring->tx_info = NULL;
+}
+
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq, int srqn)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	ring->cqn = cq;
+	ring->prod = 0;
+	ring->cons = 0xffffffff;
+	ring->last_nr_txbb = 1;
+	ring->poll_cnt = 0;
+	ring->blocked = 0;
+	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
+	memset(ring->buf, 0, ring->buf_size);
+
+	ring->qp_state = MLX4_QP_STATE_RST;
+	ring->doorbell_qpn = swab32(ring->qp.qpn << 8);
+
+	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
+				ring->cqn, srqn, &ring->context);
+
+	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
+			       &ring->qp, &ring->qp_state);
+
+	return err;
+}
+
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
+}
+
+
+static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring,
+				int index, u8 owner)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
+	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
+	struct sk_buff *skb = tx_info->skb;
+	struct skb_frag_struct *frag;
+	void *end = ring->buf + ring->buf_size;
+	int frags = skb_shinfo(skb)->nr_frags;
+	int i;
+	__be32 *ptr = (__be32 *)tx_desc;
+	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
+
+	/* Optimize the common case when there are no wraparounds */
+	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
+		if (tx_info->linear) {
+			pci_unmap_single(mdev->pdev,
+					 (dma_addr_t) be64_to_cpu(data->addr),
+					 be32_to_cpu(data->byte_count),
+					 PCI_DMA_TODEVICE);
+			++data;
+		}
+
+		for (i = 0; i < frags; i++) {
+			frag = &skb_shinfo(skb)->frags[i];
+			pci_unmap_page(mdev->pdev,
+				       (dma_addr_t) be64_to_cpu(data[i].addr),
+				       frag->size, PCI_DMA_TODEVICE);
+		}
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+		}
+
+	} else {
+		if ((void *) data >= end) {
+			data = (struct mlx4_wqe_data_seg *)
+					(ring->buf + ((void *) data - end));
+		}
+
+		if (tx_info->linear) {
+			pci_unmap_single(mdev->pdev,
+					 (dma_addr_t) be64_to_cpu(data->addr),
+					 be32_to_cpu(data->byte_count),
+					 PCI_DMA_TODEVICE);
+			++data;
+		}
+
+		for (i = 0; i < frags; i++) {
+			/* Check for wraparound before unmapping */
+			if ((void *) data >= end)
+				data = (struct mlx4_wqe_data_seg *) ring->buf;
+			frag = &skb_shinfo(skb)->frags[i];
+			pci_unmap_page(mdev->pdev,
+					(dma_addr_t) be64_to_cpu(data->addr),
+					 frag->size, PCI_DMA_TODEVICE);
+		}
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+			if ((void *) ptr >= end) {
+				ptr = ring->buf;
+				stamp ^= cpu_to_be32(0x80000000);
+			}
+		}
+
+	}
+	dev_kfree_skb_any(skb);
+	return tx_info->nr_txbb;
+}
+
+
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int cnt = 0;
+
+	/* Skip last polled descriptor */
+	ring->cons += ring->last_nr_txbb;
+	mlx4_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
+		 ring->cons, ring->prod);
+
+	if ((u32) (ring->prod - ring->cons) > ring->size) {
+		if (netif_msg_tx_err(priv))
+			mlx4_warn(priv->mdev, "Tx consumer passed producer!\n");
+		return 0;
+	}
+
+	while (ring->cons != ring->prod) {
+		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
+						ring->cons & ring->size_mask,
+						!!(ring->cons & ring->size));
+		ring->cons += ring->last_nr_txbb;
+		cnt++;
+	}
+
+	if (cnt)
+		mlx4_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
+
+	return cnt;
+}
+
+void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num)
+{
+	int block = 8 / ring_num;
+	int extra = 8 - (block * ring_num);
+	int num = 0;
+	u16 ring = 1;
+	int prio;
+
+	if (ring_num == 1) {
+		for (prio = 0; prio < 8; prio++)
+			prio_map[prio] = 0;
+		return;
+	}
+
+	for (prio = 0; prio < 8; prio++) {
+		if (extra && (num == block + 1)) {
+			ring++;
+			num = 0;
+			extra--;
+		} else if (!extra && (num == block)) {
+			ring++;
+			num = 0;
+		}
+		prio_map[prio] = ring;
+		mlx4_dbg(DRV, priv, " prio:%d --> ring:%d\n", prio, ring);
+		num++;
+	}
+}
+
+static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_cq *mcq = &cq->mcq;
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	struct mlx4_cqe *cqe = cq->buf;
+	u16 index;
+	u16 new_index;
+	u32 txbbs_skipped = 0;
+	u32 cq_last_sav;
+
+	/* index always points to the first TXBB of the last polled descriptor */
+	index = ring->cons & ring->size_mask;
+	new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
+	if (index == new_index)
+		return;
+
+	if (!priv->port_up)
+		return;
+
+	/*
+	 * We use a two-stage loop:
+	 * - the first samples the HW-updated CQE
+	 * - the second frees TXBBs until the last sample
+	 * This lets us amortize CQE cache misses, while still polling the CQ
+	 * until is quiescent.
+	 */
+	cq_last_sav = mcq->cons_index;
+	do {
+		do {
+			/* Skip over last polled CQE */
+			index = (index + ring->last_nr_txbb) & ring->size_mask;
+			txbbs_skipped += ring->last_nr_txbb;
+
+			/* Poll next CQE */
+			ring->last_nr_txbb = mlx4_en_free_tx_desc(
+						priv, ring, index,
+						!!((ring->cons + txbbs_skipped) &
+						   ring->size));
+			++mcq->cons_index;
+
+		} while (index != new_index);
+
+		new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
+	} while (index != new_index);
+	AVG_PERF_COUNTER(priv->pstats.tx_coal_avg,
+			 (u32) (mcq->cons_index - cq_last_sav));
+
+	/*
+	 * To prevent CQ overflow we first update CQ consumer and only then
+	 * the ring consumer.
+	 */
+	mlx4_cq_set_ci(mcq);
+	wmb();
+	ring->cons += txbbs_skipped;
+
+	/* Wakeup Tx queue if this ring stopped it */
+	if (unlikely(ring->blocked)) {
+		if (((u32) (ring->prod - ring->cons) <=
+		     ring->size - HEADROOM - MAX_DESC_TXBBS) && !cq->armed) {
+
+			/* TODO: support multiqueue netdevs. Currently, we block
+			 * when *any* ring is full. Note that:
+			 * - 2 Tx rings can unblock at the same time and call
+			 *   netif_wake_queue(), which is OK since this
+			 *   operation is idempotent.
+			 * - We might wake the queue just after another ring
+			 *   stopped it. This is no big deal because the next
+			 *   transmission on that ring would stop the queue.
+			 */
+			ring->blocked = 0;
+			netif_wake_queue(dev);
+			priv->port_stats.wake_queue++;
+		}
+	}
+}
+
+void mlx4_en_tx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+
+	spin_lock_irq(&ring->comp_lock);
+	cq->armed = 0;
+	mlx4_en_process_tx_cq(cq->dev, cq);
+	if (ring->blocked)
+		mlx4_en_arm_cq(priv, cq);
+	else
+		mod_timer(&cq->timer, jiffies + 1);
+	spin_unlock_irq(&ring->comp_lock);
+}
+
+
+void mlx4_en_poll_tx_cq(unsigned long data)
+{
+	struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data;
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	u32 inflight;
+
+	INC_PERF_COUNTER(priv->pstats.tx_poll);
+
+	netif_tx_lock(priv->dev);
+	spin_lock_irq(&ring->comp_lock);
+	mlx4_en_process_tx_cq(cq->dev, cq);
+	inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb);
+
+	/* If there are still packets in flight and the timer has not already
+	 * been scheduled by the Tx routine then schedule it here to guarantee
+	 * completion processing of these packets */
+	if (inflight && priv->port_up)
+		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+
+	spin_unlock_irq(&ring->comp_lock);
+	netif_tx_unlock(priv->dev);
+}
+
+static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
+						      struct mlx4_en_tx_ring *ring,
+						      u32 index,
+						      unsigned int desc_size)
+{
+	u32 copy = (ring->size - index) * TXBB_SIZE;
+	int i;
+
+	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + i)) =
+			*((u32 *) (ring->bounce_buf + copy + i));
+	}
+
+	for (i = copy - 4; i >= 4 ; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
+			*((u32 *) (ring->bounce_buf + i));
+	}
+
+	/* Return real descriptor location */
+	return ring->buf + index * TXBB_SIZE;
+}
+
+static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
+{
+	struct mlx4_en_cq *cq = &priv->tx_cq[tx_ind];
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[tx_ind];
+
+	/* If we don't have a pending timer, set one up to catch our recent
+	   post in case the interface becomes idle */
+	if (!timer_pending(&cq->timer))
+		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+
+	/* Poll the CQ every mlx4_en_TX_MODER_POLL packets */
+	if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0)
+		mlx4_en_process_tx_cq(priv->dev, cq);
+}
+
+static void *get_frag_ptr(struct sk_buff *skb)
+{
+	struct skb_frag_struct *frag =  &skb_shinfo(skb)->frags[0];
+	struct page *page = frag->page;
+	void *ptr;
+
+	ptr = page_address(page);
+	if (unlikely(!ptr))
+		return NULL;
+
+	return ptr + frag->page_offset;
+}
+
+static int is_inline(struct sk_buff *skb, void **pfrag)
+{
+	void *ptr;
+
+	if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) {
+		if (skb_shinfo(skb)->nr_frags == 1) {
+			ptr = get_frag_ptr(skb);
+			if (unlikely(!ptr))
+				return 0;
+
+			if (pfrag)
+				*pfrag = ptr;
+
+			return 1;
+		} else if (unlikely(skb_shinfo(skb)->nr_frags))
+			return 0;
+		else
+			return 1;
+	}
+
+	return 0;
+}
+
+static int inline_size(struct sk_buff *skb)
+{
+	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
+	    <= MLX4_INLINE_ALIGN)
+		return ALIGN(skb->len + CTRL_SIZE +
+			     sizeof(struct mlx4_wqe_inline_seg), 16);
+	else
+		return ALIGN(skb->len + CTRL_SIZE + 2 *
+			     sizeof(struct mlx4_wqe_inline_seg), 16);
+}
+
+static int get_real_size(struct sk_buff *skb, struct net_device *dev,
+			 int *lso_header_size)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int real_size;
+
+	if (skb_is_gso(skb)) {
+		*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		real_size = CTRL_SIZE + skb_shinfo(skb)->nr_frags * DS_SIZE +
+			ALIGN(*lso_header_size + 4, DS_SIZE);
+		if (unlikely(*lso_header_size != skb_headlen(skb))) {
+			/* We add a segment for the skb linear buffer only if
+			 * it contains data */
+			if (*lso_header_size < skb_headlen(skb))
+				real_size += DS_SIZE;
+			else {
+				if (netif_msg_tx_err(priv))
+					mlx4_warn(mdev, "Non-linear headers\n");
+				dev_kfree_skb_any(skb);
+				return 0;
+			}
+		}
+		if (unlikely(*lso_header_size > MAX_LSO_HDR_SIZE)) {
+			if (netif_msg_tx_err(priv))
+				mlx4_warn(mdev, "LSO header size too big\n");
+			dev_kfree_skb_any(skb);
+			return 0;
+		}
+	} else {
+		*lso_header_size = 0;
+		if (!is_inline(skb, NULL))
+			real_size = CTRL_SIZE + (skb_shinfo(skb)->nr_frags + 1) * DS_SIZE;
+		else
+			real_size = inline_size(skb);
+	}
+
+	return real_size;
+}
+
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb,
+			     int real_size, u16 *vlan_tag, int tx_ind, void *fragptr)
+{
+	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
+	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
+
+	if (skb->len <= spc) {
+		inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
+		skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
+		if (skb_shinfo(skb)->nr_frags)
+			memcpy(((void *)(inl + 1)) + skb_headlen(skb), fragptr,
+			       skb_shinfo(skb)->frags[0].size);
+
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		if (skb_headlen(skb) <= spc) {
+			skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
+			if (skb_headlen(skb) < spc) {
+				memcpy(((void *)(inl + 1)) + skb_headlen(skb),
+					fragptr, spc - skb_headlen(skb));
+				fragptr +=  spc - skb_headlen(skb);
+			}
+			inl = (void *) (inl + 1) + spc;
+			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
+		} else {
+			skb_copy_from_linear_data(skb, inl + 1, spc);
+			inl = (void *) (inl + 1) + spc;
+			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
+					skb_headlen(skb) - spc);
+			if (skb_shinfo(skb)->nr_frags)
+				memcpy(((void *)(inl + 1)) + skb_headlen(skb) - spc,
+					fragptr, skb_shinfo(skb)->frags[0].size);
+		}
+
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
+	}
+	tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag);
+	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag);
+	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
+}
+
+static int get_vlan_info(struct mlx4_en_priv *priv, struct sk_buff *skb,
+			 u16 *vlan_tag)
+{
+	int tx_ind;
+
+	/* Obtain VLAN information if present */
+	if (priv->vlgrp && vlan_tx_tag_present(skb)) {
+		*vlan_tag = vlan_tx_tag_get(skb);
+		/* Set the Tx ring to use according to vlan priority */
+		tx_ind = priv->tx_prio_map[*vlan_tag >> 13];
+	} else {
+		*vlan_tag = 0;
+		tx_ind = 0;
+	}
+	return tx_ind;
+}
+
+int mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring;
+	struct mlx4_en_cq *cq;
+	struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_wqe_data_seg *data;
+	struct skb_frag_struct *frag;
+	struct mlx4_en_tx_info *tx_info;
+	int tx_ind = 0;
+	int nr_txbb;
+	int desc_size;
+	int real_size;
+	dma_addr_t dma;
+	u32 index;
+	__be32 op_own;
+	u16 vlan_tag;
+	int i;
+	int lso_header_size;
+	void *fragptr;
+
+	if (unlikely(!skb->len)) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+	real_size = get_real_size(skb, dev, &lso_header_size);
+	if (unlikely(!real_size))
+		return NETDEV_TX_OK;
+
+	/* Allign descriptor to TXBB size */
+	desc_size = ALIGN(real_size, TXBB_SIZE);
+	nr_txbb = desc_size / TXBB_SIZE;
+	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+		if (netif_msg_tx_err(priv))
+			mlx4_warn(mdev, "Oversized header or SG list\n");
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	tx_ind = get_vlan_info(priv, skb, &vlan_tag);
+	ring = &priv->tx_ring[tx_ind];
+
+	/* Check available TXBBs And 2K spare for prefetch */
+	if (unlikely(((int)(ring->prod - ring->cons)) >
+		     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
+		/* every full Tx ring stops queue.
+		 * TODO: implement multi-queue support (per-queue stop) */
+		netif_stop_queue(dev);
+		ring->blocked = 1;
+		priv->port_stats.queue_stopped++;
+
+		/* Use interrupts to find out when queue opened */
+		cq = &priv->tx_cq[tx_ind];
+		mlx4_en_arm_cq(priv, cq);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Now that we know what Tx ring to use */
+	if (unlikely(!priv->port_up)) {
+		if (netif_msg_tx_err(priv))
+			mlx4_warn(mdev, "xmit: port down!\n");
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* Track current inflight packets for performance analysis */
+	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+			 (u32) (ring->prod - ring->cons - 1));
+
+	/* Packet is good - grab an index and transmit it */
+	index = ring->prod & ring->size_mask;
+
+	/* See if we have enough space for whole descriptor TXBB for setting
+	 * SW ownership on next descriptor; if not, use a bounce buffer. */
+	if (likely(index + nr_txbb <= ring->size))
+		tx_desc = ring->buf + index * TXBB_SIZE;
+	else
+		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+
+	/* Save skb in tx_info ring */
+	tx_info = &ring->tx_info[index];
+	tx_info->skb = skb;
+	tx_info->nr_txbb = nr_txbb;
+
+	/* Prepare ctrl segement apart opcode+ownership, which depends on
+	 * whether LSO is used */
+	tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
+	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag;
+	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
+	tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+						MLX4_WQE_CTRL_SOLICITED);
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+							 MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		priv->port_stats.tx_chksum_offload++;
+	}
+
+	/* Handle LSO (TSO) packets */
+	if (lso_header_size) {
+		/* Mark opcode as LSO */
+		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+			((ring->prod & ring->size) ?
+				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+		/* Fill in the LSO prefix */
+		tx_desc->lso.mss_hdr_size = cpu_to_be32(
+			skb_shinfo(skb)->gso_size << 16 | lso_header_size);
+
+		/* Copy headers;
+		 * note that we already verified that it is linear */
+		memcpy(tx_desc->lso.header, skb->data, lso_header_size);
+		data = ((void *) &tx_desc->lso +
+			ALIGN(lso_header_size + 4, DS_SIZE));
+
+		priv->port_stats.tso_packets++;
+		i = ((skb->len - lso_header_size) / skb_shinfo(skb)->gso_size) +
+			!!((skb->len - lso_header_size) % skb_shinfo(skb)->gso_size);
+		ring->bytes += skb->len + (i - 1) * lso_header_size;
+		ring->packets += i;
+	} else {
+		/* Normal (Non LSO) packet */
+		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+			((ring->prod & ring->size) ?
+			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+		data = &tx_desc->data;
+		ring->bytes += max(skb->len, (unsigned int) ETH_ZLEN);
+		ring->packets++;
+
+	}
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
+
+
+	/* valid only for none inline segments */
+	tx_info->data_offset = (void *) data - (void *) tx_desc;
+
+	tx_info->linear = (lso_header_size < skb_headlen(skb) && !is_inline(skb, NULL)) ? 1 : 0;
+	data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1;
+
+	if (!is_inline(skb, &fragptr)) {
+		/* Map fragments */
+		for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
+			frag = &skb_shinfo(skb)->frags[i];
+			dma = pci_map_page(mdev->dev->pdev, frag->page, frag->page_offset,
+					   frag->size, PCI_DMA_TODEVICE);
+			data->addr = cpu_to_be64(dma);
+			data->lkey = cpu_to_be32(mdev->mr.key);
+			wmb();
+			data->byte_count = cpu_to_be32(frag->size);
+			--data;
+		}
+
+		/* Map linear part */
+		if (tx_info->linear) {
+			dma = pci_map_single(mdev->dev->pdev, skb->data + lso_header_size,
+					     skb_headlen(skb) - lso_header_size, PCI_DMA_TODEVICE);
+			data->addr = cpu_to_be64(dma);
+			data->lkey = cpu_to_be32(mdev->mr.key);
+			wmb();
+			data->byte_count = cpu_to_be32(skb_headlen(skb) - lso_header_size);
+		}
+	} else
+		build_inline_wqe(tx_desc, skb, real_size, &vlan_tag, tx_ind, fragptr);
+
+	ring->prod += nr_txbb;
+
+	/* If we used a bounce buffer then copy descriptor back into place */
+	if (tx_desc == (struct mlx4_en_tx_desc *) ring->bounce_buf)
+		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+	/* Run destructor before passing skb to HW */
+	if (likely(!skb_shared(skb)))
+		skb_orphan(skb);
+
+	/* Ensure new descirptor hits memory
+	 * before setting ownership of this descriptor to HW */
+	wmb();
+	tx_desc->ctrl.owner_opcode = op_own;
+
+	/* Ring doorbell! */
+	wmb();
+	writel(ring->doorbell_qpn, mdev->uar_map + MLX4_SEND_DOORBELL);
+	dev->trans_start = jiffies;
+
+	/* Poll CQ here */
+	mlx4_en_xmit_poll(priv, tx_ind);
+
+	return 0;
+}
+
diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h
new file mode 100644
index 00000000000..11fb17c6e97
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_en.h
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_H_
+#define _MLX4_EN_H_
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/inet_lro.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/srq.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "en_port.h"
+
+#define DRV_NAME	"mlx4_en"
+#define DRV_VERSION	"1.4.0"
+#define DRV_RELDATE	"Sep 2008"
+
+
+#define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
+
+#define mlx4_dbg(mlevel, priv, format, arg...)	\
+	if (NETIF_MSG_##mlevel & priv->msg_enable) \
+	printk(KERN_DEBUG "%s %s: " format , DRV_NAME ,\
+		(&priv->mdev->pdev->dev)->bus_id , ## arg)
+
+#define mlx4_err(mdev, format, arg...) \
+	printk(KERN_ERR "%s %s: " format , DRV_NAME ,\
+		(&mdev->pdev->dev)->bus_id , ## arg)
+#define mlx4_info(mdev, format, arg...) \
+	printk(KERN_INFO "%s %s: " format , DRV_NAME ,\
+		(&mdev->pdev->dev)->bus_id , ## arg)
+#define mlx4_warn(mdev, format, arg...) \
+	printk(KERN_WARNING "%s %s: " format , DRV_NAME ,\
+		(&mdev->pdev->dev)->bus_id , ## arg)
+
+/*
+ * Device constants
+ */
+
+
+#define MLX4_EN_PAGE_SHIFT	12
+#define MLX4_EN_PAGE_SIZE	(1 << MLX4_EN_PAGE_SHIFT)
+#define MAX_TX_RINGS		16
+#define MAX_RX_RINGS		16
+#define MAX_RSS_MAP_SIZE	64
+#define RSS_FACTOR		2
+#define TXBB_SIZE		64
+#define HEADROOM		(2048 / TXBB_SIZE + 1)
+#define MAX_LSO_HDR_SIZE	92
+#define STAMP_STRIDE		64
+#define STAMP_DWORDS		(STAMP_STRIDE / 4)
+#define STAMP_SHIFT		31
+#define STAMP_VAL		0x7fffffff
+#define STATS_DELAY		(HZ / 4)
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
+#define MAX_DESC_SIZE		512
+#define MAX_DESC_TXBBS		(MAX_DESC_SIZE / TXBB_SIZE)
+
+/*
+ * OS related constants and tunables
+ */
+
+#define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
+
+#define MLX4_EN_ALLOC_ORDER	2
+#define MLX4_EN_ALLOC_SIZE	(PAGE_SIZE << MLX4_EN_ALLOC_ORDER)
+
+#define MLX4_EN_MAX_LRO_DESCRIPTORS	32
+
+/* Receive fragment sizes; we use at most 4 fragments (for 9600 byte MTU
+ * and 4K allocations) */
+enum {
+	FRAG_SZ0 = 512 - NET_IP_ALIGN,
+	FRAG_SZ1 = 1024,
+	FRAG_SZ2 = 4096,
+	FRAG_SZ3 = MLX4_EN_ALLOC_SIZE
+};
+#define MLX4_EN_MAX_RX_FRAGS	4
+
+/* Minimum ring size for our page-allocation sceme to work */
+#define MLX4_EN_MIN_RX_SIZE	(MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES)
+#define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
+
+#define MLX4_EN_TX_RING_NUM		9
+#define MLX4_EN_DEF_TX_RING_SIZE	1024
+#define MLX4_EN_DEF_RX_RING_SIZE  	1024
+
+/* Target number of bytes to coalesce with interrupt moderation */
+#define MLX4_EN_RX_COAL_TARGET	0x20000
+#define MLX4_EN_RX_COAL_TIME	0x10
+
+#define MLX4_EN_TX_COAL_PKTS	5
+#define MLX4_EN_TX_COAL_TIME	0x80
+
+#define MLX4_EN_RX_RATE_LOW		400000
+#define MLX4_EN_RX_COAL_TIME_LOW	0
+#define MLX4_EN_RX_RATE_HIGH		450000
+#define MLX4_EN_RX_COAL_TIME_HIGH	128
+#define MLX4_EN_RX_SIZE_THRESH		1024
+#define MLX4_EN_RX_RATE_THRESH		(1000000 / MLX4_EN_RX_COAL_TIME_HIGH)
+#define MLX4_EN_SAMPLE_INTERVAL		0
+
+#define MLX4_EN_AUTO_CONF	0xffff
+
+#define MLX4_EN_DEF_RX_PAUSE	1
+#define MLX4_EN_DEF_TX_PAUSE	1
+
+/* Interval between sucessive polls in the Tx routine when polling is used
+   instead of interrupts (in per-core Tx rings) - should be power of 2 */
+#define MLX4_EN_TX_POLL_MODER	16
+#define MLX4_EN_TX_POLL_TIMEOUT	(HZ / 4)
+
+#define ETH_LLC_SNAP_SIZE	8
+
+#define SMALL_PACKET_SIZE      (256 - NET_IP_ALIGN)
+#define HEADER_COPY_SIZE       (128 - NET_IP_ALIGN)
+
+#define MLX4_EN_MIN_MTU		46
+#define ETH_BCAST		0xffffffffffffULL
+
+#ifdef MLX4_EN_PERF_STAT
+/* Number of samples to 'average' */
+#define AVG_SIZE			128
+#define AVG_FACTOR			1024
+#define NUM_PERF_STATS			NUM_PERF_COUNTERS
+
+#define INC_PERF_COUNTER(cnt)		(++(cnt))
+#define ADD_PERF_COUNTER(cnt, add)	((cnt) += (add))
+#define AVG_PERF_COUNTER(cnt, sample) \
+	((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE)
+#define GET_PERF_COUNTER(cnt)		(cnt)
+#define GET_AVG_PERF_COUNTER(cnt)	((cnt) / AVG_FACTOR)
+
+#else
+
+#define NUM_PERF_STATS			0
+#define INC_PERF_COUNTER(cnt)		do {} while (0)
+#define ADD_PERF_COUNTER(cnt, add)	do {} while (0)
+#define AVG_PERF_COUNTER(cnt, sample)	do {} while (0)
+#define GET_PERF_COUNTER(cnt)		(0)
+#define GET_AVG_PERF_COUNTER(cnt)	(0)
+#endif /* MLX4_EN_PERF_STAT */
+
+/*
+ * Configurables
+ */
+
+enum cq_type {
+	RX = 0,
+	TX = 1,
+};
+
+
+/*
+ * Useful macros
+ */
+#define ROUNDUP_LOG2(x)		ilog2(roundup_pow_of_two(x))
+#define XNOR(x, y)		(!(x) == !(y))
+#define ILLEGAL_MAC(addr)	(addr == 0xffffffffffffULL || addr == 0x0)
+
+
+struct mlx4_en_tx_info {
+	struct sk_buff *skb;
+	u32 nr_txbb;
+	u8 linear;
+	u8 data_offset;
+};
+
+
+#define MLX4_EN_BIT_DESC_OWN	0x80000000
+#define CTRL_SIZE	sizeof(struct mlx4_wqe_ctrl_seg)
+#define MLX4_EN_MEMTYPE_PAD	0x100
+#define DS_SIZE		sizeof(struct mlx4_wqe_data_seg)
+
+
+struct mlx4_en_tx_desc {
+	struct mlx4_wqe_ctrl_seg ctrl;
+	union {
+		struct mlx4_wqe_data_seg data; /* at least one data segment */
+		struct mlx4_wqe_lso_seg lso;
+		struct mlx4_wqe_inline_seg inl;
+	};
+};
+
+#define MLX4_EN_USE_SRQ		0x01000000
+
+struct mlx4_en_rx_alloc {
+	struct page *page;
+	u16 offset;
+};
+
+struct mlx4_en_tx_ring {
+	struct mlx4_hwq_resources wqres;
+	u32 size ; /* number of TXBBs */
+	u32 size_mask;
+	u16 stride;
+	u16 cqn;	/* index of port CQ associated with this ring */
+	u32 prod;
+	u32 cons;
+	u32 buf_size;
+	u32 doorbell_qpn;
+	void *buf;
+	u16 poll_cnt;
+	int blocked;
+	struct mlx4_en_tx_info *tx_info;
+	u8 *bounce_buf;
+	u32 last_nr_txbb;
+	struct mlx4_qp qp;
+	struct mlx4_qp_context context;
+	int qpn;
+	enum mlx4_qp_state qp_state;
+	struct mlx4_srq dummy;
+	unsigned long bytes;
+	unsigned long packets;
+	spinlock_t comp_lock;
+};
+
+struct mlx4_en_rx_desc {
+	struct mlx4_wqe_srq_next_seg next;
+	/* actual number of entries depends on rx ring stride */
+	struct mlx4_wqe_data_seg data[0];
+};
+
+struct mlx4_en_rx_ring {
+	struct mlx4_srq srq;
+	struct mlx4_hwq_resources wqres;
+	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
+	struct net_lro_mgr lro;
+	u32 size ;	/* number of Rx descs*/
+	u32 actual_size;
+	u32 size_mask;
+	u16 stride;
+	u16 log_stride;
+	u16 cqn;	/* index of port CQ associated with this ring */
+	u32 prod;
+	u32 cons;
+	u32 buf_size;
+	int need_refill;
+	int full;
+	void *buf;
+	void *rx_info;
+	unsigned long bytes;
+	unsigned long packets;
+};
+
+
+static inline int mlx4_en_can_lro(__be16 status)
+{
+	return (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4	|
+				     MLX4_CQE_STATUS_IPV4F	|
+				     MLX4_CQE_STATUS_IPV6	|
+				     MLX4_CQE_STATUS_IPV4OPT	|
+				     MLX4_CQE_STATUS_TCP	|
+				     MLX4_CQE_STATUS_UDP	|
+				     MLX4_CQE_STATUS_IPOK)) ==
+		cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+			    MLX4_CQE_STATUS_IPOK |
+			    MLX4_CQE_STATUS_TCP);
+}
+
+struct mlx4_en_cq {
+	struct mlx4_cq          mcq;
+	struct mlx4_hwq_resources wqres;
+	int                     ring;
+	spinlock_t              lock;
+	struct net_device      *dev;
+	struct napi_struct	napi;
+	/* Per-core Tx cq processing support */
+	struct timer_list timer;
+	int size;
+	int buf_size;
+	unsigned vector;
+	enum cq_type is_tx;
+	u16 moder_time;
+	u16 moder_cnt;
+	int armed;
+	struct mlx4_cqe *buf;
+#define MLX4_EN_OPCODE_ERROR	0x1e
+};
+
+struct mlx4_en_port_profile {
+	u32 flags;
+	u32 tx_ring_num;
+	u32 rx_ring_num;
+	u32 tx_ring_size;
+	u32 rx_ring_size;
+};
+
+struct mlx4_en_profile {
+	int rss_xor;
+	int num_lro;
+	u8 rss_mask;
+	u32 active_ports;
+	u32 small_pkt_int;
+	int rx_moder_cnt;
+	int rx_moder_time;
+	int auto_moder;
+	u8 rx_pause;
+	u8 rx_ppp;
+	u8 tx_pause;
+	u8 tx_ppp;
+	u8 no_reset;
+	struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_en_dev {
+	struct mlx4_dev         *dev;
+	struct pci_dev		*pdev;
+	struct mutex		state_lock;
+	struct net_device       *pndev[MLX4_MAX_PORTS + 1];
+	u32                     port_cnt;
+	bool			device_up;
+	struct mlx4_en_profile  profile;
+	u32			LSO_support;
+	struct workqueue_struct *workqueue;
+	struct device           *dma_device;
+	void __iomem            *uar_map;
+	struct mlx4_uar         priv_uar;
+	struct mlx4_mr		mr;
+	u32                     priv_pdn;
+	spinlock_t              uar_lock;
+};
+
+
+struct mlx4_en_rss_map {
+	int size;
+	int base_qpn;
+	u16 map[MAX_RSS_MAP_SIZE];
+	struct mlx4_qp qps[MAX_RSS_MAP_SIZE];
+	enum mlx4_qp_state state[MAX_RSS_MAP_SIZE];
+	struct mlx4_qp indir_qp;
+	enum mlx4_qp_state indir_state;
+};
+
+struct mlx4_en_rss_context {
+	__be32 base_qpn;
+	__be32 default_qpn;
+	u16 reserved;
+	u8 hash_fn;
+	u8 flags;
+	__be32 rss_key[10];
+};
+
+struct mlx4_en_pkt_stats {
+	unsigned long broadcast;
+	unsigned long rx_prio[8];
+	unsigned long tx_prio[8];
+#define NUM_PKT_STATS		17
+};
+
+struct mlx4_en_port_stats {
+	unsigned long lro_aggregated;
+	unsigned long lro_flushed;
+	unsigned long lro_no_desc;
+	unsigned long tso_packets;
+	unsigned long queue_stopped;
+	unsigned long wake_queue;
+	unsigned long tx_timeout;
+	unsigned long rx_alloc_failed;
+	unsigned long rx_chksum_good;
+	unsigned long rx_chksum_none;
+	unsigned long tx_chksum_offload;
+#define NUM_PORT_STATS		11
+};
+
+struct mlx4_en_perf_stats {
+	u32 tx_poll;
+	u64 tx_pktsz_avg;
+	u32 inflight_avg;
+	u16 tx_coal_avg;
+	u16 rx_coal_avg;
+	u32 napi_quota;
+#define NUM_PERF_COUNTERS		6
+};
+
+struct mlx4_en_frag_info {
+	u16 frag_size;
+	u16 frag_prefix_size;
+	u16 frag_stride;
+	u16 frag_align;
+	u16 last_offset;
+
+};
+
+struct mlx4_en_priv {
+	struct mlx4_en_dev *mdev;
+	struct mlx4_en_port_profile *prof;
+	struct net_device *dev;
+	struct vlan_group *vlgrp;
+	struct net_device_stats stats;
+	struct net_device_stats ret_stats;
+	spinlock_t stats_lock;
+
+	unsigned long last_moder_packets;
+	unsigned long last_moder_tx_packets;
+	unsigned long last_moder_bytes;
+	unsigned long last_moder_jiffies;
+	int last_moder_time;
+	u16 rx_usecs;
+	u16 rx_frames;
+	u16 tx_usecs;
+	u16 tx_frames;
+	u32 pkt_rate_low;
+	u16 rx_usecs_low;
+	u32 pkt_rate_high;
+	u16 rx_usecs_high;
+	u16 sample_interval;
+	u16 adaptive_rx_coal;
+	u32 msg_enable;
+
+	struct mlx4_hwq_resources res;
+	int link_state;
+	int last_link_state;
+	bool port_up;
+	int port;
+	int registered;
+	int allocated;
+	int stride;
+	int rx_csum;
+	u64 mac;
+	int mac_index;
+	unsigned max_mtu;
+	int base_qpn;
+
+	struct mlx4_en_rss_map rss_map;
+	u16 tx_prio_map[8];
+	u32 flags;
+#define MLX4_EN_FLAG_PROMISC	0x1
+	u32 tx_ring_num;
+	u32 rx_ring_num;
+	u32 rx_skb_size;
+	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+	u16 num_frags;
+	u16 log_rx_info;
+
+	struct mlx4_en_tx_ring tx_ring[MAX_TX_RINGS];
+	struct mlx4_en_rx_ring rx_ring[MAX_RX_RINGS];
+	struct mlx4_en_cq tx_cq[MAX_TX_RINGS];
+	struct mlx4_en_cq rx_cq[MAX_RX_RINGS];
+	struct work_struct mcast_task;
+	struct work_struct mac_task;
+	struct delayed_work refill_task;
+	struct work_struct watchdog_task;
+	struct work_struct linkstate_task;
+	struct delayed_work stats_task;
+	struct mlx4_en_perf_stats pstats;
+	struct mlx4_en_pkt_stats pkstats;
+	struct mlx4_en_port_stats port_stats;
+	struct dev_mc_list *mc_list;
+	struct mlx4_en_stat_out_mbox hw_stats;
+};
+
+
+void mlx4_en_destroy_netdev(struct net_device *dev);
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof);
+
+int mlx4_en_get_profile(struct mlx4_en_dev *mdev);
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+		      int entries, int ring, enum cq_type mode);
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+
+void mlx4_en_poll_tx_cq(unsigned long data);
+void mlx4_en_tx_irq(struct mlx4_cq *mcq);
+int mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring,
+			   u32 size, u16 stride);
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring);
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq, int srqn);
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring);
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring *ring,
+			   u32 size, u16 stride);
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring *ring);
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv);
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring);
+int mlx4_en_process_rx_cq(struct net_device *dev,
+			  struct mlx4_en_cq *cq,
+			  int budget);
+int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+			     int is_tx, int rss, int qpn, int cqn, int srqn,
+			     struct mlx4_qp_context *context);
+int mlx4_en_map_buffer(struct mlx4_buf *buf);
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
+
+void mlx4_en_calc_rx_buf(struct net_device *dev);
+void mlx4_en_set_default_rss_map(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rss_map *rss_map,
+				 int num_entries, int num_rings);
+void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num);
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
+void mlx4_en_rx_refill(struct work_struct *work);
+void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, struct vlan_group *grp);
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc);
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
+
+/*
+ * Globals
+ */
+extern const struct ethtool_ops mlx4_en_ethtool_ops;
+#endif
-- 
cgit v1.2.3


From 57893d1cff4606915c13a4610d4e2d6048633f8e Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Wed, 22 Oct 2008 15:48:03 -0700
Subject: mlx4_core: Add Ethernet PCI device IDs

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/net/mlx4/main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index c1d447873bf..468921b8f4b 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -1195,6 +1195,8 @@ static struct pci_device_id mlx4_pci_table[] = {
 	{ PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
 	{ PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
 	{ PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
 	{ 0, }
 };
 
-- 
cgit v1.2.3


From 70c9c0db549245a49cabf42d5a74688077254d46 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Wed, 22 Oct 2008 15:49:29 -0700
Subject: IPoIB: Clean up ethtool support

Add a get_rx_csum method.  Remove the driver's own get_tso method, as
the ethtool kernel code uses the default one if nothing is provided.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 66af5c1a76e..e9795f60e5d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -42,6 +42,13 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
 	strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1);
 }
 
+static u32 ipoib_get_rx_csum(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return test_bit(IPOIB_FLAG_CSUM, &priv->flags) &&
+		!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
 static int ipoib_get_coalesce(struct net_device *dev,
 			      struct ethtool_coalesce *coal)
 {
@@ -129,7 +136,7 @@ static void ipoib_get_ethtool_stats(struct net_device *dev,
 
 static const struct ethtool_ops ipoib_ethtool_ops = {
 	.get_drvinfo		= ipoib_get_drvinfo,
-	.get_tso		= ethtool_op_get_tso,
+	.get_rx_csum		= ipoib_get_rx_csum,
 	.get_coalesce		= ipoib_get_coalesce,
 	.set_coalesce		= ipoib_set_coalesce,
 	.get_flags		= ethtool_op_get_flags,
-- 
cgit v1.2.3


From 83bb63f62bda28be88b21216fbb59838a10f2348 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@voltaire.com>
Date: Wed, 22 Oct 2008 15:49:49 -0700
Subject: IPoIB: Set netdev offload features properly for child (VLAN)
 interfaces

Child devices were created without any offload features set, fix this by
moving the code that computes the features into generic function which is
now called through non-child and child device creation.

Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>

-- v1 has a bug where the 'result' flag in ipoib_vlan_add may be used uninitialized
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h      |  1 +
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 67 ++++++++++++++++++-------------
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c |  4 ++
 3 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 68ba5c3482e..e0c7dfabf2b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -507,6 +507,7 @@ int ipoib_pkey_dev_delay_open(struct net_device *dev);
 void ipoib_drain_cq(struct net_device *dev);
 
 void ipoib_set_ethtool_ops(struct net_device *dev);
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca);
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index c0ee514396d..fddded7900d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1173,11 +1173,48 @@ int ipoib_add_pkey_attr(struct net_device *dev)
 	return device_create_file(&dev->dev, &dev_attr_pkey);
 }
 
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
+{
+	struct ib_device_attr *device_attr;
+	int result = -ENOMEM;
+
+	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
+	if (!device_attr) {
+		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
+		       hca->name, sizeof *device_attr);
+		return result;
+	}
+
+	result = ib_query_device(hca, device_attr);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
+		       hca->name, result);
+		kfree(device_attr);
+		return result;
+	}
+	priv->hca_caps = device_attr->device_cap_flags;
+
+	kfree(device_attr);
+
+	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
+		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
+		priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
+	}
+
+	if (lro)
+		priv->dev->features |= NETIF_F_LRO;
+
+	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
+		priv->dev->features |= NETIF_F_TSO;
+
+	return 0;
+}
+
+
 static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
-	struct ib_device_attr *device_attr;
 	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
@@ -1206,31 +1243,8 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto device_init_failed;
 	}
 
-	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
-	if (!device_attr) {
-		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
-		       hca->name, sizeof *device_attr);
+	if (ipoib_set_dev_features(priv, hca))
 		goto device_init_failed;
-	}
-
-	result = ib_query_device(hca, device_attr);
-	if (result) {
-		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
-		       hca->name, result);
-		kfree(device_attr);
-		goto device_init_failed;
-	}
-	priv->hca_caps = device_attr->device_cap_flags;
-
-	kfree(device_attr);
-
-	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
-		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
-		priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
-	}
-
-	if (lro)
-		priv->dev->features |= NETIF_F_LRO;
 
 	/*
 	 * Set the full membership bit, so that we join the right
@@ -1266,9 +1280,6 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto event_failed;
 	}
 
-	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
-		priv->dev->features |= NETIF_F_TSO;
-
 	result = register_netdev(priv->dev);
 	if (result) {
 		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index b08eb56196d..2cf1a408871 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -93,6 +93,10 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
 
+	result = ipoib_set_dev_features(priv, ppriv->ca);
+	if (result)
+		goto device_init_failed;
+
 	priv->pkey = pkey;
 
 	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
-- 
cgit v1.2.3


From 19f4282149147b4a3e8c670373dc73ddd5d5facc Mon Sep 17 00:00:00 2001
From: Stefan Roscher <ossrosch@linux.vnet.ibm.com>
Date: Wed, 22 Oct 2008 15:52:31 -0700
Subject: IB/ehca: Fix reported max number of QPs and CQs in systems with >1
 adapter

Because ehca adapters can differ in the maximum number of QPs and CQs
we have to save the maximum number of these ressources per adapter and
not globally per ehca driver. This fix introduces 2 new members to the
shca structure to store the maximum value for QPs and CQs per adapter.

The module parameters are now used as initial values for those
variables.  If a user selects an invalid number of CQs or QPs we don't
print an error any longer, instead we will inform the user with a
warning and set the values to the respective maximum supported by the
HW.

Signed-off-by: Stefan Roscher <stefan.roscher@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/ehca/ehca_classes.h |  2 ++
 drivers/infiniband/hw/ehca/ehca_cq.c      |  4 ++--
 drivers/infiniband/hw/ehca/ehca_main.c    | 35 ++++++++++++++++++-------------
 drivers/infiniband/hw/ehca/ehca_qp.c      |  4 ++--
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 5d7b7855afb..4df887af66a 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -128,6 +128,8 @@ struct ehca_shca {
 	/* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
 	u32 hca_cap_mr_pgsize;
 	int max_mtu;
+	int max_num_qps;
+	int max_num_cqs;
 	atomic_t num_cqs;
 	atomic_t num_qps;
 };
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
index 33647a95eb9..2f4c28a3027 100644
--- a/drivers/infiniband/hw/ehca/ehca_cq.c
+++ b/drivers/infiniband/hw/ehca/ehca_cq.c
@@ -132,9 +132,9 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 	if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
 		return ERR_PTR(-EINVAL);
 
-	if (!atomic_add_unless(&shca->num_cqs, 1, ehca_max_cq)) {
+	if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
 		ehca_err(device, "Unable to create CQ, max number of %i "
-			"CQs reached.", ehca_max_cq);
+			"CQs reached.", shca->max_num_cqs);
 		ehca_err(device, "To increase the maximum number of CQs "
 			"use the number_of_cqs module parameter.\n");
 		return ERR_PTR(-ENOSPC);
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 598844d2edc..086959a0cbe 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -366,22 +366,23 @@ static int ehca_sense_attributes(struct ehca_shca *shca)
 			shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
 
 	/* Set maximum number of CQs and QPs to calculate EQ size */
-	if (ehca_max_qp == -1)
-		ehca_max_qp = min_t(int, rblock->max_qp, EHCA_MAX_NUM_QUEUES);
-	else if (ehca_max_qp < 1 || ehca_max_qp > rblock->max_qp) {
-		ehca_gen_err("Requested number of QPs is out of range (1 - %i) "
-			"specified by HW", rblock->max_qp);
-		ret = -EINVAL;
-		goto sense_attributes1;
+	if (shca->max_num_qps == -1)
+		shca->max_num_qps = min_t(int, rblock->max_qp,
+					  EHCA_MAX_NUM_QUEUES);
+	else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
+		ehca_gen_warn("The requested number of QPs is out of range "
+			      "(1 - %i) specified by HW. Value is set to %i",
+			      rblock->max_qp, rblock->max_qp);
+		shca->max_num_qps = rblock->max_qp;
 	}
 
-	if (ehca_max_cq == -1)
-		ehca_max_cq = min_t(int, rblock->max_cq, EHCA_MAX_NUM_QUEUES);
-	else if (ehca_max_cq < 1 || ehca_max_cq > rblock->max_cq) {
-		ehca_gen_err("Requested number of CQs is out of range (1 - %i) "
-			"specified by HW", rblock->max_cq);
-		ret = -EINVAL;
-		goto sense_attributes1;
+	if (shca->max_num_cqs == -1)
+		shca->max_num_cqs = min_t(int, rblock->max_cq,
+					  EHCA_MAX_NUM_QUEUES);
+	else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
+		ehca_gen_warn("The requested number of CQs is out of range "
+			      "(1 - %i) specified by HW. Value is set to %i",
+			      rblock->max_cq, rblock->max_cq);
 	}
 
 	/* query max MTU from first port -- it's the same for all ports */
@@ -733,9 +734,13 @@ static int __devinit ehca_probe(struct of_device *dev,
 		ehca_gen_err("Cannot allocate shca memory.");
 		return -ENOMEM;
 	}
+
 	mutex_init(&shca->modify_mutex);
 	atomic_set(&shca->num_cqs, 0);
 	atomic_set(&shca->num_qps, 0);
+	shca->max_num_qps = ehca_max_qp;
+	shca->max_num_cqs = ehca_max_cq;
+
 	for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
 		spin_lock_init(&shca->sport[i].mod_sqp_lock);
 
@@ -755,7 +760,7 @@ static int __devinit ehca_probe(struct of_device *dev,
 		goto probe1;
 	}
 
-	eq_size = 2 * ehca_max_cq + 4 * ehca_max_qp;
+	eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
 	/* create event queues */
 	ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
 	if (ret) {
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 40b578d601c..4d54b9f6456 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -465,9 +465,9 @@ static struct ehca_qp *internal_create_qp(
 	u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
 	unsigned long flags;
 
-	if (!atomic_add_unless(&shca->num_qps, 1, ehca_max_qp)) {
+	if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
 		ehca_err(pd->device, "Unable to create QP, max number of %i "
-			 "QPs reached.", ehca_max_qp);
+			 "QPs reached.", shca->max_num_qps);
 		ehca_err(pd->device, "To increase the maximum number of QPs "
 			 "use the number_of_qps module parameter.\n");
 		return ERR_PTR(-ENOSPC);
-- 
cgit v1.2.3


From 263c24a2bbbaca75805ed231e8346d86410af9d0 Mon Sep 17 00:00:00 2001
From: Stefan Roscher <ossrosch@linux.vnet.ibm.com>
Date: Wed, 22 Oct 2008 15:54:38 -0700
Subject: IB/ehca: Reject dynamic memory add/remove when ehca adapter is
 present

Since the ehca device driver does not support dynamic memory add and
remove operations, the driver must explicitly reject such requests in
order to prevent unpredictable behaviors related to existing memory
regions that cover all of memory being used by InfiniBand protocols in
the kernel.

The solution (for now at least) is to add a memory notifier to the
ehca device driver and if a request for dynamic memory add or remove
comes in, ehca will always reject it.  The user can add or remove
memory by hot-removing the ehca adapter, performing the memory
operation, and then hot-adding the ehca adapter back.

Signed-off-by: Stefan Roscher <stefan.roscher@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/ehca/ehca_main.c | 48 ++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 086959a0cbe..bb02a86aa52 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -44,6 +44,8 @@
 #include <linux/slab.h>
 #endif
 
+#include <linux/notifier.h>
+#include <linux/memory.h>
 #include "ehca_classes.h"
 #include "ehca_iverbs.h"
 #include "ehca_mrmw.h"
@@ -969,6 +971,41 @@ void ehca_poll_eqs(unsigned long data)
 	spin_unlock(&shca_list_lock);
 }
 
+static int ehca_mem_notifier(struct notifier_block *nb,
+			     unsigned long action, void *data)
+{
+	static unsigned long ehca_dmem_warn_time;
+
+	switch (action) {
+	case MEM_CANCEL_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		return NOTIFY_OK;
+	case MEM_GOING_ONLINE:
+	case MEM_GOING_OFFLINE:
+		/* only ok if no hca is attached to the lpar */
+		spin_lock(&shca_list_lock);
+		if (list_empty(&shca_list)) {
+			spin_unlock(&shca_list_lock);
+			return NOTIFY_OK;
+		} else {
+			spin_unlock(&shca_list_lock);
+			if (printk_timed_ratelimit(&ehca_dmem_warn_time,
+						   30 * 1000))
+				ehca_gen_err("DMEM operations are not allowed"
+					     "as long as an ehca adapter is"
+					     "attached to the LPAR");
+			return NOTIFY_BAD;
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ehca_mem_nb = {
+	.notifier_call = ehca_mem_notifier,
+};
+
 static int __init ehca_module_init(void)
 {
 	int ret;
@@ -996,6 +1033,12 @@ static int __init ehca_module_init(void)
 		goto module_init2;
 	}
 
+	ret = register_memory_notifier(&ehca_mem_nb);
+	if (ret) {
+		ehca_gen_err("Failed registering memory add/remove notifier");
+		goto module_init3;
+	}
+
 	if (ehca_poll_all_eqs != 1) {
 		ehca_gen_err("WARNING!!!");
 		ehca_gen_err("It is possible to lose interrupts.");
@@ -1008,6 +1051,9 @@ static int __init ehca_module_init(void)
 
 	return 0;
 
+module_init3:
+	ibmebus_unregister_driver(&ehca_driver);
+
 module_init2:
 	ehca_destroy_slab_caches();
 
@@ -1023,6 +1069,8 @@ static void __exit ehca_module_exit(void)
 
 	ibmebus_unregister_driver(&ehca_driver);
 
+	unregister_memory_notifier(&ehca_mem_nb);
+
 	ehca_destroy_slab_caches();
 
 	ehca_destroy_comp_pool();
-- 
cgit v1.2.3