aboutsummaryrefslogtreecommitdiff
path: root/drivers/infiniband/hw/mlx4
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx4')
-rw-r--r--drivers/infiniband/hw/mlx4/Makefile2
-rw-r--r--drivers/infiniband/hw/mlx4/alias_GUID.c688
-rw-r--r--drivers/infiniband/hw/mlx4/cm.c437
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c31
-rw-r--r--drivers/infiniband/hw/mlx4/mad.c1573
-rw-r--r--drivers/infiniband/hw/mlx4/main.c273
-rw-r--r--drivers/infiniband/hw/mlx4/mcg.c1254
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h341
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c656
-rw-r--r--drivers/infiniband/hw/mlx4/sysfs.c794
10 files changed, 5908 insertions, 141 deletions
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
index 70f09c7826da..f4213b3a8fe1 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o
-mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
+mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
new file mode 100644
index 000000000000..0fcd5cd6f3ee
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+ /***********************************************************/
+/*This file support the handling of the Alias GUID feature. */
+/***********************************************************/
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_pack.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/delay.h>
+#include "mlx4_ib.h"
+
+/*
+The driver keeps the current state of all guids, as they are in the HW.
+Whenever we receive an smp mad GUIDInfo record, the data will be cached.
+*/
+
+struct mlx4_alias_guid_work_context {
+ u8 port;
+ struct mlx4_ib_dev *dev ;
+ struct ib_sa_query *sa_query;
+ struct completion done;
+ int query_id;
+ struct list_head list;
+ int block_num;
+};
+
+struct mlx4_next_alias_guid_work {
+ u8 port;
+ u8 block_num;
+ struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
+ u8 port_num, u8 *p_data)
+{
+ int i;
+ u64 guid_indexes;
+ int slave_id;
+ int port_index = port_num - 1;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+
+ guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+ ports_guid[port_num - 1].
+ all_rec_per_port[block_num].guid_indexes);
+ pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ /* The location of the specific index starts from bit number 4
+ * until bit num 11 */
+ if (test_bit(i + 4, (unsigned long *)&guid_indexes)) {
+ slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
+ if (slave_id >= dev->dev->num_slaves) {
+ pr_debug("The last slave: %d\n", slave_id);
+ return;
+ }
+
+ /* cache the guid: */
+ memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id],
+ &p_data[i * GUID_REC_SIZE],
+ GUID_REC_SIZE);
+ } else
+ pr_debug("Guid number: %d in block: %d"
+ " was not updated\n", i, block_num);
+ }
+}
+
+static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index)
+{
+ if (index >= NUM_ALIAS_GUID_PER_PORT) {
+ pr_err("%s: ERROR: asked for index:%d\n", __func__, index);
+ return (__force __be64) ((u64) 0xFFFFFFFFFFFFFFFFUL);
+ }
+ return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index];
+}
+
+
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
+{
+ return IB_SA_COMP_MASK(4 + index);
+}
+
+/*
+ * Whenever new GUID is set/unset (guid table change) create event and
+ * notify the relevant slave (master also should be notified).
+ * If the GUID value is not as we have in the cache the slave will not be
+ * updated; in this case it waits for the smp_snoop or the port management
+ * event to call the function and to update the slave.
+ * block_number - the index of the block (16 blocks available)
+ * port_number - 1 or 2
+ */
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num, u8 port_num,
+ u8 *p_data)
+{
+ int i;
+ u64 guid_indexes;
+ int slave_id;
+ enum slave_port_state new_state;
+ enum slave_port_state prev_state;
+ __be64 tmp_cur_ag, form_cache_ag;
+ enum slave_port_gen_event gen_event;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+
+ guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+ ports_guid[port_num - 1].
+ all_rec_per_port[block_num].guid_indexes);
+ pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+ /*calculate the slaves and notify them*/
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ /* the location of the specific index runs from bits 4..11 */
+ if (!(test_bit(i + 4, (unsigned long *)&guid_indexes)))
+ continue;
+
+ slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
+ if (slave_id >= dev->dev->num_slaves)
+ return;
+ tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];
+ form_cache_ag = get_cached_alias_guid(dev, port_num,
+ (NUM_ALIAS_GUID_IN_REC * block_num) + i);
+ /*
+ * Check if guid is not the same as in the cache,
+ * If it is different, wait for the snoop_smp or the port mgmt
+ * change event to update the slave on its port state change
+ */
+ if (tmp_cur_ag != form_cache_ag)
+ continue;
+ mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+
+ /*2 cases: Valid GUID, and Invalid Guid*/
+
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
+ prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num);
+ new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+ MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+ &gen_event);
+ pr_debug("slave: %d, port: %d prev_port_state: %d,"
+ " new_port_state: %d, gen_event: %d\n",
+ slave_id, port_num, prev_state, new_state, gen_event);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_UP) {
+ pr_debug("sending PORT_UP event to slave: %d, port: %d\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev, slave_id,
+ port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
+ }
+ } else { /* request to invalidate GUID */
+ set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+ MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+ &gen_event);
+ pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num,
+ MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ }
+ }
+}
+
+static void aliasguid_query_handler(int status,
+ struct ib_sa_guidinfo_rec *guid_rec,
+ void *context)
+{
+ struct mlx4_ib_dev *dev;
+ struct mlx4_alias_guid_work_context *cb_ctx = context;
+ u8 port_index ;
+ int i;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec;
+ unsigned long flags, flags1;
+
+ if (!context)
+ return;
+
+ dev = cb_ctx->dev;
+ port_index = cb_ctx->port - 1;
+ rec = &dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[cb_ctx->block_num];
+
+ if (status) {
+ rec->status = MLX4_GUID_INFO_STATUS_IDLE;
+ pr_debug("(port: %d) failed: status = %d\n",
+ cb_ctx->port, status);
+ goto out;
+ }
+
+ if (guid_rec->block_num != cb_ctx->block_num) {
+ pr_err("block num mismatch: %d != %d\n",
+ cb_ctx->block_num, guid_rec->block_num);
+ goto out;
+ }
+
+ pr_debug("lid/port: %d/%d, block_num: %d\n",
+ be16_to_cpu(guid_rec->lid), cb_ctx->port,
+ guid_rec->block_num);
+
+ rec = &dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[guid_rec->block_num];
+
+ rec->status = MLX4_GUID_INFO_STATUS_SET;
+ rec->method = MLX4_GUID_INFO_RECORD_SET;
+
+ for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ __be64 tmp_cur_ag;
+ tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE];
+ /* check if the SM didn't assign one of the records.
+ * if it didn't, if it was not sysadmin request:
+ * ask the SM to give a new GUID, (instead of the driver request).
+ */
+ if (tmp_cur_ag == MLX4_NOT_SET_GUID) {
+ mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in "
+ "block_num: %d was declined by SM, "
+ "ownership by %d (0 = driver, 1=sysAdmin,"
+ " 2=None)\n", __func__, i,
+ guid_rec->block_num, rec->ownership);
+ if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) {
+ /* if it is driver assign, asks for new GUID from SM*/
+ *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
+ MLX4_NOT_SET_GUID;
+
+ /* Mark the record as not assigned, and let it
+ * be sent again in the next work sched.*/
+ rec->status = MLX4_GUID_INFO_STATUS_IDLE;
+ rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ }
+ } else {
+ /* properly assigned record. */
+ /* We save the GUID we just got from the SM in the
+ * admin_guid in order to be persistent, and in the
+ * request from the sm the process will ask for the same GUID */
+ if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN &&
+ tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) {
+ /* the sysadmin assignment failed.*/
+ mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
+ " admin guid after SysAdmin "
+ "configuration. "
+ "Record num %d in block_num:%d "
+ "was declined by SM, "
+ "new val(0x%llx) was kept\n",
+ __func__, i,
+ guid_rec->block_num,
+ be64_to_cpu(*(__be64 *) &
+ rec->all_recs[i * GUID_REC_SIZE]));
+ } else {
+ memcpy(&rec->all_recs[i * GUID_REC_SIZE],
+ &guid_rec->guid_info_list[i * GUID_REC_SIZE],
+ GUID_REC_SIZE);
+ }
+ }
+ }
+ /*
+ The func is call here to close the cases when the
+ sm doesn't send smp, so in the sa response the driver
+ notifies the slave.
+ */
+ mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num,
+ cb_ctx->port,
+ guid_rec->guid_info_list);
+out:
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ if (!dev->sriov.is_going_down)
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq,
+ &dev->sriov.alias_guid.ports_guid[port_index].
+ alias_guid_work, 0);
+ if (cb_ctx->sa_query) {
+ list_del(&cb_ctx->list);
+ kfree(cb_ctx);
+ } else
+ complete(&cb_ctx->done);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index)
+{
+ int i;
+ u64 cur_admin_val;
+ ib_sa_comp_mask comp_mask = 0;
+
+ dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status
+ = MLX4_GUID_INFO_STATUS_IDLE;
+ dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method
+ = MLX4_GUID_INFO_RECORD_SET;
+
+ /* calculate the comp_mask for that record.*/
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ cur_admin_val =
+ *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].all_recs[GUID_REC_SIZE * i];
+ /*
+ check the admin value: if it's for delete (~00LL) or
+ it is the first guid of the first record (hw guid) or
+ the records is not in ownership of the sysadmin and the sm doesn't
+ need to assign GUIDs, then don't put it up for assignment.
+ */
+ if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val ||
+ (!index && !i) ||
+ MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.
+ ports_guid[port - 1].all_rec_per_port[index].ownership)
+ continue;
+ comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ }
+ dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].guid_indexes = comp_mask;
+}
+
+static int set_guid_rec(struct ib_device *ibdev,
+ u8 port, int index,
+ struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+{
+ int err;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_guidinfo_rec guid_info_rec;
+ ib_sa_comp_mask comp_mask;
+ struct ib_port_attr attr;
+ struct mlx4_alias_guid_work_context *callback_context;
+ unsigned long resched_delay, flags, flags1;
+ struct list_head *head =
+ &dev->sriov.alias_guid.ports_guid[port - 1].cb_list;
+
+ err = __mlx4_ib_query_port(ibdev, port, &attr, 1);
+ if (err) {
+ pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n",
+ err, port);
+ return err;
+ }
+ /*check the port was configured by the sm, otherwise no need to send */
+ if (attr.state != IB_PORT_ACTIVE) {
+ pr_debug("port %d not active...rescheduling\n", port);
+ resched_delay = 5 * HZ;
+ err = -EAGAIN;
+ goto new_schedule;
+ }
+
+ callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL);
+ if (!callback_context) {
+ err = -ENOMEM;
+ resched_delay = HZ * 5;
+ goto new_schedule;
+ }
+ callback_context->port = port;
+ callback_context->dev = dev;
+ callback_context->block_num = index;
+
+ memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
+
+ guid_info_rec.lid = cpu_to_be16(attr.lid);
+ guid_info_rec.block_num = index;
+
+ memcpy(guid_info_rec.guid_info_list, rec_det->all_recs,
+ GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC);
+ comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM |
+ rec_det->guid_indexes;
+
+ init_completion(&callback_context->done);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ list_add_tail(&callback_context->list, head);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+ callback_context->query_id =
+ ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client,
+ ibdev, port, &guid_info_rec,
+ comp_mask, rec_det->method, 1000,
+ GFP_KERNEL, aliasguid_query_handler,
+ callback_context,
+ &callback_context->sa_query);
+ if (callback_context->query_id < 0) {
+ pr_debug("ib_sa_guid_info_rec_query failed, query_id: "
+ "%d. will reschedule to the next 1 sec.\n",
+ callback_context->query_id);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ list_del(&callback_context->list);
+ kfree(callback_context);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ resched_delay = 1 * HZ;
+ err = -EAGAIN;
+ goto new_schedule;
+ }
+ err = 0;
+ goto out;
+
+new_schedule:
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ invalidate_guid_record(dev, port, index);
+ if (!dev->sriov.is_going_down) {
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+ &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+ resched_delay);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+
+out:
+ return err;
+}
+
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port)
+{
+ int i;
+ unsigned long flags, flags1;
+
+ pr_debug("port %d\n", port);
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++)
+ invalidate_guid_record(dev, port, i);
+
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) {
+ /*
+ make sure no work waits in the queue, if the work is already
+ queued(not on the timer) the cancel will fail. That is not a problem
+ because we just want the work started.
+ */
+ __cancel_delayed_work(&dev->sriov.alias_guid.
+ ports_guid[port - 1].alias_guid_work);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+ &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+ 0);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *rec)
+{
+ int j;
+ unsigned long flags;
+
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status ==
+ MLX4_GUID_INFO_STATUS_IDLE) {
+ memcpy(&rec->rec_det,
+ &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j],
+ sizeof (struct mlx4_sriov_alias_guid_info_rec_det));
+ rec->port = port;
+ rec->block_num = j;
+ dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status =
+ MLX4_GUID_INFO_STATUS_PENDING;
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ return 0;
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ }
+ return -ENOENT;
+}
+
+static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port,
+ int rec_index,
+ struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+{
+ dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes =
+ rec_det->guid_indexes;
+ memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs,
+ rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
+ dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status =
+ rec_det->status;
+}
+
+static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port)
+{
+ int j;
+ struct mlx4_sriov_alias_guid_info_rec_det rec_det ;
+
+ for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) {
+ memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
+ rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) |
+ IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 |
+ IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 |
+ IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 |
+ IB_SA_GUIDINFO_REC_GID7;
+ rec_det.status = MLX4_GUID_INFO_STATUS_IDLE;
+ set_administratively_guid_record(dev, port, j, &rec_det);
+ }
+}
+
+static void alias_guid_work(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ int ret = 0;
+ struct mlx4_next_alias_guid_work *rec;
+ struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port =
+ container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det,
+ alias_guid_work);
+ struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent;
+ struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid,
+ struct mlx4_ib_sriov,
+ alias_guid);
+ struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov);
+
+ rec = kzalloc(sizeof *rec, GFP_KERNEL);
+ if (!rec) {
+ pr_err("alias_guid_work: No Memory\n");
+ return;
+ }
+
+ pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1);
+ ret = get_next_record_to_update(dev, sriov_alias_port->port, rec);
+ if (ret) {
+ pr_debug("No more records to update.\n");
+ goto out;
+ }
+
+ set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num,
+ &rec->rec_det);
+
+out:
+ kfree(rec);
+}
+
+
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port)
+{
+ unsigned long flags, flags1;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ if (!dev->sriov.is_going_down) {
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
+ &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+ int i;
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct mlx4_alias_guid_work_context *cb_ctx;
+ struct mlx4_sriov_alias_guid_port_rec_det *det;
+ struct ib_sa_query *sa_query;
+ unsigned long flags;
+
+ for (i = 0 ; i < dev->num_ports; i++) {
+ cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work);
+ det = &sriov->alias_guid.ports_guid[i];
+ spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+ while (!list_empty(&det->cb_list)) {
+ cb_ctx = list_entry(det->cb_list.next,
+ struct mlx4_alias_guid_work_context,
+ list);
+ sa_query = cb_ctx->sa_query;
+ cb_ctx->sa_query = NULL;
+ list_del(&cb_ctx->list);
+ spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+ ib_sa_cancel_query(cb_ctx->query_id, sa_query);
+ wait_for_completion(&cb_ctx->done);
+ kfree(cb_ctx);
+ spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+ }
+ spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+ }
+ for (i = 0 ; i < dev->num_ports; i++) {
+ flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ }
+ ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+ kfree(dev->sriov.alias_guid.sa_client);
+}
+
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+ char alias_wq_name[15];
+ int ret = 0;
+ int i, j, k;
+ union ib_gid gid;
+
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+ dev->sriov.alias_guid.sa_client =
+ kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL);
+ if (!dev->sriov.alias_guid.sa_client)
+ return -ENOMEM;
+
+ ib_sa_register_client(dev->sriov.alias_guid.sa_client);
+
+ spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
+
+ for (i = 1; i <= dev->num_ports; ++i) {
+ if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) {
+ ret = -EFAULT;
+ goto err_unregister;
+ }
+ }
+
+ for (i = 0 ; i < dev->num_ports; i++) {
+ memset(&dev->sriov.alias_guid.ports_guid[i], 0,
+ sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
+ /*Check if the SM doesn't need to assign the GUIDs*/
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ if (mlx4_ib_sm_guid_assign) {
+ dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].
+ ownership = MLX4_GUID_DRIVER_ASSIGN;
+ continue;
+ }
+ dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j].
+ ownership = MLX4_GUID_NONE_ASSIGN;
+ /*mark each val as it was deleted,
+ till the sysAdmin will give it valid val*/
+ for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] =
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
+ }
+ }
+ INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list);
+ /*prepare the records, set them to be allocated by sm*/
+ for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++)
+ invalidate_guid_record(dev, i + 1, j);
+
+ dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid;
+ dev->sriov.alias_guid.ports_guid[i].port = i;
+ if (mlx4_ib_sm_guid_assign)
+ set_all_slaves_guids(dev, i);
+
+ snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i);
+ dev->sriov.alias_guid.ports_guid[i].wq =
+ create_singlethread_workqueue(alias_wq_name);
+ if (!dev->sriov.alias_guid.ports_guid[i].wq) {
+ ret = -ENOMEM;
+ goto err_thread;
+ }
+ INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work,
+ alias_guid_work);
+ }
+ return 0;
+
+err_thread:
+ for (--i; i >= 0; i--) {
+ destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ dev->sriov.alias_guid.ports_guid[i].wq = NULL;
+ }
+
+err_unregister:
+ ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+ kfree(dev->sriov.alias_guid.sa_client);
+ dev->sriov.alias_guid.sa_client = NULL;
+ pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
new file mode 100644
index 000000000000..e25e4dafb8a8
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/idr.h>
+#include <rdma/ib_cm.h>
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ)
+
+struct id_map_entry {
+ struct rb_node node;
+
+ u32 sl_cm_id;
+ u32 pv_cm_id;
+ int slave_id;
+ int scheduled_delete;
+ struct mlx4_ib_dev *dev;
+
+ struct list_head list;
+ struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+};
+
+struct cm_req_msg {
+ unsigned char unused[0x60];
+ union ib_gid primary_path_sgid;
+};
+
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ msg->local_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+ return be32_to_cpu(msg->local_comm_id);
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ msg->remote_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+ return be32_to_cpu(msg->remote_comm_id);
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
+{
+ struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+ return msg->primary_path_sgid;
+}
+
+/* Lock should be taken before called */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+ struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+ struct rb_node *node = sl_id_map->rb_node;
+
+ while (node) {
+ struct id_map_entry *id_map_entry =
+ rb_entry(node, struct id_map_entry, node);
+
+ if (id_map_entry->sl_cm_id > sl_cm_id)
+ node = node->rb_left;
+ else if (id_map_entry->sl_cm_id < sl_cm_id)
+ node = node->rb_right;
+ else if (id_map_entry->slave_id > slave_id)
+ node = node->rb_left;
+ else if (id_map_entry->slave_id < slave_id)
+ node = node->rb_right;
+ else
+ return id_map_entry;
+ }
+ return NULL;
+}
+
+static void id_map_ent_timeout(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
+ struct id_map_entry *db_ent, *found_ent;
+ struct mlx4_ib_dev *dev = ent->dev;
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ int pv_id = (int) ent->pv_cm_id;
+
+ spin_lock(&sriov->id_map_lock);
+ db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id);
+ if (!db_ent)
+ goto out;
+ found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
+ if (found_ent && found_ent == ent)
+ rb_erase(&found_ent->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, pv_id);
+
+out:
+ list_del(&ent->list);
+ spin_unlock(&sriov->id_map_lock);
+ kfree(ent);
+}
+
+static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id)
+{
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ struct id_map_entry *ent, *found_ent;
+
+ spin_lock(&sriov->id_map_lock);
+ ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id);
+ if (!ent)
+ goto out;
+ found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id);
+ if (found_ent && found_ent == ent)
+ rb_erase(&found_ent->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, pv_cm_id);
+out:
+ spin_unlock(&sriov->id_map_lock);
+}
+
+static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new)
+{
+ struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+ struct rb_node **link = &sl_id_map->rb_node, *parent = NULL;
+ struct id_map_entry *ent;
+ int slave_id = new->slave_id;
+ int sl_cm_id = new->sl_cm_id;
+
+ ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id);
+ if (ent) {
+ pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n",
+ sl_cm_id);
+
+ rb_replace_node(&ent->node, &new->node, sl_id_map);
+ return;
+ }
+
+ /* Go to the bottom of the tree */
+ while (*link) {
+ parent = *link;
+ ent = rb_entry(parent, struct id_map_entry, node);
+
+ if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id))
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, sl_id_map);
+}
+
+static struct id_map_entry *
+id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
+{
+ int ret, id;
+ static int next_id;
+ struct id_map_entry *ent;
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+ ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL);
+ if (!ent) {
+ mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ent->sl_cm_id = sl_cm_id;
+ ent->slave_id = slave_id;
+ ent->scheduled_delete = 0;
+ ent->dev = to_mdev(ibdev);
+ INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
+
+ do {
+ spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
+ ret = idr_get_new_above(&sriov->pv_id_table, ent,
+ next_id, &id);
+ if (!ret) {
+ next_id = ((unsigned) id + 1) & MAX_ID_MASK;
+ ent->pv_cm_id = (u32)id;
+ sl_id_map_add(ibdev, ent);
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+ } while (ret == -EAGAIN && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL));
+ /*the function idr_get_new_above can return -ENOSPC, so don't insert in that case.*/
+ if (!ret) {
+ spin_lock(&sriov->id_map_lock);
+ list_add_tail(&ent->list, &sriov->cm_list);
+ spin_unlock(&sriov->id_map_lock);
+ return ent;
+ }
+ /*error flow*/
+ kfree(ent);
+ mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
+ return ERR_PTR(-ENOMEM);
+}
+
+static struct id_map_entry *
+id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id)
+{
+ struct id_map_entry *ent;
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+ spin_lock(&sriov->id_map_lock);
+ if (*pv_cm_id == -1) {
+ ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id);
+ if (ent)
+ *pv_cm_id = (int) ent->pv_cm_id;
+ } else
+ ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id);
+ spin_unlock(&sriov->id_map_lock);
+
+ return ent;
+}
+
+static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
+{
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sriov->going_down_lock, flags);
+ spin_lock(&sriov->id_map_lock);
+ /*make sure that there is no schedule inside the scheduled work.*/
+ if (!sriov->is_going_down) {
+ id->scheduled_delete = 1;
+ schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+ }
+ spin_unlock(&sriov->id_map_lock);
+ spin_unlock_irqrestore(&sriov->going_down_lock, flags);
+}
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+ struct ib_mad *mad)
+{
+ struct id_map_entry *id;
+ u32 sl_cm_id;
+ int pv_cm_id = -1;
+
+ sl_cm_id = get_local_comm_id(mad);
+
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_REP_ATTR_ID) {
+ id = id_map_alloc(ibdev, slave_id, sl_cm_id);
+ if (IS_ERR(id)) {
+ mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",
+ __func__, slave_id, sl_cm_id);
+ return PTR_ERR(id);
+ }
+ } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) {
+ return 0;
+ } else {
+ id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
+ }
+
+ if (!id) {
+ pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n",
+ slave_id, sl_cm_id);
+ return -EINVAL;
+ }
+
+ set_local_comm_id(mad, id->pv_cm_id);
+
+ if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+ schedule_delayed(ibdev, id);
+ else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID)
+ id_map_find_del(ibdev, pv_cm_id);
+
+ return 0;
+}
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+ struct ib_mad *mad)
+{
+ u32 pv_cm_id;
+ struct id_map_entry *id;
+
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) {
+ union ib_gid gid;
+
+ gid = gid_from_req_msg(ibdev, mad);
+ *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);
+ if (*slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n",
+ gid.global.interface_id);
+ return -ENOENT;
+ }
+ return 0;
+ }
+
+ pv_cm_id = get_remote_comm_id(mad);
+ id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
+
+ if (!id) {
+ pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id);
+ return -ENOENT;
+ }
+
+ *slave = id->slave_id;
+ set_remote_comm_id(mad, id->sl_cm_id);
+
+ if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+ schedule_delayed(ibdev, id);
+ else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) {
+ id_map_find_del(ibdev, (int) pv_cm_id);
+ }
+
+ return 0;
+}
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
+{
+ spin_lock_init(&dev->sriov.id_map_lock);
+ INIT_LIST_HEAD(&dev->sriov.cm_list);
+ dev->sriov.sl_id_map = RB_ROOT;
+ idr_init(&dev->sriov.pv_id_table);
+ idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL);
+}
+
+/* slave = -1 ==> all slaves */
+/* TBD -- call paravirt clean for single slave. Need for slave RESET event */
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
+{
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ struct list_head lh;
+ struct rb_node *nd;
+ int need_flush = 1;
+ struct id_map_entry *map, *tmp_map;
+ /* cancel all delayed work queue entries */
+ INIT_LIST_HEAD(&lh);
+ spin_lock(&sriov->id_map_lock);
+ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+ if (slave < 0 || slave == map->slave_id) {
+ if (map->scheduled_delete)
+ need_flush &= !!cancel_delayed_work(&map->timeout);
+ }
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+
+ if (!need_flush)
+ flush_scheduled_work(); /* make sure all timers were flushed */
+
+ /* now, remove all leftover entries from databases*/
+ spin_lock(&sriov->id_map_lock);
+ if (slave < 0) {
+ while (rb_first(sl_id_map)) {
+ struct id_map_entry *ent =
+ rb_entry(rb_first(sl_id_map),
+ struct id_map_entry, node);
+
+ rb_erase(&ent->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id);
+ }
+ list_splice_init(&dev->sriov.cm_list, &lh);
+ } else {
+ /* first, move nodes belonging to slave to db remove list */
+ nd = rb_first(sl_id_map);
+ while (nd) {
+ struct id_map_entry *ent =
+ rb_entry(nd, struct id_map_entry, node);
+ nd = rb_next(nd);
+ if (ent->slave_id == slave)
+ list_move_tail(&ent->list, &lh);
+ }
+ /* remove those nodes from databases */
+ list_for_each_entry_safe(map, tmp_map, &lh, list) {
+ rb_erase(&map->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id);
+ }
+
+ /* add remaining nodes from cm_list */
+ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+ if (slave == map->slave_id)
+ list_move_tail(&map->list, &lh);
+ }
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+
+ /* free any map entries left behind due to cancel_delayed_work above */
+ list_for_each_entry_safe(map, tmp_map, &lh, list) {
+ list_del(&map->list);
+ kfree(map);
+ }
+}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 6d4ef71cbcdf..c9eb6a6815ce 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -547,6 +547,26 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
checksum == cpu_to_be16(0xffff);
}
+static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
+ unsigned tail, struct mlx4_cqe *cqe)
+{
+ struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+ ib_dma_sync_single_for_cpu(qp->ibqp.device,
+ qp->sqp_proxy_rcv[tail].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+ wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index);
+ wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
+ wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+ wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;
+ wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+ wc->dlid_path_bits = 0;
+
+ return 0;
+}
+
static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
struct mlx4_ib_qp **cur_qp,
struct ib_wc *wc)
@@ -559,6 +579,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
int is_error;
u32 g_mlpath_rqpn;
u16 wqe_ctr;
+ unsigned tail = 0;
repoll:
cqe = next_cqe_sw(cq);
@@ -634,7 +655,8 @@ repoll:
mlx4_ib_free_srq_wqe(srq, wqe_ctr);
} else {
wq = &(*cur_qp)->rq;
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ tail = wq->tail & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[tail];
++wq->tail;
}
@@ -717,6 +739,13 @@ repoll:
break;
}
+ if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+ if ((*cur_qp)->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+ return use_tunnel_data(*cur_qp, cq, wc, tail, cqe);
+ }
+
wc->slid = be16_to_cpu(cqe->rlid);
g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xffffff;
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 9c2ae7efd00f..21a794152d15 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -32,7 +32,10 @@
#include <rdma/ib_mad.h>
#include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cache.h>
+#include <linux/random.h>
#include <linux/mlx4/cmd.h>
#include <linux/gfp.h>
#include <rdma/ib_pma.h>
@@ -44,7 +47,62 @@ enum {
MLX4_IB_VENDOR_CLASS2 = 0xa
};
-int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+#define MLX4_TUN_SEND_WRID_SHIFT 34
+#define MLX4_TUN_QPN_SHIFT 32
+#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT)
+#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT)
+
+#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
+#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
+
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
+struct mlx4_mad_rcv_buf {
+ struct ib_grh grh;
+ u8 payload[256];
+} __packed;
+
+struct mlx4_mad_snd_buf {
+ u8 payload[256];
+} __packed;
+
+struct mlx4_tunnel_mad {
+ struct ib_grh grh;
+ struct mlx4_ib_tunnel_header hdr;
+ struct ib_mad mad;
+} __packed;
+
+struct mlx4_rcv_tunnel_mad {
+ struct mlx4_rcv_tunnel_hdr hdr;
+ struct ib_grh grh;
+ struct ib_mad mad;
+} __packed;
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ int block, u32 change_bitmap);
+
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40))
+ return cpu_to_be64(NODE_GUID_HI | random32());
+}
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+ return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+ cpu_to_be64(0xff00000000000000LL);
+}
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
void *in_mad, void *response_mad)
{
@@ -71,10 +129,13 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
* Key check traps can't be generated unless we have in_wc to
* tell us where to send the trap.
*/
- if (ignore_mkey || !in_wc)
+ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc)
op_modifier |= 0x1;
- if (ignore_bkey || !in_wc)
+ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc)
op_modifier |= 0x2;
+ if (mlx4_is_mfunc(dev->dev) &&
+ (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc))
+ op_modifier |= 0x8;
if (in_wc) {
struct {
@@ -107,10 +168,10 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
in_modifier |= in_wc->slid << 16;
}
- err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
- in_modifier, op_modifier,
+ err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier,
+ mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier,
MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
- MLX4_CMD_NATIVE);
+ (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED);
if (!err)
memcpy(response_mad, outmailbox->buf, 256);
@@ -156,6 +217,10 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
{
struct ib_port_info *pinfo;
u16 lid;
+ __be16 *base;
+ u32 bn, pkey_change_bitmap;
+ int i;
+
struct mlx4_ib_dev *dev = to_mdev(ibdev);
if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
@@ -171,17 +236,46 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
pinfo->neighbormtu_mastersmsl & 0xf);
if (pinfo->clientrereg_resv_subnetto & 0x80)
- mlx4_ib_dispatch_event(dev, port_num,
- IB_EVENT_CLIENT_REREGISTER);
+ handle_client_rereg_event(dev, port_num);
if (prev_lid != lid)
- mlx4_ib_dispatch_event(dev, port_num,
- IB_EVENT_LID_CHANGE);
+ handle_lid_change_event(dev, port_num);
break;
case IB_SMP_ATTR_PKEY_TABLE:
- mlx4_ib_dispatch_event(dev, port_num,
- IB_EVENT_PKEY_CHANGE);
+ if (!mlx4_is_mfunc(dev->dev)) {
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_PKEY_CHANGE);
+ break;
+ }
+
+ /* at this point, we are running in the master.
+ * Slaves do not receive SMPs.
+ */
+ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF;
+ base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
+ pkey_change_bitmap = 0;
+ for (i = 0; i < 32; i++) {
+ pr_debug("PKEY[%d] = x%x\n",
+ i + bn*32, be16_to_cpu(base[i]));
+ if (be16_to_cpu(base[i]) !=
+ dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) {
+ pkey_change_bitmap |= (1 << i);
+ dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] =
+ be16_to_cpu(base[i]);
+ }
+ }
+ pr_debug("PKEY Change event: port=%d, "
+ "block=0x%x, change_bitmap=0x%x\n",
+ port_num, bn, pkey_change_bitmap);
+
+ if (pkey_change_bitmap) {
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_PKEY_CHANGE);
+ if (!dev->sriov.is_going_down)
+ __propagate_pkey_ev(dev, port_num, bn,
+ pkey_change_bitmap);
+ }
break;
case IB_SMP_ATTR_GUID_INFO:
@@ -189,12 +283,56 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
if (!mlx4_is_master(dev->dev))
mlx4_ib_dispatch_event(dev, port_num,
IB_EVENT_GID_CHANGE);
+ /*if master, notify relevant slaves*/
+ if (mlx4_is_master(dev->dev) &&
+ !dev->sriov.is_going_down) {
+ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod);
+ mlx4_ib_update_cache_on_guid_change(dev, bn, port_num,
+ (u8 *)(&((struct ib_smp *)mad)->data));
+ mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num,
+ (u8 *)(&((struct ib_smp *)mad)->data));
+ }
break;
+
default:
break;
}
}
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ int block, u32 change_bitmap)
+{
+ int i, ix, slave, err;
+ int have_event = 0;
+
+ for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) {
+ if (slave == mlx4_master_func_num(dev->dev))
+ continue;
+ if (!mlx4_is_slave_active(dev->dev, slave))
+ continue;
+
+ have_event = 0;
+ for (i = 0; i < 32; i++) {
+ if (!(change_bitmap & (1 << i)))
+ continue;
+ for (ix = 0;
+ ix < dev->dev->caps.pkey_table_len[port_num]; ix++) {
+ if (dev->pkeys.virt2phys_pkey[slave][port_num - 1]
+ [ix] == i + 32 * block) {
+ err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num);
+ pr_debug("propagate_pkey_ev: slave %d,"
+ " port %d, ix %d (%d)\n",
+ slave, port_num, ix, err);
+ have_event = 1;
+ break;
+ }
+ }
+ if (have_event)
+ break;
+ }
+ }
+}
+
static void node_desc_override(struct ib_device *dev,
struct ib_mad *mad)
{
@@ -242,6 +380,268 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma
}
}
+static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *sa_mad)
+{
+ int ret = 0;
+
+ /* dispatch to different sa handlers */
+ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int i;
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (dev->sriov.demux[port - 1].guid_cache[i] == guid)
+ return i;
+ }
+ return -1;
+}
+
+
+static int get_pkey_phys_indices(struct mlx4_ib_dev *ibdev, u8 port, u8 ph_pkey_ix,
+ u8 *full_pk_ix, u8 *partial_pk_ix,
+ int *is_full_member)
+{
+ u16 search_pkey;
+ int fm;
+ int err = 0;
+ u16 pk;
+
+ err = ib_get_cached_pkey(&ibdev->ib_dev, port, ph_pkey_ix, &search_pkey);
+ if (err)
+ return err;
+
+ fm = (search_pkey & 0x8000) ? 1 : 0;
+ if (fm) {
+ *full_pk_ix = ph_pkey_ix;
+ search_pkey &= 0x7FFF;
+ } else {
+ *partial_pk_ix = ph_pkey_ix;
+ search_pkey |= 0x8000;
+ }
+
+ if (ib_find_exact_cached_pkey(&ibdev->ib_dev, port, search_pkey, &pk))
+ pk = 0xFFFF;
+
+ if (fm)
+ *partial_pk_ix = (pk & 0xFF);
+ else
+ *full_pk_ix = (pk & 0xFF);
+
+ *is_full_member = fm;
+ return err;
+}
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad)
+{
+ struct ib_sge list;
+ struct ib_send_wr wr, *bad_wr;
+ struct mlx4_ib_demux_pv_ctx *tun_ctx;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct mlx4_rcv_tunnel_mad *tun_mad;
+ struct ib_ah_attr attr;
+ struct ib_ah *ah;
+ struct ib_qp *src_qp = NULL;
+ unsigned tun_tx_ix = 0;
+ int dqpn;
+ int ret = 0;
+ int i;
+ int is_full_member = 0;
+ u16 tun_pkey_ix;
+ u8 ph_pkey_ix, full_pk_ix = 0, partial_pk_ix = 0;
+
+ if (dest_qpt > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_ctx = dev->sriov.demux[port-1].tun[slave];
+
+ /* check if proxy qp created */
+ if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
+ return -EAGAIN;
+
+ /* QP0 forwarding only for Dom0 */
+ if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave))
+ return -EINVAL;
+
+ if (!dest_qpt)
+ tun_qp = &tun_ctx->qp[0];
+ else
+ tun_qp = &tun_ctx->qp[1];
+
+ /* compute pkey index for slave */
+ /* get physical pkey -- virtualized Dom0 pkey to phys*/
+ if (dest_qpt) {
+ ph_pkey_ix =
+ dev->pkeys.virt2phys_pkey[mlx4_master_func_num(dev->dev)][port - 1][wc->pkey_index];
+
+ /* now, translate this to the slave pkey index */
+ ret = get_pkey_phys_indices(dev, port, ph_pkey_ix, &full_pk_ix,
+ &partial_pk_ix, &is_full_member);
+ if (ret)
+ return -EINVAL;
+
+ for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) {
+ if ((dev->pkeys.virt2phys_pkey[slave][port - 1][i] == full_pk_ix) ||
+ (is_full_member &&
+ (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == partial_pk_ix)))
+ break;
+ }
+ if (i == dev->dev->caps.pkey_table_len[port])
+ return -EINVAL;
+ tun_pkey_ix = i;
+ } else
+ tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+
+ dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1;
+
+ /* get tunnel tx data buf for slave */
+ src_qp = tun_qp->qp;
+
+ /* create ah. Just need an empty one with the port num for the post send.
+ * The driver will set the force loopback bit in post_send */
+ memset(&attr, 0, sizeof attr);
+ attr.port_num = port;
+ ah = ib_create_ah(tun_ctx->pd, &attr);
+ if (IS_ERR(ah))
+ return -ENOMEM;
+
+ /* allocate tunnel tx buf after pass failure returns */
+ spin_lock(&tun_qp->tx_lock);
+ if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >=
+ (MLX4_NUM_TUNNEL_BUFS - 1))
+ ret = -EAGAIN;
+ else
+ tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+ spin_unlock(&tun_qp->tx_lock);
+ if (ret)
+ goto out;
+
+ tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
+ if (tun_qp->tx_ring[tun_tx_ix].ah)
+ ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah);
+ tun_qp->tx_ring[tun_tx_ix].ah = ah;
+ ib_dma_sync_single_for_cpu(&dev->ib_dev,
+ tun_qp->tx_ring[tun_tx_ix].buf.map,
+ sizeof (struct mlx4_rcv_tunnel_mad),
+ DMA_TO_DEVICE);
+
+ /* copy over to tunnel buffer */
+ if (grh)
+ memcpy(&tun_mad->grh, grh, sizeof *grh);
+ memcpy(&tun_mad->mad, mad, sizeof *mad);
+
+ /* adjust tunnel data */
+ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix);
+ tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
+ tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);
+ tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);
+ tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0;
+
+ ib_dma_sync_single_for_device(&dev->ib_dev,
+ tun_qp->tx_ring[tun_tx_ix].buf.map,
+ sizeof (struct mlx4_rcv_tunnel_mad),
+ DMA_TO_DEVICE);
+
+ list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
+ list.length = sizeof (struct mlx4_rcv_tunnel_mad);
+ list.lkey = tun_ctx->mr->lkey;
+
+ wr.wr.ud.ah = ah;
+ wr.wr.ud.port_num = port;
+ wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+ wr.wr.ud.remote_qpn = dqpn;
+ wr.next = NULL;
+ wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt);
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(src_qp, &wr, &bad_wr);
+out:
+ if (ret)
+ ib_destroy_ah(ah);
+ return ret;
+}
+
+static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
+ struct ib_wc *wc, struct ib_grh *grh,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int err;
+ int slave;
+ u8 *slave_id;
+
+ /* Initially assume that this mad is for us */
+ slave = mlx4_master_func_num(dev->dev);
+
+ /* See if the slave id is encoded in a response mad */
+ if (mad->mad_hdr.method & 0x80) {
+ slave_id = (u8 *) &mad->mad_hdr.tid;
+ slave = *slave_id;
+ if (slave != 255) /*255 indicates the dom0*/
+ *slave_id = 0; /* remap tid */
+ }
+
+ /* If a grh is present, we demux according to it */
+ if (wc->wc_flags & IB_WC_GRH) {
+ slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id);
+ if (slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching grh\n");
+ return -ENOENT;
+ }
+ }
+ /* Class-specific handling */
+ switch (mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_ADM:
+ if (mlx4_ib_demux_sa_handler(ibdev, port, slave,
+ (struct ib_sa_mad *) mad))
+ return 0;
+ break;
+ case IB_MGMT_CLASS_CM:
+ if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad))
+ return 0;
+ break;
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+ return 0;
+ break;
+ default:
+ /* Drop unsupported classes for slaves in tunnel mode */
+ if (slave != mlx4_master_func_num(dev->dev)) {
+ pr_debug("dropping unsupported ingress mad from class:%d "
+ "for slave:%d\n", mad->mad_hdr.mgmt_class, slave);
+ return 0;
+ }
+ }
+ /*make sure that no slave==255 was not handled yet.*/
+ if (slave >= dev->dev->caps.sqp_demux) {
+ mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+ slave, dev->dev->caps.sqp_demux);
+ return -ENOENT;
+ }
+
+ err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+ if (err)
+ pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+ slave, err);
+ return 0;
+}
+
static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
struct ib_wc *in_wc, struct ib_grh *in_grh,
struct ib_mad *in_mad, struct ib_mad *out_mad)
@@ -306,8 +706,9 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
prev_lid = pattr.lid;
err = mlx4_MAD_IFC(to_mdev(ibdev),
- mad_flags & IB_MAD_IGNORE_MKEY,
- mad_flags & IB_MAD_IGNORE_BKEY,
+ (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) |
+ (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) |
+ MLX4_MAD_IFC_NET_VIEW,
port_num, in_wc, in_grh, in_mad, out_mad);
if (err)
return IB_MAD_RESULT_FAILURE;
@@ -315,7 +716,9 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
if (!out_mad->mad_hdr.status) {
if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
smp_snoop(ibdev, port_num, in_mad, prev_lid);
- node_desc_override(ibdev, out_mad);
+ /* slaves get node desc from FW */
+ if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+ node_desc_override(ibdev, out_mad);
}
/* set return bit in status of directed route responses */
@@ -398,6 +801,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
static void send_handler(struct ib_mad_agent *agent,
struct ib_mad_send_wc *mad_send_wc)
{
+ if (mad_send_wc->send_buf->context[0])
+ ib_destroy_ah(mad_send_wc->send_buf->context[0]);
ib_free_send_mad(mad_send_wc->send_buf);
}
@@ -456,6 +861,90 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
}
}
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+ mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE);
+
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+ MLX4_EQ_PORT_INFO_LID_CHANGE_MASK);
+}
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+ /* re-configure the alias-guid and mcg's */
+ if (mlx4_is_master(dev->dev)) {
+ mlx4_ib_invalidate_all_guid_record(dev, port_num);
+
+ if (!dev->sriov.is_going_down) {
+ mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0);
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+ MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK);
+ }
+ }
+ mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
+static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ struct mlx4_eqe *eqe)
+{
+ __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe),
+ GET_MASK_FROM_EQE(eqe));
+}
+
+static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num,
+ u32 guid_tbl_blk_num, u32 change_bitmap)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ u16 i;
+
+ if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev))
+ return;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n");
+ goto out;
+ }
+
+ guid_tbl_blk_num *= 4;
+
+ for (i = 0; i < 4; i++) {
+ if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff)))
+ continue;
+ memset(in_mad, 0, sizeof *in_mad);
+ memset(out_mad, 0, sizeof *out_mad);
+
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i);
+
+ if (mlx4_MAD_IFC(dev,
+ MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW,
+ port_num, NULL, NULL, in_mad, out_mad)) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n");
+ goto out;
+ }
+
+ mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i,
+ port_num,
+ (u8 *)(&((struct ib_smp *)out_mad)->data));
+ mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i,
+ port_num,
+ (u8 *)(&((struct ib_smp *)out_mad)->data));
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return;
+}
+
void handle_port_mgmt_change_event(struct work_struct *work)
{
struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
@@ -463,6 +952,8 @@ void handle_port_mgmt_change_event(struct work_struct *work)
struct mlx4_eqe *eqe = &(ew->ib_eqe);
u8 port = eqe->event.port_mgmt_change.port;
u32 changed_attr;
+ u32 tbl_block;
+ u32 change_bitmap;
switch (eqe->subtype) {
case MLX4_DEV_PMC_SUBTYPE_PORT_INFO:
@@ -478,24 +969,36 @@ void handle_port_mgmt_change_event(struct work_struct *work)
/* Check if it is a lid change event */
if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK)
- mlx4_ib_dispatch_event(dev, port, IB_EVENT_LID_CHANGE);
+ handle_lid_change_event(dev, port);
/* Generate GUID changed event */
- if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK)
+ if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+ /*if master, notify all slaves*/
+ if (mlx4_is_master(dev->dev))
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
+ MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK);
+ }
if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
- mlx4_ib_dispatch_event(dev, port,
- IB_EVENT_CLIENT_REREGISTER);
+ handle_client_rereg_event(dev, port);
break;
case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE);
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+ propagate_pkey_ev(dev, port, eqe);
break;
case MLX4_DEV_PMC_SUBTYPE_GUID_INFO:
/* paravirtualized master's guid is guid 0 -- does not change */
if (!mlx4_is_master(dev->dev))
mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+ /*if master, notify relevant slaves*/
+ else if (!dev->sriov.is_going_down) {
+ tbl_block = GET_BLK_PTR_FROM_EQE(eqe);
+ change_bitmap = GET_MASK_FROM_EQE(eqe);
+ handle_slaves_guid_change(dev, port, tbl_block, change_bitmap);
+ }
break;
default:
pr_warn("Unsupported subtype 0x%x for "
@@ -516,3 +1019,1035 @@ void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
ib_dispatch_event(&event);
}
+
+static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
+{
+ unsigned long flags;
+ struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+ queue_work(ctx->wq, &ctx->work);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
+ struct mlx4_ib_demux_pv_qp *tun_qp,
+ int index)
+{
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ int size;
+
+ size = (tun_qp->qp->qp_type == IB_QPT_UD) ?
+ sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf);
+
+ sg_list.addr = tun_qp->ring[index].map;
+ sg_list.length = size;
+ sg_list.lkey = ctx->mr->lkey;
+
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+ recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV |
+ MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);
+ ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map,
+ size, DMA_FROM_DEVICE);
+ return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr);
+}
+
+static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
+ int slave, struct ib_sa_mad *sa_mad)
+{
+ int ret = 0;
+
+ /* dispatch to different sa handlers */
+ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
+{
+ int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
+
+ return (qpn >= proxy_start && qpn <= proxy_start + 1);
+}
+
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+ u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad)
+{
+ struct ib_sge list;
+ struct ib_send_wr wr, *bad_wr;
+ struct mlx4_ib_demux_pv_ctx *sqp_ctx;
+ struct mlx4_ib_demux_pv_qp *sqp;
+ struct mlx4_mad_snd_buf *sqp_mad;
+ struct ib_ah *ah;
+ struct ib_qp *send_qp = NULL;
+ unsigned wire_tx_ix = 0;
+ int ret = 0;
+ u16 wire_pkey_ix;
+ int src_qpnum;
+ u8 sgid_index;
+
+
+ sqp_ctx = dev->sriov.sqps[port-1];
+
+ /* check if proxy qp created */
+ if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)
+ return -EAGAIN;
+
+ /* QP0 forwarding only for Dom0 */
+ if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave))
+ return -EINVAL;
+
+ if (dest_qpt == IB_QPT_SMI) {
+ src_qpnum = 0;
+ sqp = &sqp_ctx->qp[0];
+ wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+ } else {
+ src_qpnum = 1;
+ sqp = &sqp_ctx->qp[1];
+ wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index];
+ }
+
+ send_qp = sqp->qp;
+
+ /* create ah */
+ sgid_index = attr->grh.sgid_index;
+ attr->grh.sgid_index = 0;
+ ah = ib_create_ah(sqp_ctx->pd, attr);
+ if (IS_ERR(ah))
+ return -ENOMEM;
+ attr->grh.sgid_index = sgid_index;
+ to_mah(ah)->av.ib.gid_index = sgid_index;
+ /* get rid of force-loopback bit */
+ to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
+ spin_lock(&sqp->tx_lock);
+ if (sqp->tx_ix_head - sqp->tx_ix_tail >=
+ (MLX4_NUM_TUNNEL_BUFS - 1))
+ ret = -EAGAIN;
+ else
+ wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+ spin_unlock(&sqp->tx_lock);
+ if (ret)
+ goto out;
+
+ sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
+ if (sqp->tx_ring[wire_tx_ix].ah)
+ ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah);
+ sqp->tx_ring[wire_tx_ix].ah = ah;
+ ib_dma_sync_single_for_cpu(&dev->ib_dev,
+ sqp->tx_ring[wire_tx_ix].buf.map,
+ sizeof (struct mlx4_mad_snd_buf),
+ DMA_TO_DEVICE);
+
+ memcpy(&sqp_mad->payload, mad, sizeof *mad);
+
+ ib_dma_sync_single_for_device(&dev->ib_dev,
+ sqp->tx_ring[wire_tx_ix].buf.map,
+ sizeof (struct mlx4_mad_snd_buf),
+ DMA_TO_DEVICE);
+
+ list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
+ list.length = sizeof (struct mlx4_mad_snd_buf);
+ list.lkey = sqp_ctx->mr->lkey;
+
+ wr.wr.ud.ah = ah;
+ wr.wr.ud.port_num = port;
+ wr.wr.ud.pkey_index = wire_pkey_ix;
+ wr.wr.ud.remote_qkey = qkey;
+ wr.wr.ud.remote_qpn = remote_qpn;
+ wr.next = NULL;
+ wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum);
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(send_qp, &wr, &bad_wr);
+out:
+ if (ret)
+ ib_destroy_ah(ah);
+ return ret;
+}
+
+static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)];
+ int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1);
+ struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr;
+ struct mlx4_ib_ah ah;
+ struct ib_ah_attr ah_attr;
+ u8 *slave_id;
+ int slave;
+
+ /* Get slave that sent this packet */
+ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
+ wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX ||
+ (wc->src_qp & 0x1) != ctx->port - 1 ||
+ wc->src_qp & 0x4) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp);
+ return;
+ }
+ slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8;
+ if (slave != ctx->slave) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+ "belongs to another slave\n", wc->src_qp);
+ return;
+ }
+ if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+ "non-master trying to send QP0 packets\n", wc->src_qp);
+ return;
+ }
+
+ /* Map transaction ID */
+ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
+ sizeof (struct mlx4_tunnel_mad),
+ DMA_FROM_DEVICE);
+ switch (tunnel->mad.mad_hdr.method) {
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_DELETE:
+ case IB_SA_METHOD_GET_MULTI:
+ case IB_SA_METHOD_GET_TRACE_TBL:
+ slave_id = (u8 *) &tunnel->mad.mad_hdr.tid;
+ if (*slave_id) {
+ mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
+ "class:%d slave:%d\n", *slave_id,
+ tunnel->mad.mad_hdr.mgmt_class, slave);
+ return;
+ } else
+ *slave_id = slave;
+ default:
+ /* nothing */;
+ }
+
+ /* Class-specific handling */
+ switch (tunnel->mad.mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_ADM:
+ if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
+ (struct ib_sa_mad *) &tunnel->mad))
+ return;
+ break;
+ case IB_MGMT_CLASS_CM:
+ if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave,
+ (struct ib_mad *) &tunnel->mad))
+ return;
+ break;
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
+ tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET)
+ return;
+ break;
+ default:
+ /* Drop unsupported classes for slaves in tunnel mode */
+ if (slave != mlx4_master_func_num(dev->dev)) {
+ mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d "
+ "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave);
+ return;
+ }
+ }
+
+ /* We are using standard ib_core services to send the mad, so generate a
+ * stadard address handle by decoding the tunnelled mlx4_ah fields */
+ memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));
+ ah.ibah.device = ctx->ib_dev;
+ mlx4_ib_query_ah(&ah.ibah, &ah_attr);
+ if ((ah_attr.ah_flags & IB_AH_GRH) &&
+ (ah_attr.grh.sgid_index != slave)) {
+ mlx4_ib_warn(ctx->ib_dev, "slave:%d accessed invalid sgid_index:%d\n",
+ slave, ah_attr.grh.sgid_index);
+ return;
+ }
+
+ mlx4_ib_send_to_wire(dev, slave, ctx->port,
+ is_proxy_qp0(dev, wc->src_qp, slave) ?
+ IB_QPT_SMI : IB_QPT_GSI,
+ be16_to_cpu(tunnel->hdr.pkey_index),
+ be32_to_cpu(tunnel->hdr.remote_qpn),
+ be32_to_cpu(tunnel->hdr.qkey),
+ &ah_attr, &tunnel->mad);
+}
+
+static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int is_tun)
+{
+ int i;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ int rx_buf_size, tx_buf_size;
+
+ if (qp_type > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_qp = &ctx->qp[qp_type];
+
+ tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS,
+ GFP_KERNEL);
+ if (!tun_qp->ring)
+ return -ENOMEM;
+
+ tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+ sizeof (struct mlx4_ib_tun_tx_buf),
+ GFP_KERNEL);
+ if (!tun_qp->tx_ring) {
+ kfree(tun_qp->ring);
+ tun_qp->ring = NULL;
+ return -ENOMEM;
+ }
+
+ if (is_tun) {
+ rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+ tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+ } else {
+ rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+ tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
+ if (!tun_qp->ring[i].addr)
+ goto err;
+ tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev,
+ tun_qp->ring[i].addr,
+ rx_buf_size,
+ DMA_FROM_DEVICE);
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ tun_qp->tx_ring[i].buf.addr =
+ kmalloc(tx_buf_size, GFP_KERNEL);
+ if (!tun_qp->tx_ring[i].buf.addr)
+ goto tx_err;
+ tun_qp->tx_ring[i].buf.map =
+ ib_dma_map_single(ctx->ib_dev,
+ tun_qp->tx_ring[i].buf.addr,
+ tx_buf_size,
+ DMA_TO_DEVICE);
+ tun_qp->tx_ring[i].ah = NULL;
+ }
+ spin_lock_init(&tun_qp->tx_lock);
+ tun_qp->tx_ix_head = 0;
+ tun_qp->tx_ix_tail = 0;
+ tun_qp->proxy_qpt = qp_type;
+
+ return 0;
+
+tx_err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+ tx_buf_size, DMA_TO_DEVICE);
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ }
+ kfree(tun_qp->tx_ring);
+ tun_qp->tx_ring = NULL;
+ i = MLX4_NUM_TUNNEL_BUFS;
+err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+ rx_buf_size, DMA_FROM_DEVICE);
+ kfree(tun_qp->ring[i].addr);
+ }
+ kfree(tun_qp->ring);
+ tun_qp->ring = NULL;
+ return -ENOMEM;
+}
+
+static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int is_tun)
+{
+ int i;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ int rx_buf_size, tx_buf_size;
+
+ if (qp_type > IB_QPT_GSI)
+ return;
+
+ tun_qp = &ctx->qp[qp_type];
+ if (is_tun) {
+ rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+ tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+ } else {
+ rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+ tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+ }
+
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+ rx_buf_size, DMA_FROM_DEVICE);
+ kfree(tun_qp->ring[i].addr);
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+ tx_buf_size, DMA_TO_DEVICE);
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ if (tun_qp->tx_ring[i].ah)
+ ib_destroy_ah(tun_qp->tx_ring[i].ah);
+ }
+ kfree(tun_qp->tx_ring);
+ kfree(tun_qp->ring);
+}
+
+static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct ib_wc wc;
+ int ret;
+ ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+ ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+ while (ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+ tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_RECV:
+ mlx4_ib_multiplex_mad(ctx, &wc);
+ ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp,
+ wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1));
+ if (ret)
+ pr_err("Failed reposting tunnel "
+ "buf:%lld\n", wc.wr_id);
+ break;
+ case IB_WC_SEND:
+ pr_debug("received tunnel send completion:"
+ "wrid=0x%llx, status=0x%x\n",
+ wc.wr_id, wc.status);
+ ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+
+ break;
+ default:
+ break;
+ }
+ } else {
+ pr_debug("mlx4_ib: completion error in tunnel: %d."
+ " status = %d, wrid = 0x%llx\n",
+ ctx->slave, wc.status, wc.wr_id);
+ if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+ ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+ }
+ }
+ }
+}
+
+static void pv_qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct mlx4_ib_demux_pv_ctx *sqp = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ pr_err("Fatal error (%d) on a MAD QP on port %d\n",
+ event->event, sqp->port);
+}
+
+static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int create_tun)
+{
+ int i, ret;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
+ struct ib_qp_attr attr;
+ int qp_attr_mask_INIT;
+
+ if (qp_type > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_qp = &ctx->qp[qp_type];
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.init_attr.send_cq = ctx->cq;
+ qp_init_attr.init_attr.recv_cq = ctx->cq;
+ qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS;
+ qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS;
+ qp_init_attr.init_attr.cap.max_send_sge = 1;
+ qp_init_attr.init_attr.cap.max_recv_sge = 1;
+ if (create_tun) {
+ qp_init_attr.init_attr.qp_type = IB_QPT_UD;
+ qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP;
+ qp_init_attr.port = ctx->port;
+ qp_init_attr.slave = ctx->slave;
+ qp_init_attr.proxy_qp_type = qp_type;
+ qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_QKEY | IB_QP_PORT;
+ } else {
+ qp_init_attr.init_attr.qp_type = qp_type;
+ qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP;
+ qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY;
+ }
+ qp_init_attr.init_attr.port_num = ctx->port;
+ qp_init_attr.init_attr.qp_context = ctx;
+ qp_init_attr.init_attr.event_handler = pv_qp_event_handler;
+ tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr);
+ if (IS_ERR(tun_qp->qp)) {
+ ret = PTR_ERR(tun_qp->qp);
+ tun_qp->qp = NULL;
+ pr_err("Couldn't create %s QP (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ return ret;
+ }
+
+ memset(&attr, 0, sizeof attr);
+ attr.qp_state = IB_QPS_INIT;
+ attr.pkey_index =
+ to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
+ attr.qkey = IB_QP1_QKEY;
+ attr.port_num = ctx->port;
+ ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to INIT (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+ attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to RTR (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+ attr.qp_state = IB_QPS_RTS;
+ attr.sq_psn = 0;
+ ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to RTS (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
+ if (ret) {
+ pr_err(" mlx4_ib_post_pv_buf error"
+ " (err = %d, i = %d)\n", ret, i);
+ goto err_qp;
+ }
+ }
+ return 0;
+
+err_qp:
+ ib_destroy_qp(tun_qp->qp);
+ tun_qp->qp = NULL;
+ return ret;
+}
+
+/*
+ * IB MAD completion callback for real SQPs
+ */
+static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+ struct mlx4_ib_demux_pv_qp *sqp;
+ struct ib_wc wc;
+ struct ib_grh *grh;
+ struct ib_mad *mad;
+
+ ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+ ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+ while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+ sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ break;
+ case IB_WC_RECV:
+ mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
+ (sqp->ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload);
+ grh = &(((struct mlx4_mad_rcv_buf *)
+ (sqp->ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh);
+ mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
+ if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)))
+ pr_err("Failed reposting SQP "
+ "buf:%lld\n", wc.wr_id);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ } else {
+ pr_debug("mlx4_ib: completion error in tunnel: %d."
+ " status = %d, wrid = 0x%llx\n",
+ ctx->slave, wc.status, wc.wr_id);
+ if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+ ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ }
+ }
+ }
+}
+
+static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port,
+ struct mlx4_ib_demux_pv_ctx **ret_ctx)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+
+ *ret_ctx = NULL;
+ ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL);
+ if (!ctx) {
+ pr_err("failed allocating pv resource context "
+ "for port %d, slave %d\n", port, slave);
+ return -ENOMEM;
+ }
+
+ ctx->ib_dev = &dev->ib_dev;
+ ctx->port = port;
+ ctx->slave = slave;
+ *ret_ctx = ctx;
+ return 0;
+}
+
+static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port)
+{
+ if (dev->sriov.demux[port - 1].tun[slave]) {
+ kfree(dev->sriov.demux[port - 1].tun[slave]);
+ dev->sriov.demux[port - 1].tun[slave] = NULL;
+ }
+}
+
+static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
+ int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
+{
+ int ret, cq_size;
+
+ if (ctx->state != DEMUX_PV_STATE_DOWN)
+ return -EEXIST;
+
+ ctx->state = DEMUX_PV_STATE_STARTING;
+ /* have QP0 only on port owner, and only if link layer is IB */
+ if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) &&
+ rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND)
+ ctx->has_smi = 1;
+
+ if (ctx->has_smi) {
+ ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun);
+ if (ret) {
+ pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret);
+ goto err_out;
+ }
+ }
+
+ ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun);
+ if (ret) {
+ pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret);
+ goto err_out_qp0;
+ }
+
+ cq_size = 2 * MLX4_NUM_TUNNEL_BUFS;
+ if (ctx->has_smi)
+ cq_size *= 2;
+
+ ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+ NULL, ctx, cq_size, 0);
+ if (IS_ERR(ctx->cq)) {
+ ret = PTR_ERR(ctx->cq);
+ pr_err("Couldn't create tunnel CQ (%d)\n", ret);
+ goto err_buf;
+ }
+
+ ctx->pd = ib_alloc_pd(ctx->ib_dev);
+ if (IS_ERR(ctx->pd)) {
+ ret = PTR_ERR(ctx->pd);
+ pr_err("Couldn't create tunnel PD (%d)\n", ret);
+ goto err_cq;
+ }
+
+ ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(ctx->mr)) {
+ ret = PTR_ERR(ctx->mr);
+ pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
+ goto err_pd;
+ }
+
+ if (ctx->has_smi) {
+ ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
+ if (ret) {
+ pr_err("Couldn't create %s QP0 (%d)\n",
+ create_tun ? "tunnel for" : "", ret);
+ goto err_mr;
+ }
+ }
+
+ ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun);
+ if (ret) {
+ pr_err("Couldn't create %s QP1 (%d)\n",
+ create_tun ? "tunnel for" : "", ret);
+ goto err_qp0;
+ }
+
+ if (create_tun)
+ INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker);
+ else
+ INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
+
+ ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+
+ ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ pr_err("Couldn't arm tunnel cq (%d)\n", ret);
+ goto err_wq;
+ }
+ ctx->state = DEMUX_PV_STATE_ACTIVE;
+ return 0;
+
+err_wq:
+ ctx->wq = NULL;
+ ib_destroy_qp(ctx->qp[1].qp);
+ ctx->qp[1].qp = NULL;
+
+
+err_qp0:
+ if (ctx->has_smi)
+ ib_destroy_qp(ctx->qp[0].qp);
+ ctx->qp[0].qp = NULL;
+
+err_mr:
+ ib_dereg_mr(ctx->mr);
+ ctx->mr = NULL;
+
+err_pd:
+ ib_dealloc_pd(ctx->pd);
+ ctx->pd = NULL;
+
+err_cq:
+ ib_destroy_cq(ctx->cq);
+ ctx->cq = NULL;
+
+err_buf:
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun);
+
+err_out_qp0:
+ if (ctx->has_smi)
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun);
+err_out:
+ ctx->state = DEMUX_PV_STATE_DOWN;
+ return ret;
+}
+
+static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
+ struct mlx4_ib_demux_pv_ctx *ctx, int flush)
+{
+ if (!ctx)
+ return;
+ if (ctx->state > DEMUX_PV_STATE_DOWN) {
+ ctx->state = DEMUX_PV_STATE_DOWNING;
+ if (flush)
+ flush_workqueue(ctx->wq);
+ if (ctx->has_smi) {
+ ib_destroy_qp(ctx->qp[0].qp);
+ ctx->qp[0].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1);
+ }
+ ib_destroy_qp(ctx->qp[1].qp);
+ ctx->qp[1].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
+ ib_dereg_mr(ctx->mr);
+ ctx->mr = NULL;
+ ib_dealloc_pd(ctx->pd);
+ ctx->pd = NULL;
+ ib_destroy_cq(ctx->cq);
+ ctx->cq = NULL;
+ ctx->state = DEMUX_PV_STATE_DOWN;
+ }
+}
+
+static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave,
+ int port, int do_init)
+{
+ int ret = 0;
+
+ if (!do_init) {
+ clean_vf_mcast(&dev->sriov.demux[port - 1], slave);
+ /* for master, destroy real sqp resources */
+ if (slave == mlx4_master_func_num(dev->dev))
+ destroy_pv_resources(dev, slave, port,
+ dev->sriov.sqps[port - 1], 1);
+ /* destroy the tunnel qp resources */
+ destroy_pv_resources(dev, slave, port,
+ dev->sriov.demux[port - 1].tun[slave], 1);
+ return 0;
+ }
+
+ /* create the tunnel qp resources */
+ ret = create_pv_resources(&dev->ib_dev, slave, port, 1,
+ dev->sriov.demux[port - 1].tun[slave]);
+
+ /* for master, create the real sqp resources */
+ if (!ret && slave == mlx4_master_func_num(dev->dev))
+ ret = create_pv_resources(&dev->ib_dev, slave, port, 0,
+ dev->sriov.sqps[port - 1]);
+ return ret;
+}
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work)
+{
+ struct mlx4_ib_demux_work *dmxw;
+
+ dmxw = container_of(work, struct mlx4_ib_demux_work, work);
+ mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port,
+ dmxw->do_init);
+ kfree(dmxw);
+ return;
+}
+
+static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_demux_ctx *ctx,
+ int port)
+{
+ char name[12];
+ int ret = 0;
+ int i;
+
+ ctx->tun = kcalloc(dev->dev->caps.sqp_demux,
+ sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL);
+ if (!ctx->tun)
+ return -ENOMEM;
+
+ ctx->dev = dev;
+ ctx->port = port;
+ ctx->ib_dev = &dev->ib_dev;
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
+ if (ret) {
+ ret = -ENOMEM;
+ goto err_mcg;
+ }
+ }
+
+ ret = mlx4_ib_mcg_port_init(ctx);
+ if (ret) {
+ pr_err("Failed initializing mcg para-virt (%d)\n", ret);
+ goto err_mcg;
+ }
+
+ snprintf(name, sizeof name, "mlx4_ibt%d", port);
+ ctx->wq = create_singlethread_workqueue(name);
+ if (!ctx->wq) {
+ pr_err("Failed to create tunnelling WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_wq;
+ }
+
+ snprintf(name, sizeof name, "mlx4_ibud%d", port);
+ ctx->ud_wq = create_singlethread_workqueue(name);
+ if (!ctx->ud_wq) {
+ pr_err("Failed to create up/down WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_udwq;
+ }
+
+ return 0;
+
+err_udwq:
+ destroy_workqueue(ctx->wq);
+ ctx->wq = NULL;
+
+err_wq:
+ mlx4_ib_mcg_port_cleanup(ctx, 1);
+err_mcg:
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++)
+ free_pv_object(dev, i, port);
+ kfree(ctx->tun);
+ ctx->tun = NULL;
+ return ret;
+}
+
+static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
+{
+ if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) {
+ sqp_ctx->state = DEMUX_PV_STATE_DOWNING;
+ flush_workqueue(sqp_ctx->wq);
+ if (sqp_ctx->has_smi) {
+ ib_destroy_qp(sqp_ctx->qp[0].qp);
+ sqp_ctx->qp[0].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0);
+ }
+ ib_destroy_qp(sqp_ctx->qp[1].qp);
+ sqp_ctx->qp[1].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
+ ib_dereg_mr(sqp_ctx->mr);
+ sqp_ctx->mr = NULL;
+ ib_dealloc_pd(sqp_ctx->pd);
+ sqp_ctx->pd = NULL;
+ ib_destroy_cq(sqp_ctx->cq);
+ sqp_ctx->cq = NULL;
+ sqp_ctx->state = DEMUX_PV_STATE_DOWN;
+ }
+}
+
+static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
+{
+ int i;
+ if (ctx) {
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ mlx4_ib_mcg_port_cleanup(ctx, 1);
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (!ctx->tun[i])
+ continue;
+ if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN)
+ ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
+ }
+ flush_workqueue(ctx->wq);
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
+ free_pv_object(dev, i, ctx->port);
+ }
+ kfree(ctx->tun);
+ destroy_workqueue(ctx->ud_wq);
+ destroy_workqueue(ctx->wq);
+ }
+}
+
+static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init)
+{
+ int i;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+ /* initialize or tear down tunnel QPs for the master */
+ for (i = 0; i < dev->dev->caps.num_ports; i++)
+ mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init);
+ return;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
+{
+ int i = 0;
+ int err;
+
+ if (!mlx4_is_mfunc(dev->dev))
+ return 0;
+
+ dev->sriov.is_going_down = 0;
+ spin_lock_init(&dev->sriov.going_down_lock);
+ mlx4_ib_cm_paravirt_init(dev);
+
+ mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n");
+
+ if (mlx4_is_slave(dev->dev)) {
+ mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n");
+ return 0;
+ }
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (i == mlx4_master_func_num(dev->dev))
+ mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid);
+ else
+ mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid());
+ }
+
+ err = mlx4_ib_init_alias_guid_service(dev);
+ if (err) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
+ goto paravirt_err;
+ }
+ err = mlx4_ib_device_register_sysfs(dev);
+ if (err) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+ goto sysfs_err;
+ }
+
+ mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n",
+ dev->dev->caps.sqp_demux);
+ for (i = 0; i < dev->num_ports; i++) {
+ union ib_gid gid;
+ err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1);
+ if (err)
+ goto demux_err;
+ dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
+ err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
+ &dev->sriov.sqps[i]);
+ if (err)
+ goto demux_err;
+ err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
+ if (err)
+ goto demux_err;
+ }
+ mlx4_ib_master_tunnels(dev, 1);
+ return 0;
+
+demux_err:
+ while (i > 0) {
+ free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+ mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+ --i;
+ }
+ mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
+ mlx4_ib_destroy_alias_guid_service(dev);
+
+paravirt_err:
+ mlx4_ib_cm_paravirt_clean(dev, -1);
+
+ return err;
+}
+
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
+{
+ int i;
+ unsigned long flags;
+
+ if (!mlx4_is_mfunc(dev->dev))
+ return;
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ dev->sriov.is_going_down = 1;
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+ if (mlx4_is_master(dev->dev)) {
+ for (i = 0; i < dev->num_ports; i++) {
+ flush_workqueue(dev->sriov.demux[i].ud_wq);
+ mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]);
+ kfree(dev->sriov.sqps[i]);
+ dev->sriov.sqps[i] = NULL;
+ mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+ }
+
+ mlx4_ib_cm_paravirt_clean(dev, -1);
+ mlx4_ib_destroy_alias_guid_service(dev);
+ mlx4_ib_device_unregister_sysfs(dev);
+ }
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index cc05579ebce7..718ec6b2bad2 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -59,6 +59,10 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);
+int mlx4_ib_sm_guid_assign = 1;
+module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
+MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)");
+
static const char mlx4_ib_version[] =
DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
DRV_VERSION " (" DRV_RELDATE ")\n";
@@ -70,6 +74,8 @@ struct update_gid_work {
int port;
};
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+
static struct workqueue_struct *wq;
static void init_query_mad(struct ib_smp *mad)
@@ -98,7 +104,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
init_query_mad(in_mad);
in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS,
+ 1, NULL, NULL, in_mad, out_mad);
if (err)
goto out;
@@ -133,7 +140,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
0xffffff;
- props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30));
+ props->vendor_part_id = dev->dev->pdev->device;
props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
@@ -182,11 +189,12 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
}
static int ib_link_query_port(struct ib_device *ibdev, u8 port,
- struct ib_port_attr *props)
+ struct ib_port_attr *props, int netw_view)
{
struct ib_smp *in_mad = NULL;
struct ib_smp *out_mad = NULL;
int ext_active_speed;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
int err = -ENOMEM;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -198,7 +206,10 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port,
in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
in_mad->attr_mod = cpu_to_be32(port);
- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL,
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
in_mad, out_mad);
if (err)
goto out;
@@ -211,7 +222,10 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port,
props->state = out_mad->data[32] & 0xf;
props->phys_state = out_mad->data[33] >> 4;
props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
- props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+ if (netw_view)
+ props->gid_tbl_len = out_mad->data[50];
+ else
+ props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz;
props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port];
props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
@@ -244,7 +258,7 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port,
in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO;
in_mad->attr_mod = cpu_to_be32(port);
- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port,
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port,
NULL, NULL, in_mad, out_mad);
if (err)
goto out;
@@ -270,7 +284,7 @@ static u8 state_to_phys_state(enum ib_port_state state)
}
static int eth_link_query_port(struct ib_device *ibdev, u8 port,
- struct ib_port_attr *props)
+ struct ib_port_attr *props, int netw_view)
{
struct mlx4_ib_dev *mdev = to_mdev(ibdev);
@@ -320,26 +334,36 @@ out:
return err;
}
-static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
- struct ib_port_attr *props)
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props, int netw_view)
{
int err;
memset(props, 0, sizeof *props);
err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
- ib_link_query_port(ibdev, port, props) :
- eth_link_query_port(ibdev, port, props);
+ ib_link_query_port(ibdev, port, props, netw_view) :
+ eth_link_query_port(ibdev, port, props, netw_view);
return err;
}
-static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
- union ib_gid *gid)
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ /* returns host view */
+ return __mlx4_ib_query_port(ibdev, port, props, 0);
+}
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid, int netw_view)
{
struct ib_smp *in_mad = NULL;
struct ib_smp *out_mad = NULL;
int err = -ENOMEM;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int clear = 0;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
@@ -350,23 +374,38 @@ static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
in_mad->attr_mod = cpu_to_be32(port);
- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (mlx4_is_mfunc(dev->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad);
if (err)
goto out;
memcpy(gid->raw, out_mad->data + 8, 8);
+ if (mlx4_is_mfunc(dev->dev) && !netw_view) {
+ if (index) {
+ /* For any index > 0, return the null guid */
+ err = 0;
+ clear = 1;
+ goto out;
+ }
+ }
+
init_query_mad(in_mad);
in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
in_mad->attr_mod = cpu_to_be32(index / 8);
- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, port,
+ NULL, NULL, in_mad, out_mad);
if (err)
goto out;
memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
out:
+ if (clear)
+ memset(gid->raw + 8, 0, 8);
kfree(in_mad);
kfree(out_mad);
return err;
@@ -386,16 +425,17 @@ static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
union ib_gid *gid)
{
if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
- return __mlx4_ib_query_gid(ibdev, port, index, gid);
+ return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
else
return iboe_query_gid(ibdev, port, index, gid);
}
-static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
- u16 *pkey)
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey, int netw_view)
{
struct ib_smp *in_mad = NULL;
struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
int err = -ENOMEM;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -407,7 +447,11 @@ static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
in_mad->attr_mod = cpu_to_be32(index / 32);
- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
if (err)
goto out;
@@ -419,6 +463,11 @@ out:
return err;
}
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+ return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0);
+}
+
static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
struct ib_device_modify *props)
{
@@ -431,6 +480,9 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
return 0;
+ if (mlx4_is_slave(to_mdev(ibdev)->dev))
+ return -EOPNOTSUPP;
+
spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags);
memcpy(ibdev->node_desc, props->node_desc, 64);
spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags);
@@ -446,7 +498,7 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
memset(mailbox->buf, 0, 256);
memcpy(mailbox->buf, props->node_desc, 64);
mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
- MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+ MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
@@ -849,6 +901,7 @@ static int init_node_data(struct mlx4_ib_dev *dev)
{
struct ib_smp *in_mad = NULL;
struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
int err = -ENOMEM;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -858,8 +911,10 @@ static int init_node_data(struct mlx4_ib_dev *dev)
init_query_mad(in_mad);
in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+ if (mlx4_is_master(dev->dev))
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
- err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
if (err)
goto out;
@@ -867,10 +922,11 @@ static int init_node_data(struct mlx4_ib_dev *dev)
in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
- err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
if (err)
goto out;
+ dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
out:
@@ -959,7 +1015,7 @@ static void update_gids_task(struct work_struct *work)
err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
- MLX4_CMD_NATIVE);
+ MLX4_CMD_WRAPPED);
if (err)
pr_warn("set port command failed\n");
else {
@@ -1121,6 +1177,38 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event
return NOTIFY_DONE;
}
+static void init_pkeys(struct mlx4_ib_dev *ibdev)
+{
+ int port;
+ int slave;
+ int i;
+
+ if (mlx4_is_master(ibdev->dev)) {
+ for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) {
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+ for (i = 0;
+ i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+ ++i) {
+ ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] =
+ /* master has the identity virt2phys pkey mapping */
+ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i :
+ ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+ mlx4_sync_pkey_table(ibdev->dev, slave, port, i,
+ ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]);
+ }
+ }
+ }
+ /* initialize pkey cache */
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+ for (i = 0;
+ i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+ ++i)
+ ibdev->pkeys.phys_pkey_cache[port-1][i] =
+ (i) ? 0 : 0xFFFF;
+ }
+ }
+}
+
static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
{
char name[32];
@@ -1207,11 +1295,15 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
pr_info_once("%s", mlx4_ib_version);
- if (mlx4_is_mfunc(dev)) {
- pr_warn("IB not yet supported in SRIOV\n");
+ mlx4_foreach_non_ib_transport_port(i, dev)
+ num_ports++;
+
+ if (mlx4_is_mfunc(dev) && num_ports) {
+ dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as yet\n");
return NULL;
}
+ num_ports = 0;
mlx4_foreach_ib_transport_port(i, dev)
num_ports++;
@@ -1318,10 +1410,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
- ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
- ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr;
- ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
- ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
+ if (!mlx4_is_slave(ibdev->dev)) {
+ ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
+ ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr;
+ ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
+ ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
+ }
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
@@ -1357,11 +1451,14 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (mlx4_ib_mad_init(ibdev))
goto err_reg;
+ if (mlx4_ib_init_sriov(ibdev))
+ goto err_mad;
+
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) {
iboe->nb.notifier_call = mlx4_ib_netdev_event;
err = register_netdevice_notifier(&iboe->nb);
if (err)
- goto err_reg;
+ goto err_sriov;
}
for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
@@ -1372,6 +1469,18 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_active = true;
+ if (mlx4_is_mfunc(ibdev->dev))
+ init_pkeys(ibdev);
+
+ /* create paravirt contexts for any VFs which are active */
+ if (mlx4_is_master(ibdev->dev)) {
+ for (j = 0; j < MLX4_MFUNC_MAX; j++) {
+ if (j == mlx4_master_func_num(ibdev->dev))
+ continue;
+ if (mlx4_is_slave_active(ibdev->dev, j))
+ do_slave_init(ibdev, j, 1);
+ }
+ }
return ibdev;
err_notif:
@@ -1379,6 +1488,12 @@ err_notif:
pr_warn("failure unregistering notifier\n");
flush_workqueue(wq);
+err_sriov:
+ mlx4_ib_close_sriov(ibdev);
+
+err_mad:
+ mlx4_ib_mad_cleanup(ibdev);
+
err_reg:
ib_unregister_device(&ibdev->ib_dev);
@@ -1407,6 +1522,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
struct mlx4_ib_dev *ibdev = ibdev_ptr;
int p;
+ mlx4_ib_close_sriov(ibdev);
mlx4_ib_mad_cleanup(ibdev);
ib_unregister_device(&ibdev->ib_dev);
if (ibdev->iboe.nb.notifier_call) {
@@ -1428,6 +1544,51 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
ib_dealloc_device(&ibdev->ib_dev);
}
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)
+{
+ struct mlx4_ib_demux_work **dm = NULL;
+ struct mlx4_dev *dev = ibdev->dev;
+ int i;
+ unsigned long flags;
+
+ if (!mlx4_is_master(dev))
+ return;
+
+ dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC);
+ if (!dm) {
+ pr_err("failed to allocate memory for tunneling qp update\n");
+ goto out;
+ }
+
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC);
+ if (!dm[i]) {
+ pr_err("failed to allocate memory for tunneling qp update work struct\n");
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (dm[i])
+ kfree(dm[i]);
+ }
+ goto out;
+ }
+ }
+ /* initialize or tear down tunnel QPs for the slave */
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work);
+ dm[i]->port = i + 1;
+ dm[i]->slave = slave;
+ dm[i]->do_init = do_init;
+ dm[i]->dev = ibdev;
+ spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags);
+ if (!ibdev->sriov.is_going_down)
+ queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work);
+ spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
+ }
+out:
+ if (dm)
+ kfree(dm);
+ return;
+}
+
static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
enum mlx4_dev_event event, unsigned long param)
{
@@ -1435,22 +1596,28 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
struct mlx4_eqe *eqe = NULL;
struct ib_event_work *ew;
- int port = 0;
+ int p = 0;
if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
eqe = (struct mlx4_eqe *)param;
else
- port = (u8)param;
-
- if (port > ibdev->num_ports)
- return;
+ p = (int) param;
switch (event) {
case MLX4_DEV_EVENT_PORT_UP:
+ if (p > ibdev->num_ports)
+ return;
+ if (mlx4_is_master(dev) &&
+ rdma_port_get_link_layer(&ibdev->ib_dev, p) ==
+ IB_LINK_LAYER_INFINIBAND) {
+ mlx4_ib_invalidate_all_guid_record(ibdev, p);
+ }
ibev.event = IB_EVENT_PORT_ACTIVE;
break;
case MLX4_DEV_EVENT_PORT_DOWN:
+ if (p > ibdev->num_ports)
+ return;
ibev.event = IB_EVENT_PORT_ERR;
break;
@@ -1469,7 +1636,21 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
INIT_WORK(&ew->work, handle_port_mgmt_change_event);
memcpy(&ew->ib_eqe, eqe, sizeof *eqe);
ew->ib_dev = ibdev;
- handle_port_mgmt_change_event(&ew->work);
+ /* need to queue only for port owner, which uses GEN_EQE */
+ if (mlx4_is_master(dev))
+ queue_work(wq, &ew->work);
+ else
+ handle_port_mgmt_change_event(&ew->work);
+ return;
+
+ case MLX4_DEV_EVENT_SLAVE_INIT:
+ /* here, p is the slave id */
+ do_slave_init(ibdev, p, 1);
+ return;
+
+ case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+ /* here, p is the slave id */
+ do_slave_init(ibdev, p, 0);
return;
default:
@@ -1477,7 +1658,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
}
ibev.device = ibdev_ptr;
- ibev.element.port_num = port;
+ ibev.element.port_num = (u8) p;
ib_dispatch_event(&ibev);
}
@@ -1497,18 +1678,28 @@ static int __init mlx4_ib_init(void)
if (!wq)
return -ENOMEM;
+ err = mlx4_ib_mcg_init();
+ if (err)
+ goto clean_wq;
+
err = mlx4_register_interface(&mlx4_ib_interface);
- if (err) {
- destroy_workqueue(wq);
- return err;
- }
+ if (err)
+ goto clean_mcg;
return 0;
+
+clean_mcg:
+ mlx4_ib_mcg_destroy();
+
+clean_wq:
+ destroy_workqueue(wq);
+ return err;
}
static void __exit mlx4_ib_cleanup(void)
{
mlx4_unregister_interface(&mlx4_ib_interface);
+ mlx4_ib_mcg_destroy();
destroy_workqueue(wq);
}
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
new file mode 100644
index 000000000000..3c3b54c3fdd9
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -0,0 +1,1254 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+
+#include "mlx4_ib.h"
+
+#define MAX_VFS 80
+#define MAX_PEND_REQS_PER_FUNC 4
+#define MAD_TIMEOUT_MS 2000
+
+#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg)
+#define mcg_error(fmt, arg...) pr_err(fmt, ##arg)
+#define mcg_warn_group(group, format, arg...) \
+ pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+ (group)->name, group->demux->port, ## arg)
+
+#define mcg_error_group(group, format, arg...) \
+ pr_err(" %16s: " format, (group)->name, ## arg)
+
+
+static union ib_gid mgid0;
+
+static struct workqueue_struct *clean_wq;
+
+enum mcast_state {
+ MCAST_NOT_MEMBER = 0,
+ MCAST_MEMBER,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_JOIN_SENT,
+ MCAST_LEAVE_SENT,
+ MCAST_RESP_READY
+};
+
+struct mcast_member {
+ enum mcast_state state;
+ uint8_t join_state;
+ int num_pend_reqs;
+ struct list_head pending;
+};
+
+struct ib_sa_mcmember_data {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ __be32 qkey;
+ __be16 mlid;
+ u8 mtusel_mtu;
+ u8 tclass;
+ __be16 pkey;
+ u8 ratesel_rate;
+ u8 lifetmsel_lifetm;
+ __be32 sl_flowlabel_hoplimit;
+ u8 scope_join_state;
+ u8 proxy_join;
+ u8 reserved[2];
+};
+
+struct mcast_group {
+ struct ib_sa_mcmember_data rec;
+ struct rb_node node;
+ struct list_head mgid0_list;
+ struct mlx4_ib_demux_ctx *demux;
+ struct mcast_member func[MAX_VFS];
+ struct mutex lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ int members[3];
+ enum mcast_group_state state;
+ enum mcast_group_state prev_state;
+ struct ib_sa_mad response_sa_mad;
+ __be64 last_req_tid;
+
+ char name[33]; /* MGID string */
+ struct device_attribute dentry;
+
+ /* refcount is the reference count for the following:
+ 1. Each queued request
+ 2. Each invocation of the worker thread
+ 3. Membership of the port at the SA
+ */
+ atomic_t refcount;
+
+ /* delayed work to clean pending SM request */
+ struct delayed_work timeout_work;
+ struct list_head cleanup_list;
+};
+
+struct mcast_req {
+ int func;
+ struct ib_sa_mad sa_mad;
+ struct list_head group_list;
+ struct list_head func_list;
+ struct mcast_group *group;
+ int clean;
+};
+
+
+#define safe_atomic_dec(ref) \
+ do {\
+ if (atomic_dec_and_test(ref)) \
+ mcg_warn_group(group, "did not expect to reach zero\n"); \
+ } while (0)
+
+static const char *get_state_string(enum mcast_group_state state)
+{
+ switch (state) {
+ case MCAST_IDLE:
+ return "MCAST_IDLE";
+ case MCAST_JOIN_SENT:
+ return "MCAST_JOIN_SENT";
+ case MCAST_LEAVE_SENT:
+ return "MCAST_LEAVE_SENT";
+ case MCAST_RESP_READY:
+ return "MCAST_RESP_READY";
+ }
+ return "Invalid State";
+}
+
+static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = ctx->mcg_table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
+ struct mcast_group *group)
+{
+ struct rb_node **link = &ctx->mcg_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &ctx->mcg_table);
+ return NULL;
+}
+
+static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = ctx->dev;
+ struct ib_ah_attr ah_attr;
+
+ spin_lock(&dev->sm_lock);
+ if (!dev->sm_ah[ctx->port - 1]) {
+ /* port is not yet Active, sm_ah not ready */
+ spin_unlock(&dev->sm_lock);
+ return -EAGAIN;
+ }
+ mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+ spin_unlock(&dev->sm_lock);
+ return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port,
+ IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad);
+}
+
+static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = ctx->dev;
+ struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
+ struct ib_wc wc;
+ struct ib_ah_attr ah_attr;
+
+ /* Our agent might not yet be registered when mads start to arrive */
+ if (!agent)
+ return -EAGAIN;
+
+ ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+
+ wc.pkey_index = 0;
+ wc.sl = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = ctx->port;
+ wc.slid = ah_attr.dlid; /* opensm lid */
+ wc.src_qp = 1;
+ return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
+}
+
+static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
+ int ret;
+
+ /* we rely on a mad request as arrived from a VF */
+ memcpy(&mad, sa_mad, sizeof mad);
+
+ /* fix port GID to be the real one (slave 0) */
+ sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
+
+ /* assign our own TID */
+ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+ group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+
+ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+ /* set timeout handler */
+ if (!ret) {
+ /* calls mlx4_ib_mcg_timeout_handler */
+ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+ msecs_to_jiffies(MAD_TIMEOUT_MS));
+ }
+
+ return ret;
+}
+
+static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+ int ret;
+
+ memset(&mad, 0, sizeof mad);
+ mad.mad_hdr.base_version = 1;
+ mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad.mad_hdr.class_version = 2;
+ mad.mad_hdr.method = IB_SA_METHOD_DELETE;
+ mad.mad_hdr.status = cpu_to_be16(0);
+ mad.mad_hdr.class_specific = cpu_to_be16(0);
+ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+ group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad.mad_hdr.attr_mod = cpu_to_be32(0);
+ mad.sa_hdr.sm_key = 0x0;
+ mad.sa_hdr.attr_offset = cpu_to_be16(7);
+ mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+ *sa_data = group->rec;
+ sa_data->scope_join_state = join_state;
+
+ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+ if (ret)
+ group->state = MCAST_IDLE;
+
+ /* set timeout handler */
+ if (!ret) {
+ /* calls mlx4_ib_mcg_timeout_handler */
+ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+ msecs_to_jiffies(MAD_TIMEOUT_MS));
+ }
+
+ return ret;
+}
+
+static int send_reply_to_slave(int slave, struct mcast_group *group,
+ struct ib_sa_mad *req_sa_mad, u16 status)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+ struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
+ int ret;
+
+ memset(&mad, 0, sizeof mad);
+ mad.mad_hdr.base_version = 1;
+ mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad.mad_hdr.class_version = 2;
+ mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+ mad.mad_hdr.status = cpu_to_be16(status);
+ mad.mad_hdr.class_specific = cpu_to_be16(0);
+ mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
+ *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */
+ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad.mad_hdr.attr_mod = cpu_to_be32(0);
+ mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
+ mad.sa_hdr.attr_offset = cpu_to_be16(7);
+ mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
+
+ *sa_data = group->rec;
+
+ /* reconstruct VF's requested join_state and port_gid */
+ sa_data->scope_join_state &= 0xf0;
+ sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
+ memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
+
+ ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
+ return ret;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 src_value, u8 dst_value)
+{
+ int err;
+ u8 selector = dst_value >> 6;
+ dst_value &= 0x3f;
+ src_value &= 0x3f;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static u16 cmp_rec(struct ib_sa_mcmember_data *src,
+ struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
+{
+ /* src is group record, dst is request record */
+ /* MGID must already match */
+ /* Port_GID we always replace to our Port_GID, so it is a match */
+
+#define MAD_STATUS_REQ_INVALID 0x0200
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU,
+ src->mtusel_mtu, dst->mtusel_mtu))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->tclass != dst->tclass)
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE,
+ src->ratesel_rate, dst->ratesel_rate))
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
+ (src->scope_join_state & 0xf0) !=
+ (dst->scope_join_state & 0xf0))
+ return MAD_STATUS_REQ_INVALID;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+/* release group, return 1 if this was last release and group is destroyed
+ * timout work is canceled sync */
+static int release_group(struct mcast_group *group, int from_timeout_handler)
+{
+ struct mlx4_ib_demux_ctx *ctx = group->demux;
+ int nzgroup;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ mutex_lock(&group->lock);
+ if (atomic_dec_and_test(&group->refcount)) {
+ if (!from_timeout_handler) {
+ if (group->state != MCAST_IDLE &&
+ !cancel_delayed_work(&group->timeout_work)) {
+ atomic_inc(&group->refcount);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ return 0;
+ }
+ }
+
+ nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
+ if (nzgroup)
+ del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+ if (!list_empty(&group->pending_list))
+ mcg_warn_group(group, "releasing a group with non empty pending list\n");
+ if (nzgroup)
+ rb_erase(&group->node, &ctx->mcg_table);
+ list_del_init(&group->mgid0_list);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ kfree(group);
+ return 1;
+ } else {
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ }
+ return 0;
+}
+
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < 3; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < 3; i++)
+ if (!group->members[i])
+ leave_state |= (1 << i);
+
+ return leave_state & (group->rec.scope_join_state & 7);
+}
+
+static int join_group(struct mcast_group *group, int slave, u8 join_mask)
+{
+ int ret = 0;
+ u8 join_state;
+
+ /* remove bits that slave is already member of, and adjust */
+ join_state = join_mask & (~group->func[slave].join_state);
+ adjust_membership(group, join_state, 1);
+ group->func[slave].join_state |= join_state;
+ if (group->func[slave].state != MCAST_MEMBER && join_state) {
+ group->func[slave].state = MCAST_MEMBER;
+ ret = 1;
+ }
+ return ret;
+}
+
+static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
+{
+ int ret = 0;
+
+ adjust_membership(group, leave_state, -1);
+ group->func[slave].join_state &= ~leave_state;
+ if (!group->func[slave].join_state) {
+ group->func[slave].state = MCAST_NOT_MEMBER;
+ ret = 1;
+ }
+ return ret;
+}
+
+static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
+{
+ if (group->func[slave].state != MCAST_MEMBER)
+ return MAD_STATUS_REQ_INVALID;
+
+ /* make sure we're not deleting unset bits */
+ if (~group->func[slave].join_state & leave_mask)
+ return MAD_STATUS_REQ_INVALID;
+
+ if (!leave_mask)
+ return MAD_STATUS_REQ_INVALID;
+
+ return 0;
+}
+
+static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct mcast_group *group;
+ struct mcast_req *req = NULL;
+
+ group = container_of(delay, typeof(*group), timeout_work);
+
+ mutex_lock(&group->lock);
+ if (group->state == MCAST_JOIN_SENT) {
+ if (!list_empty(&group->pending_list)) {
+ req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ --group->func[req->func].num_pend_reqs;
+ mutex_unlock(&group->lock);
+ kfree(req);
+ if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
+ if (release_group(group, 1))
+ return;
+ } else {
+ kfree(group);
+ return;
+ }
+ mutex_lock(&group->lock);
+ } else
+ mcg_warn_group(group, "DRIVER BUG\n");
+ } else if (group->state == MCAST_LEAVE_SENT) {
+ if (group->rec.scope_join_state & 7)
+ group->rec.scope_join_state &= 0xf8;
+ group->state = MCAST_IDLE;
+ mutex_unlock(&group->lock);
+ if (release_group(group, 1))
+ return;
+ mutex_lock(&group->lock);
+ } else
+ mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
+ group->state = MCAST_IDLE;
+ atomic_inc(&group->refcount);
+ if (!queue_work(group->demux->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+
+ mutex_unlock(&group->lock);
+}
+
+static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
+ struct mcast_req *req)
+{
+ u16 status;
+
+ if (req->clean)
+ leave_mask = group->func[req->func].join_state;
+
+ status = check_leave(group, req->func, leave_mask);
+ if (!status)
+ leave_group(group, req->func, leave_mask);
+
+ if (!req->clean)
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ return 1;
+}
+
+static int handle_join_req(struct mcast_group *group, u8 join_mask,
+ struct mcast_req *req)
+{
+ u8 group_join_state = group->rec.scope_join_state & 7;
+ int ref = 0;
+ u16 status;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+
+ if (join_mask == (group_join_state & join_mask)) {
+ /* port's membership need not change */
+ status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
+ if (!status)
+ join_group(group, req->func, join_mask);
+
+ --group->func[req->func].num_pend_reqs;
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ++ref;
+ } else {
+ /* port's membership needs to be updated */
+ group->prev_state = group->state;
+ if (send_join_to_wire(group, &req->sa_mad)) {
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ref = 1;
+ group->state = group->prev_state;
+ } else
+ group->state = MCAST_JOIN_SENT;
+ }
+
+ return ref;
+}
+
+static void mlx4_ib_mcg_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_req *req = NULL;
+ struct ib_sa_mcmember_data *sa_data;
+ u8 req_join_state;
+ int rc = 1; /* release_count - this is for the scheduled work */
+ u16 status;
+ u8 method;
+
+ group = container_of(work, typeof(*group), work);
+
+ mutex_lock(&group->lock);
+
+ /* First, let's see if a response from SM is waiting regarding this group.
+ * If so, we need to update the group's REC. If this is a bad response, we
+ * may need to send a bad response to a VF waiting for it. If VF is waiting
+ * and this is a good response, the VF will be answered later in this func. */
+ if (group->state == MCAST_RESP_READY) {
+ /* cancels mlx4_ib_mcg_timeout_handler */
+ cancel_delayed_work(&group->timeout_work);
+ status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
+ method = group->response_sa_mad.mad_hdr.method;
+ if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
+ mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
+ be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
+ be64_to_cpu(group->last_req_tid));
+ group->state = group->prev_state;
+ goto process_requests;
+ }
+ if (status) {
+ if (!list_empty(&group->pending_list))
+ req = list_first_entry(&group->pending_list,
+ struct mcast_req, group_list);
+ if ((method == IB_MGMT_METHOD_GET_RESP)) {
+ if (req) {
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ++rc;
+ } else
+ mcg_warn_group(group, "no request for failed join\n");
+ } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
+ ++rc;
+ } else {
+ u8 resp_join_state;
+ u8 cur_join_state;
+
+ resp_join_state = ((struct ib_sa_mcmember_data *)
+ group->response_sa_mad.data)->scope_join_state & 7;
+ cur_join_state = group->rec.scope_join_state & 7;
+
+ if (method == IB_MGMT_METHOD_GET_RESP) {
+ /* successfull join */
+ if (!cur_join_state && resp_join_state)
+ --rc;
+ } else if (!resp_join_state)
+ ++rc;
+ memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
+ }
+ group->state = MCAST_IDLE;
+ }
+
+process_requests:
+ /* We should now go over pending join/leave requests, as long as we are idle. */
+ while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
+ req = list_first_entry(&group->pending_list, struct mcast_req,
+ group_list);
+ sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+ req_join_state = sa_data->scope_join_state & 0x7;
+
+ /* For a leave request, we will immediately answer the VF, and
+ * update our internal counters. The actual leave will be sent
+ * to SM later, if at all needed. We dequeue the request now. */
+ if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
+ rc += handle_leave_req(group, req_join_state, req);
+ else
+ rc += handle_join_req(group, req_join_state, req);
+ }
+
+ /* Handle leaves */
+ if (group->state == MCAST_IDLE) {
+ req_join_state = get_leave_state(group);
+ if (req_join_state) {
+ group->rec.scope_join_state &= ~req_join_state;
+ group->prev_state = group->state;
+ if (send_leave_to_wire(group, req_join_state)) {
+ group->state = group->prev_state;
+ ++rc;
+ } else
+ group->state = MCAST_LEAVE_SENT;
+ }
+ }
+
+ if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
+ goto process_requests;
+ mutex_unlock(&group->lock);
+
+ while (rc--)
+ release_group(group, 0);
+}
+
+static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
+ __be64 tid,
+ union ib_gid *new_mgid)
+{
+ struct mcast_group *group = NULL, *cur_group;
+ struct mcast_req *req;
+ struct list_head *pos;
+ struct list_head *n;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
+ group = list_entry(pos, struct mcast_group, mgid0_list);
+ mutex_lock(&group->lock);
+ if (group->last_req_tid == tid) {
+ if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
+ group->rec.mgid = *new_mgid;
+ sprintf(group->name, "%016llx%016llx",
+ be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ be64_to_cpu(group->rec.mgid.global.interface_id));
+ list_del_init(&group->mgid0_list);
+ cur_group = mcast_insert(ctx, group);
+ if (cur_group) {
+ /* A race between our code and SM. Silently cleaning the new one */
+ req = list_first_entry(&group->pending_list,
+ struct mcast_req, group_list);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ release_group(group, 0);
+ return NULL;
+ }
+
+ atomic_inc(&group->refcount);
+ add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ return group;
+ } else {
+ struct mcast_req *tmp1, *tmp2;
+
+ list_del(&group->mgid0_list);
+ if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
+ cancel_delayed_work_sync(&group->timeout_work);
+
+ list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
+ list_del(&tmp1->group_list);
+ kfree(tmp1);
+ }
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ kfree(group);
+ return NULL;
+ }
+ }
+ mutex_unlock(&group->lock);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+
+ return NULL;
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
+ union ib_gid *mgid, int create,
+ gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ int is_mgid0;
+ int i;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ group = mcast_find(ctx, mgid);
+ if (group)
+ goto found;
+ }
+
+ if (!create)
+ return ERR_PTR(-ENOENT);
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ group->demux = ctx;
+ group->rec.mgid = *mgid;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->mgid0_list);
+ for (i = 0; i < MAX_VFS; ++i)
+ INIT_LIST_HEAD(&group->func[i].pending);
+ INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
+ INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
+ mutex_init(&group->lock);
+ sprintf(group->name, "%016llx%016llx",
+ be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ be64_to_cpu(group->rec.mgid.global.interface_id));
+ sysfs_attr_init(&group->dentry.attr);
+ group->dentry.show = sysfs_show_group;
+ group->dentry.store = NULL;
+ group->dentry.attr.name = group->name;
+ group->dentry.attr.mode = 0400;
+ group->state = MCAST_IDLE;
+
+ if (is_mgid0) {
+ list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
+ goto found;
+ }
+
+ cur_group = mcast_insert(ctx, group);
+ if (cur_group) {
+ mcg_warn("group just showed up %s - confused\n", cur_group->name);
+ kfree(group);
+ return ERR_PTR(-EINVAL);
+ }
+
+ add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+
+found:
+ atomic_inc(&group->refcount);
+ return group;
+}
+
+static void queue_req(struct mcast_req *req)
+{
+ struct mcast_group *group = req->group;
+
+ atomic_inc(&group->refcount); /* for the request */
+ atomic_inc(&group->refcount); /* for scheduling the work */
+ list_add_tail(&req->group_list, &group->pending_list);
+ list_add_tail(&req->func_list, &group->func[req->func].pending);
+ /* calls mlx4_ib_mcg_work_handler */
+ if (!queue_work(group->demux->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+}
+
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
+ struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+ struct mcast_group *group;
+
+ switch (mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_SA_METHOD_DELETE_RESP:
+ mutex_lock(&ctx->mcg_table_lock);
+ group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL);
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (IS_ERR(group)) {
+ if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
+ __be64 tid = mad->mad_hdr.tid;
+ *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */
+ group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
+ } else
+ group = NULL;
+ }
+
+ if (!group)
+ return 1;
+
+ mutex_lock(&group->lock);
+ group->response_sa_mad = *mad;
+ group->prev_state = group->state;
+ group->state = MCAST_RESP_READY;
+ /* calls mlx4_ib_mcg_work_handler */
+ atomic_inc(&group->refcount);
+ if (!queue_work(ctx->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+ mutex_unlock(&group->lock);
+ release_group(group, 0);
+ return 1; /* consumed */
+ case IB_MGMT_METHOD_SET:
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_DELETE:
+ return 0; /* not consumed, pass-through to guest over tunnel */
+ default:
+ mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
+ port, mad->mad_hdr.method);
+ return 1; /* consumed */
+ }
+}
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
+ int slave, struct ib_sa_mad *sa_mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
+ struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+ struct mcast_group *group;
+ struct mcast_req *req;
+ int may_create = 0;
+
+ if (ctx->flushing)
+ return -EAGAIN;
+
+ switch (sa_mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_SET:
+ may_create = 1;
+ case IB_SA_METHOD_DELETE:
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->func = slave;
+ req->sa_mad = *sa_mad;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL);
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (IS_ERR(group)) {
+ kfree(req);
+ return PTR_ERR(group);
+ }
+ mutex_lock(&group->lock);
+ if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
+ mutex_unlock(&group->lock);
+ mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+ port, slave, MAX_PEND_REQS_PER_FUNC);
+ release_group(group, 0);
+ kfree(req);
+ return -ENOMEM;
+ }
+ ++group->func[slave].num_pend_reqs;
+ req->group = group;
+ queue_req(req);
+ mutex_unlock(&group->lock);
+ release_group(group, 0);
+ return 1; /* consumed */
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_DELETE_RESP:
+ return 0; /* not consumed, pass-through */
+ default:
+ mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
+ port, slave, sa_mad->mad_hdr.method);
+ return 1; /* consumed */
+ }
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mcast_group *group =
+ container_of(attr, struct mcast_group, dentry);
+ struct mcast_req *req = NULL;
+ char pending_str[40];
+ char state_str[40];
+ ssize_t len = 0;
+ int f;
+
+ if (group->state == MCAST_IDLE)
+ sprintf(state_str, "%s", get_state_string(group->state));
+ else
+ sprintf(state_str, "%s(TID=0x%llx)",
+ get_state_string(group->state),
+ be64_to_cpu(group->last_req_tid));
+ if (list_empty(&group->pending_list)) {
+ sprintf(pending_str, "No");
+ } else {
+ req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+ sprintf(pending_str, "Yes(TID=0x%llx)",
+ be64_to_cpu(req->sa_mad.mad_hdr.tid));
+ }
+ len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ",
+ group->rec.scope_join_state & 0xf,
+ group->members[2], group->members[1], group->members[0],
+ atomic_read(&group->refcount),
+ pending_str,
+ state_str);
+ for (f = 0; f < MAX_VFS; ++f)
+ if (group->func[f].state == MCAST_MEMBER)
+ len += sprintf(buf + len, "%d[%1x] ",
+ f, group->func[f].join_state);
+
+ len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x "
+ "%4x %4x %2x %2x)\n",
+ be16_to_cpu(group->rec.pkey),
+ be32_to_cpu(group->rec.qkey),
+ (group->rec.mtusel_mtu & 0xc0) >> 6,
+ group->rec.mtusel_mtu & 0x3f,
+ group->rec.tclass,
+ (group->rec.ratesel_rate & 0xc0) >> 6,
+ group->rec.ratesel_rate & 0x3f,
+ (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28,
+ (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8,
+ be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff,
+ group->rec.proxy_join);
+
+ return len;
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
+{
+ char name[20];
+
+ atomic_set(&ctx->tid, 0);
+ sprintf(name, "mlx4_ib_mcg%d", ctx->port);
+ ctx->mcg_wq = create_singlethread_workqueue(name);
+ if (!ctx->mcg_wq)
+ return -ENOMEM;
+
+ mutex_init(&ctx->mcg_table_lock);
+ ctx->mcg_table = RB_ROOT;
+ INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
+ ctx->flushing = 0;
+
+ return 0;
+}
+
+static void force_clean_group(struct mcast_group *group)
+{
+ struct mcast_req *req, *tmp
+ ;
+ list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
+ list_del(&req->group_list);
+ kfree(req);
+ }
+ del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr);
+ rb_erase(&group->node, &group->demux->mcg_table);
+ kfree(group);
+}
+
+static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+ int i;
+ struct rb_node *p;
+ struct mcast_group *group;
+ unsigned long end;
+ int count;
+
+ if (ctx->flushing)
+ return;
+
+ ctx->flushing = 1;
+ for (i = 0; i < MAX_VFS; ++i)
+ clean_vf_mcast(ctx, i);
+
+ end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
+ do {
+ count = 0;
+ mutex_lock(&ctx->mcg_table_lock);
+ for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
+ ++count;
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (!count)
+ break;
+
+ msleep(1);
+ } while (time_after(end, jiffies));
+
+ flush_workqueue(ctx->mcg_wq);
+ if (destroy_wq)
+ destroy_workqueue(ctx->mcg_wq);
+
+ mutex_lock(&ctx->mcg_table_lock);
+ while ((p = rb_first(&ctx->mcg_table)) != NULL) {
+ group = rb_entry(p, struct mcast_group, node);
+ if (atomic_read(&group->refcount))
+ mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group);
+
+ force_clean_group(group);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+
+ if (!destroy_wq)
+ ctx->flushing = 0;
+}
+
+struct clean_work {
+ struct work_struct work;
+ struct mlx4_ib_demux_ctx *ctx;
+ int destroy_wq;
+};
+
+static void mcg_clean_task(struct work_struct *work)
+{
+ struct clean_work *cw = container_of(work, struct clean_work, work);
+
+ _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
+ kfree(cw);
+}
+
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+ struct clean_work *work;
+
+ if (destroy_wq) {
+ _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
+ return;
+ }
+
+ work = kmalloc(sizeof *work, GFP_KERNEL);
+ if (!work) {
+ mcg_warn("failed allocating work for cleanup\n");
+ return;
+ }
+
+ work->ctx = ctx;
+ work->destroy_wq = destroy_wq;
+ INIT_WORK(&work->work, mcg_clean_task);
+ queue_work(clean_wq, &work->work);
+}
+
+static void build_leave_mad(struct mcast_req *req)
+{
+ struct ib_sa_mad *mad = &req->sa_mad;
+
+ mad->mad_hdr.method = IB_SA_METHOD_DELETE;
+}
+
+
+static void clear_pending_reqs(struct mcast_group *group, int vf)
+{
+ struct mcast_req *req, *tmp, *group_first = NULL;
+ int clear;
+ int pend = 0;
+
+ if (!list_empty(&group->pending_list))
+ group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+
+ list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
+ clear = 1;
+ if (group_first == req &&
+ (group->state == MCAST_JOIN_SENT ||
+ group->state == MCAST_LEAVE_SENT)) {
+ clear = cancel_delayed_work(&group->timeout_work);
+ pend = !clear;
+ group->state = MCAST_IDLE;
+ }
+ if (clear) {
+ --group->func[vf].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ atomic_dec(&group->refcount);
+ }
+ }
+
+ if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
+ mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
+ list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
+ }
+}
+
+static int push_deleteing_req(struct mcast_group *group, int slave)
+{
+ struct mcast_req *req;
+ struct mcast_req *pend_req;
+
+ if (!group->func[slave].join_state)
+ return 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req) {
+ mcg_warn_group(group, "failed allocation - may leave stall groups\n");
+ return -ENOMEM;
+ }
+
+ if (!list_empty(&group->func[slave].pending)) {
+ pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
+ if (pend_req->clean) {
+ kfree(req);
+ return 0;
+ }
+ }
+
+ req->clean = 1;
+ req->func = slave;
+ req->group = group;
+ ++group->func[slave].num_pend_reqs;
+ build_leave_mad(req);
+ queue_req(req);
+ return 0;
+}
+
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
+{
+ struct mcast_group *group;
+ struct rb_node *p;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
+ group = rb_entry(p, struct mcast_group, node);
+ mutex_lock(&group->lock);
+ if (atomic_read(&group->refcount)) {
+ /* clear pending requests of this VF */
+ clear_pending_reqs(group, slave);
+ push_deleteing_req(group, slave);
+ }
+ mutex_unlock(&group->lock);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+}
+
+
+int mlx4_ib_mcg_init(void)
+{
+ clean_wq = create_singlethread_workqueue("mlx4_ib_mcg");
+ if (!clean_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx4_ib_mcg_destroy(void)
+{
+ destroy_workqueue(clean_wq);
+}
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index c136bb618e29..e04cbc9a54a5 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -37,9 +37,12 @@
#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/mutex.h>
+#include <linux/idr.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
#include <linux/mlx4/device.h>
#include <linux/mlx4/doorbell.h>
@@ -62,6 +65,9 @@ enum {
#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1)
#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+/*module param to indicate if SM assigns the alias_GUID*/
+extern int mlx4_ib_sm_guid_assign;
+
struct mlx4_ib_ucontext {
struct ib_ucontext ibucontext;
struct mlx4_uar uar;
@@ -133,8 +139,10 @@ struct mlx4_ib_wq {
};
enum mlx4_ib_qp_flags {
- MLX4_IB_QP_LSO = 1 << 0,
- MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
+ MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+ MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
+ MLX4_IB_SRIOV_SQP = 1 << 31,
};
struct mlx4_ib_gid_entry {
@@ -144,6 +152,80 @@ struct mlx4_ib_gid_entry {
u8 port;
};
+enum mlx4_ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ MLX4_IB_QPT_SMI = IB_QPT_SMI,
+ MLX4_IB_QPT_GSI = IB_QPT_GSI,
+
+ MLX4_IB_QPT_RC = IB_QPT_RC,
+ MLX4_IB_QPT_UC = IB_QPT_UC,
+ MLX4_IB_QPT_UD = IB_QPT_UD,
+ MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6,
+ MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE,
+ MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET,
+ MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI,
+ MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT,
+
+ MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16,
+ MLX4_IB_QPT_PROXY_SMI = 1 << 17,
+ MLX4_IB_QPT_PROXY_GSI = 1 << 18,
+ MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19,
+ MLX4_IB_QPT_TUN_SMI = 1 << 20,
+ MLX4_IB_QPT_TUN_GSI = 1 << 21,
+};
+
+#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \
+ MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI)
+
+enum mlx4_ib_mad_ifc_flags {
+ MLX4_MAD_IFC_IGNORE_MKEY = 1,
+ MLX4_MAD_IFC_IGNORE_BKEY = 2,
+ MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY |
+ MLX4_MAD_IFC_IGNORE_BKEY),
+ MLX4_MAD_IFC_NET_VIEW = 4,
+};
+
+enum {
+ MLX4_NUM_TUNNEL_BUFS = 256,
+};
+
+struct mlx4_ib_tunnel_header {
+ struct mlx4_av av;
+ __be32 remote_qpn;
+ __be32 qkey;
+ __be16 vlan;
+ u8 mac[6];
+ __be16 pkey_index;
+ u8 reserved[6];
+};
+
+struct mlx4_ib_buf {
+ void *addr;
+ dma_addr_t map;
+};
+
+struct mlx4_rcv_tunnel_hdr {
+ __be32 flags_src_qp; /* flags[6:5] is defined for VLANs:
+ * 0x0 - no vlan was in the packet
+ * 0x01 - C-VLAN was in the packet */
+ u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */
+ u8 reserved;
+ __be16 pkey_index;
+ __be16 sl_vid;
+ __be16 slid_mac_47_32;
+ __be32 mac_31_0;
+};
+
+struct mlx4_ib_proxy_sqp_hdr {
+ struct ib_grh grh;
+ struct mlx4_rcv_tunnel_hdr tun;
+} __packed;
+
struct mlx4_ib_qp {
struct ib_qp ibqp;
struct mlx4_qp mqp;
@@ -159,6 +241,7 @@ struct mlx4_ib_qp {
int sq_spare_wqes;
struct mlx4_ib_wq sq;
+ enum mlx4_ib_qp_type mlx4_ib_qp_type;
struct ib_umem *umem;
struct mlx4_mtt mtt;
int buf_size;
@@ -174,6 +257,8 @@ struct mlx4_ib_qp {
int mlx_type;
struct list_head gid_list;
struct list_head steering_rules;
+ struct mlx4_ib_buf *sqp_proxy_rcv;
+
};
struct mlx4_ib_srq {
@@ -196,6 +281,138 @@ struct mlx4_ib_ah {
union mlx4_ext_av av;
};
+/****************************************/
+/* alias guid support */
+/****************************************/
+#define NUM_PORT_ALIAS_GUID 2
+#define NUM_ALIAS_GUID_IN_REC 8
+#define NUM_ALIAS_GUID_REC_IN_PORT 16
+#define GUID_REC_SIZE 8
+#define NUM_ALIAS_GUID_PER_PORT 128
+#define MLX4_NOT_SET_GUID (0x00LL)
+#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL))
+
+enum mlx4_guid_alias_rec_status {
+ MLX4_GUID_INFO_STATUS_IDLE,
+ MLX4_GUID_INFO_STATUS_SET,
+ MLX4_GUID_INFO_STATUS_PENDING,
+};
+
+enum mlx4_guid_alias_rec_ownership {
+ MLX4_GUID_DRIVER_ASSIGN,
+ MLX4_GUID_SYSADMIN_ASSIGN,
+ MLX4_GUID_NONE_ASSIGN, /*init state of each record*/
+};
+
+enum mlx4_guid_alias_rec_method {
+ MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET,
+ MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE,
+};
+
+struct mlx4_sriov_alias_guid_info_rec_det {
+ u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC];
+ ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/
+ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/
+ u8 method; /*set or delete*/
+ enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/
+};
+
+struct mlx4_sriov_alias_guid_port_rec_det {
+ struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT];
+ struct workqueue_struct *wq;
+ struct delayed_work alias_guid_work;
+ u8 port;
+ struct mlx4_sriov_alias_guid *parent;
+ struct list_head cb_list;
+};
+
+struct mlx4_sriov_alias_guid {
+ struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS];
+ spinlock_t ag_work_lock;
+ struct ib_sa_client *sa_client;
+};
+
+struct mlx4_ib_demux_work {
+ struct work_struct work;
+ struct mlx4_ib_dev *dev;
+ int slave;
+ int do_init;
+ u8 port;
+
+};
+
+struct mlx4_ib_tun_tx_buf {
+ struct mlx4_ib_buf buf;
+ struct ib_ah *ah;
+};
+
+struct mlx4_ib_demux_pv_qp {
+ struct ib_qp *qp;
+ enum ib_qp_type proxy_qpt;
+ struct mlx4_ib_buf *ring;
+ struct mlx4_ib_tun_tx_buf *tx_ring;
+ spinlock_t tx_lock;
+ unsigned tx_ix_head;
+ unsigned tx_ix_tail;
+};
+
+enum mlx4_ib_demux_pv_state {
+ DEMUX_PV_STATE_DOWN,
+ DEMUX_PV_STATE_STARTING,
+ DEMUX_PV_STATE_ACTIVE,
+ DEMUX_PV_STATE_DOWNING,
+};
+
+struct mlx4_ib_demux_pv_ctx {
+ int port;
+ int slave;
+ enum mlx4_ib_demux_pv_state state;
+ int has_smi;
+ struct ib_device *ib_dev;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct work_struct work;
+ struct workqueue_struct *wq;
+ struct mlx4_ib_demux_pv_qp qp[2];
+};
+
+struct mlx4_ib_demux_ctx {
+ struct ib_device *ib_dev;
+ int port;
+ struct workqueue_struct *wq;
+ struct workqueue_struct *ud_wq;
+ spinlock_t ud_lock;
+ __be64 subnet_prefix;
+ __be64 guid_cache[128];
+ struct mlx4_ib_dev *dev;
+ /* the following lock protects both mcg_table and mcg_mgid0_list */
+ struct mutex mcg_table_lock;
+ struct rb_root mcg_table;
+ struct list_head mcg_mgid0_list;
+ struct workqueue_struct *mcg_wq;
+ struct mlx4_ib_demux_pv_ctx **tun;
+ atomic_t tid;
+ int flushing; /* flushing the work queue */
+};
+
+struct mlx4_ib_sriov {
+ struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS];
+ struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS];
+ /* when using this spinlock you should use "irq" because
+ * it may be called from interrupt context.*/
+ spinlock_t going_down_lock;
+ int is_going_down;
+
+ struct mlx4_sriov_alias_guid alias_guid;
+
+ /* CM paravirtualization fields */
+ struct list_head cm_list;
+ spinlock_t id_map_lock;
+ struct rb_root sl_id_map;
+ struct idr pv_id_table;
+};
+
struct mlx4_ib_iboe {
spinlock_t lock;
struct net_device *netdevs[MLX4_MAX_PORTS];
@@ -203,6 +420,42 @@ struct mlx4_ib_iboe {
union ib_gid gid_table[MLX4_MAX_PORTS][128];
};
+struct pkey_mgt {
+ u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+ u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+ struct list_head pkey_port_list[MLX4_MFUNC_MAX];
+ struct kobject *device_parent[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_ib_iov_sysfs_attr {
+ void *ctx;
+ struct kobject *kobj;
+ unsigned long data;
+ u32 entry_num;
+ char name[15];
+ struct device_attribute dentry;
+ struct device *dev;
+};
+
+struct mlx4_ib_iov_sysfs_attr_ar {
+ struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1];
+};
+
+struct mlx4_ib_iov_port {
+ char name[100];
+ u8 num;
+ struct mlx4_ib_dev *dev;
+ struct list_head list;
+ struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar;
+ struct ib_port_attr attr;
+ struct kobject *cur_port;
+ struct kobject *admin_alias_parent;
+ struct kobject *gids_parent;
+ struct kobject *pkeys_parent;
+ struct kobject *mcgs_parent;
+ struct mlx4_ib_iov_sysfs_attr mcg_dentry;
+};
+
struct mlx4_ib_dev {
struct ib_device ib_dev;
struct mlx4_dev *dev;
@@ -216,6 +469,7 @@ struct mlx4_ib_dev {
struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2];
struct ib_ah *sm_ah[MLX4_MAX_PORTS];
spinlock_t sm_lock;
+ struct mlx4_ib_sriov sriov;
struct mutex cap_mask_mutex;
bool ib_active;
@@ -223,6 +477,11 @@ struct mlx4_ib_dev {
int counters[MLX4_MAX_PORTS];
int *eq_table;
int eq_added;
+ struct kobject *iov_parent;
+ struct kobject *ports_parent;
+ struct kobject *dev_ports_parent[MLX4_MFUNC_MAX];
+ struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS];
+ struct pkey_mgt pkeys;
};
struct ib_event_work {
@@ -231,6 +490,13 @@ struct ib_event_work {
struct mlx4_eqe ib_eqe;
};
+struct mlx4_ib_qp_tunnel_init_attr {
+ struct ib_qp_init_attr init_attr;
+ int slave;
+ enum ib_qp_type proxy_qp_type;
+ u8 port;
+};
+
static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
{
return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
@@ -300,6 +566,9 @@ static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
return container_of(ibah, struct mlx4_ib_ah, ibah);
}
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
+
int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
struct mlx4_db *db);
void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
@@ -356,7 +625,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
-int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
void *in_mad, void *response_mad);
int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -371,6 +640,13 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
u64 iova);
int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props, int netw_view);
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey, int netw_view);
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid, int netw_view);
int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
u8 *mac, int *is_mcast, u8 port);
@@ -385,10 +661,69 @@ static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
return !!(ah->av.ib.g_slid & 0x80);
}
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
+int mlx4_ib_mcg_init(void);
+void mlx4_ib_mcg_destroy(void);
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid);
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *sa_mad);
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *mad);
+
int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
union ib_gid *gid);
void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
enum ib_event_type type);
+void mlx4_ib_tunnels_update_work(struct work_struct *work);
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad);
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+ u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad);
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+ struct ib_mad *mad);
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+ struct ib_mad *mad);
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id);
+
+/* alias guid support */
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port);
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port);
+
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num,
+ u8 port_num, u8 *p_data);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num, u8 port_num,
+ u8 *p_data);
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr);
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr);
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index);
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ;
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
+
+__be64 mlx4_ib_gen_node_guid(void);
+
+
#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 56e66a4c335c..19e0637220b9 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -38,6 +38,7 @@
#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_addr.h>
+#include <rdma/ib_mad.h>
#include <linux/mlx4/qp.h>
@@ -110,16 +111,62 @@ static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
return container_of(mqp, struct mlx4_ib_sqp, qp);
}
+static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+
+ return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
+ qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
+ 8 * MLX4_MFUNC_MAX;
+}
+
static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
- return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
- qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+ int proxy_sqp = 0;
+ int real_sqp = 0;
+ int i;
+ /* PPF or Native -- real SQP */
+ real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+ qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+ qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
+ if (real_sqp)
+ return 1;
+ /* VF or PF -- proxy SQP */
+ if (mlx4_is_mfunc(dev->dev)) {
+ for (i = 0; i < dev->dev->caps.num_ports; i++) {
+ if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] ||
+ qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) {
+ proxy_sqp = 1;
+ break;
+ }
+ }
+ }
+ return proxy_sqp;
}
+/* used for INIT/CLOSE port logic */
static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
- return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
- qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+ int proxy_qp0 = 0;
+ int real_qp0 = 0;
+ int i;
+ /* PPF or Native -- real QP0 */
+ real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+ qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+ qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
+ if (real_qp0)
+ return 1;
+ /* VF or PF -- proxy QP0 */
+ if (mlx4_is_mfunc(dev->dev)) {
+ for (i = 0; i < dev->dev->caps.num_ports; i++) {
+ if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) {
+ proxy_qp0 = 1;
+ break;
+ }
+ }
+ }
+ return proxy_qp0;
}
static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
@@ -270,7 +317,7 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
}
}
-static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
+static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
{
/*
* UD WQEs must have a datagram segment.
@@ -279,19 +326,29 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
* header and space for the ICRC).
*/
switch (type) {
- case IB_QPT_UD:
+ case MLX4_IB_QPT_UD:
return sizeof (struct mlx4_wqe_ctrl_seg) +
sizeof (struct mlx4_wqe_datagram_seg) +
((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
- case IB_QPT_UC:
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) + 64;
+ case MLX4_IB_QPT_TUN_SMI_OWNER:
+ case MLX4_IB_QPT_TUN_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg);
+
+ case MLX4_IB_QPT_UC:
return sizeof (struct mlx4_wqe_ctrl_seg) +
sizeof (struct mlx4_wqe_raddr_seg);
- case IB_QPT_RC:
+ case MLX4_IB_QPT_RC:
return sizeof (struct mlx4_wqe_ctrl_seg) +
sizeof (struct mlx4_wqe_atomic_seg) +
sizeof (struct mlx4_wqe_raddr_seg);
- case IB_QPT_SMI:
- case IB_QPT_GSI:
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
return sizeof (struct mlx4_wqe_ctrl_seg) +
ALIGN(MLX4_IB_UD_HEADER_SIZE +
DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
@@ -345,7 +402,7 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
}
static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
- enum ib_qp_type type, struct mlx4_ib_qp *qp)
+ enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
{
int s;
@@ -360,7 +417,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
* For MLX transport we need 2 extra S/G entries:
* one for the header and one for the checksum at the end
*/
- if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
+ if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+ type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
return -EINVAL;
@@ -404,7 +462,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
*/
if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
qp->sq_signal_bits && BITS_PER_LONG == 64 &&
- type != IB_QPT_SMI && type != IB_QPT_GSI)
+ type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
+ !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
+ MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
qp->sq.wqe_shift = ilog2(64);
else
qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
@@ -476,6 +536,54 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev,
return 0;
}
+static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+ int i;
+
+ qp->sqp_proxy_rcv =
+ kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt,
+ GFP_KERNEL);
+ if (!qp->sqp_proxy_rcv)
+ return -ENOMEM;
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ qp->sqp_proxy_rcv[i].addr =
+ kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ GFP_KERNEL);
+ if (!qp->sqp_proxy_rcv[i].addr)
+ goto err;
+ qp->sqp_proxy_rcv[i].map =
+ ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ }
+ return 0;
+
+err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ }
+ kfree(qp->sqp_proxy_rcv);
+ qp->sqp_proxy_rcv = NULL;
+ return -ENOMEM;
+}
+
+static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+ int i;
+
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ }
+ kfree(qp->sqp_proxy_rcv);
+}
+
static int qp_has_rq(struct ib_qp_init_attr *attr)
{
if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
@@ -486,10 +594,67 @@ static int qp_has_rq(struct ib_qp_init_attr *attr)
static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
- struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+ struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp)
{
int qpn;
int err;
+ struct mlx4_ib_sqp *sqp;
+ struct mlx4_ib_qp *qp;
+ enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
+
+ /* When tunneling special qps, we use a plain UD qp */
+ if (sqpn) {
+ if (mlx4_is_mfunc(dev->dev) &&
+ (!mlx4_is_master(dev->dev) ||
+ !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
+ if (init_attr->qp_type == IB_QPT_GSI)
+ qp_type = MLX4_IB_QPT_PROXY_GSI;
+ else if (mlx4_is_master(dev->dev))
+ qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_PROXY_SMI;
+ }
+ qpn = sqpn;
+ /* add extra sg entry for tunneling */
+ init_attr->cap.max_recv_sge++;
+ } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
+ struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
+ container_of(init_attr,
+ struct mlx4_ib_qp_tunnel_init_attr, init_attr);
+ if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
+ tnl_init->proxy_qp_type != IB_QPT_GSI) ||
+ !mlx4_is_master(dev->dev))
+ return -EINVAL;
+ if (tnl_init->proxy_qp_type == IB_QPT_GSI)
+ qp_type = MLX4_IB_QPT_TUN_GSI;
+ else if (tnl_init->slave == mlx4_master_func_num(dev->dev))
+ qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_TUN_SMI;
+ /* we are definitely in the PPF here, since we are creating
+ * tunnel QPs. base_tunnel_sqpn is therefore valid. */
+ qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
+ + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
+ sqpn = qpn;
+ }
+
+ if (!*caller_qp) {
+ if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
+ (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+ sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL);
+ if (!sqp)
+ return -ENOMEM;
+ qp = &sqp->qp;
+ } else {
+ qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL);
+ if (!qp)
+ return -ENOMEM;
+ }
+ } else
+ qp = *caller_qp;
+
+ qp->mlx4_ib_qp_type = qp_type;
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
@@ -550,7 +715,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
qp->flags |= MLX4_IB_QP_LSO;
- err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+ err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
if (err)
goto err;
@@ -586,7 +751,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
}
if (sqpn) {
- qpn = sqpn;
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ if (alloc_proxy_bufs(pd->device, qp)) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
} else {
/* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
* BlueFlame setup flow wrongly causes VLAN insertion. */
@@ -595,7 +766,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
else
err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
if (err)
- goto err_wrid;
+ goto err_proxy;
}
err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
@@ -613,13 +784,16 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
qp->mqp.event = mlx4_ib_qp_event;
-
+ if (!*caller_qp)
+ *caller_qp = qp;
return 0;
err_qpn:
if (!sqpn)
mlx4_qp_release_range(dev->dev, qpn, 1);
-
+err_proxy:
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+ free_proxy_bufs(pd->device, qp);
err_wrid:
if (pd->uobject) {
if (qp_has_rq(init_attr))
@@ -643,6 +817,8 @@ err_db:
mlx4_db_free(dev->dev, &qp->db);
err:
+ if (!*caller_qp)
+ kfree(qp);
return err;
}
@@ -755,7 +931,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
mlx4_qp_free(dev->dev, &qp->mqp);
- if (!is_sqp(dev, qp))
+ if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp))
mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
mlx4_mtt_cleanup(dev->dev, &qp->mtt);
@@ -768,6 +944,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
} else {
kfree(qp->sq.wrid);
kfree(qp->rq.wrid);
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+ free_proxy_bufs(&dev->ib_dev, qp);
mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
if (qp->rq.wqe_cnt)
mlx4_db_free(dev->dev, &qp->db);
@@ -776,25 +955,46 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
del_gid_entries(qp);
}
+static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+{
+ /* Native or PPF */
+ if (!mlx4_is_mfunc(dev->dev) ||
+ (mlx4_is_master(dev->dev) &&
+ attr->create_flags & MLX4_IB_SRIOV_SQP)) {
+ return dev->dev->phys_caps.base_sqpn +
+ (attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+ attr->port_num - 1;
+ }
+ /* PF or VF -- creating proxies */
+ if (attr->qp_type == IB_QPT_SMI)
+ return dev->dev->caps.qp0_proxy[attr->port_num - 1];
+ else
+ return dev->dev->caps.qp1_proxy[attr->port_num - 1];
+}
+
struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata)
{
- struct mlx4_ib_sqp *sqp;
- struct mlx4_ib_qp *qp;
+ struct mlx4_ib_qp *qp = NULL;
int err;
u16 xrcdn = 0;
/*
- * We only support LSO and multicast loopback blocking, and
- * only for kernel UD QPs.
+ * We only support LSO, vendor flag1, and multicast loopback blocking,
+ * and only for kernel UD QPs.
*/
- if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
- IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
+ MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP))
return ERR_PTR(-EINVAL);
if (init_attr->create_flags &&
- (udata || init_attr->qp_type != IB_QPT_UD))
+ (udata ||
+ ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) &&
+ init_attr->qp_type != IB_QPT_UD) ||
+ ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
+ init_attr->qp_type > IB_QPT_GSI)))
return ERR_PTR(-EINVAL);
switch (init_attr->qp_type) {
@@ -810,18 +1010,17 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
/* fall through */
case IB_QPT_RC:
case IB_QPT_UC:
- case IB_QPT_UD:
case IB_QPT_RAW_PACKET:
- {
qp = kzalloc(sizeof *qp, GFP_KERNEL);
if (!qp)
return ERR_PTR(-ENOMEM);
-
- err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, 0, qp);
- if (err) {
- kfree(qp);
+ /* fall through */
+ case IB_QPT_UD:
+ {
+ err = create_qp_common(to_mdev(pd->device), pd, init_attr,
+ udata, 0, &qp);
+ if (err)
return ERR_PTR(err);
- }
qp->ibqp.qp_num = qp->mqp.qpn;
qp->xrcdn = xrcdn;
@@ -835,21 +1034,11 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
if (udata)
return ERR_PTR(-EINVAL);
- sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
- if (!sqp)
- return ERR_PTR(-ENOMEM);
-
- qp = &sqp->qp;
-
err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
- to_mdev(pd->device)->dev->caps.sqp_start +
- (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) +
- init_attr->port_num - 1,
- qp);
- if (err) {
- kfree(sqp);
+ get_sqp_num(to_mdev(pd->device), init_attr),
+ &qp);
+ if (err)
return ERR_PTR(err);
- }
qp->port = init_attr->port_num;
qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
@@ -884,18 +1073,27 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
return 0;
}
-static int to_mlx4_st(enum ib_qp_type type)
+static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
{
switch (type) {
- case IB_QPT_RC: return MLX4_QP_ST_RC;
- case IB_QPT_UC: return MLX4_QP_ST_UC;
- case IB_QPT_UD: return MLX4_QP_ST_UD;
- case IB_QPT_XRC_INI:
- case IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC;
- case IB_QPT_SMI:
- case IB_QPT_GSI:
- case IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX;
- default: return -1;
+ case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC;
+ case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC;
+ case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD;
+ case MLX4_IB_QPT_XRC_INI:
+ case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC;
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX;
+
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ?
+ MLX4_QP_ST_MLX : -1);
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_TUN_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ?
+ MLX4_QP_ST_UD : -1);
+ default: return -1;
}
}
@@ -1043,7 +1241,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
return -ENOMEM;
context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
- (to_mlx4_st(ibqp->qp_type) << 16));
+ (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
if (!(attr_mask & IB_QP_PATH_MIG_STATE))
context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
@@ -1121,13 +1319,16 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
}
if (attr_mask & IB_QP_PKEY_INDEX) {
+ if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+ context->pri_path.disable_pkey_check = 0x40;
context->pri_path.pkey_index = attr->pkey_index;
optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
}
if (attr_mask & IB_QP_AV) {
if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
- attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
+ attr_mask & IB_QP_PORT ?
+ attr->port_num : qp->port))
goto out;
optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
@@ -1210,8 +1411,24 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
if (attr_mask & IB_QP_RQ_PSN)
context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+ /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
if (attr_mask & IB_QP_QKEY) {
- context->qkey = cpu_to_be32(attr->qkey);
+ if (qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
+ context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+ else {
+ if (mlx4_is_mfunc(dev->dev) &&
+ !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
+ (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
+ MLX4_RESERVED_QKEY_BASE) {
+ pr_err("Cannot use reserved QKEY"
+ " 0x%x (range 0xffff0000..0xffffffff"
+ " is reserved)\n", attr->qkey);
+ err = -EINVAL;
+ goto out;
+ }
+ context->qkey = cpu_to_be32(attr->qkey);
+ }
optpar |= MLX4_QP_OPTPAR_Q_KEY;
}
@@ -1227,10 +1444,17 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
ibqp->qp_type == IB_QPT_UD ||
ibqp->qp_type == IB_QPT_RAW_PACKET)) {
context->pri_path.sched_queue = (qp->port - 1) << 6;
- if (is_qp0(dev, qp))
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+ qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
- else
+ if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
+ context->pri_path.fl = 0x80;
+ } else {
+ if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+ context->pri_path.fl = 0x80;
context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+ }
}
if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
@@ -1346,7 +1570,7 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
}
if ((attr_mask & IB_QP_PORT) &&
- (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+ (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
pr_debug("qpn 0x%x: invalid port number (%d) specified "
"for transition %d to %d. qp_type %d\n",
ibqp->qp_num, attr->port_num, cur_state,
@@ -1400,6 +1624,114 @@ out:
return err;
}
+static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
+ struct ib_send_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
+ struct ib_device *ib_dev = &mdev->ib_dev;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ u16 pkey;
+ u32 qkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+
+ if (wr->opcode != IB_WR_SEND)
+ return -EINVAL;
+
+ send_size = 0;
+
+ for (i = 0; i < wr->num_sge; ++i)
+ send_size += wr->sg_list[i].length;
+
+ /* for proxy-qp0 sends, need to add in size of tunnel header */
+ /* for tunnel-qp0 sends, tunnel header is already in s/g list */
+ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
+ send_size += sizeof (struct mlx4_ib_tunnel_header);
+
+ ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+
+ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid =
+ cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ sqp->ud_header.lrh.source_lid =
+ cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ /* force loopback */
+ mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+
+ sqp->ud_header.lrh.virtual_lane = 0;
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ else
+ sqp->ud_header.bth.destination_qpn =
+ cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);
+
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+ return -EINVAL;
+ sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
+
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+
static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
void *wqe, unsigned *mlx_seg_len)
{
@@ -1418,6 +1750,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
int is_vlan = 0;
int is_grh;
u16 vlan;
+ int err = 0;
send_size = 0;
for (i = 0; i < wr->num_sge; ++i)
@@ -1426,8 +1759,24 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
is_grh = mlx4_ib_ah_grh_present(ah);
if (is_eth) {
- ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
- ah->av.ib.gid_index, &sgid);
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ sgid.global.subnet_prefix =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ subnet_prefix;
+ sgid.global.interface_id =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ guid_cache[ah->av.ib.gid_index];
+ } else {
+ err = ib_get_cached_gid(ib_dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid);
+ if (err)
+ return err;
+ }
+
vlan = rdma_get_vlan_id(&sgid);
is_vlan = vlan < 0x1000;
}
@@ -1446,8 +1795,21 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
sqp->ud_header.grh.flow_label =
ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
- ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
- ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ sqp->ud_header.grh.source_gid.global.subnet_prefix =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ subnet_prefix;
+ sqp->ud_header.grh.source_gid.global.interface_id =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ guid_cache[ah->av.ib.gid_index];
+ } else
+ ib_get_cached_gid(ib_dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index,
+ &sqp->ud_header.grh.source_gid);
memcpy(sqp->ud_header.grh.destination_gid.raw,
ah->av.ib.dgid, 16);
}
@@ -1459,6 +1821,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
(sqp->ud_header.lrh.destination_lid ==
IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
(sqp->ud_header.lrh.service_level << 8));
+ if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
+ mlx->flags |= cpu_to_be32(0x1); /* force loopback */
mlx->rlid = sqp->ud_header.lrh.destination_lid;
}
@@ -1667,6 +2031,63 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
}
+static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
+ struct mlx4_wqe_datagram_seg *dseg,
+ struct ib_send_wr *wr, enum ib_qp_type qpt)
+{
+ union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;
+ struct mlx4_av sqp_av = {0};
+ int port = *((u8 *) &av->ib.port_pd) & 0x3;
+
+ /* force loopback */
+ sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
+ sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
+ sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
+ cpu_to_be32(0xf0000000);
+
+ memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
+ /* This function used only for sending on QP1 proxies */
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
+ /* Use QKEY from the QP context, which is set by master */
+ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+}
+
+static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len)
+{
+ struct mlx4_wqe_inline_seg *inl = wqe;
+ struct mlx4_ib_tunnel_header hdr;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ int spc;
+ int i;
+
+ memcpy(&hdr.av, &ah->av, sizeof hdr.av);
+ hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
+ hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (sizeof (hdr) <= spc) {
+ memcpy(inl + 1, &hdr, sizeof (hdr));
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
+ i = 1;
+ } else {
+ memcpy(inl + 1, &hdr, spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
+}
+
static void set_mlx_icrc_seg(void *dseg)
{
u32 *t = dseg;
@@ -1748,6 +2169,13 @@ static __be32 send_ieth(struct ib_send_wr *wr)
}
}
+static void add_zero_len_inline(void *wqe)
+{
+ struct mlx4_wqe_inline_seg *inl = wqe;
+ memset(wqe, 0, 16);
+ inl->byte_count = cpu_to_be32(1 << 31);
+}
+
int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
struct ib_send_wr **bad_wr)
{
@@ -1806,9 +2234,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
wqe += sizeof *ctrl;
size = sizeof *ctrl / 16;
- switch (ibqp->qp_type) {
- case IB_QPT_RC:
- case IB_QPT_UC:
+ switch (qp->mlx4_ib_qp_type) {
+ case MLX4_IB_QPT_RC:
+ case MLX4_IB_QPT_UC:
switch (wr->opcode) {
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
@@ -1869,7 +2297,25 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
}
break;
- case IB_QPT_UD:
+ case MLX4_IB_QPT_TUN_SMI_OWNER:
+ err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+ case MLX4_IB_QPT_TUN_SMI:
+ case MLX4_IB_QPT_TUN_GSI:
+ /* this is a UD qp used in MAD responses to slaves. */
+ set_datagram_seg(wqe, wr);
+ /* set the forced-loopback bit in the data seg av */
+ *(__be32 *) wqe |= cpu_to_be32(0x80000000);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ break;
+ case MLX4_IB_QPT_UD:
set_datagram_seg(wqe, wr);
wqe += sizeof (struct mlx4_wqe_datagram_seg);
size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
@@ -1886,8 +2332,47 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
}
break;
- case IB_QPT_SMI:
- case IB_QPT_GSI:
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) {
+ err = -ENOSYS;
+ *bad_wr = wr;
+ goto out;
+ }
+ err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ /* to start tunnel header on a cache-line boundary */
+ add_zero_len_inline(wqe);
+ wqe += 16;
+ size++;
+ build_tunnel_header(wr, wqe, &seglen);
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+ case MLX4_IB_QPT_PROXY_SMI:
+ /* don't allow QP0 sends on guests */
+ err = -ENOSYS;
+ *bad_wr = wr;
+ goto out;
+ case MLX4_IB_QPT_PROXY_GSI:
+ /* If we are tunneling special qps, this is a UD qp.
+ * In this case we first add a UD segment targeting
+ * the tunnel qp, and then add a header with address
+ * information */
+ set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ build_tunnel_header(wr, wqe, &seglen);
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
if (unlikely(err)) {
*bad_wr = wr;
@@ -1913,8 +2398,10 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
/* Add one more inline data segment for ICRC for MLX sends */
- if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
- qp->ibqp.qp_type == IB_QPT_GSI)) {
+ if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
+ qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
set_mlx_icrc_seg(dseg + 1);
size += sizeof (struct mlx4_wqe_data_seg) / 16;
}
@@ -2006,8 +2493,10 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
int err = 0;
int nreq;
int ind;
+ int max_gs;
int i;
+ max_gs = qp->rq.max_gs;
spin_lock_irqsave(&qp->rq.lock, flags);
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
@@ -2027,10 +2516,25 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
scat = get_recv_wqe(qp, ind);
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ ib_dma_sync_single_for_device(ibqp->device,
+ qp->sqp_proxy_rcv[ind].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ scat->byte_count =
+ cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
+ /* use dma lkey from upper layer entry */
+ scat->lkey = cpu_to_be32(wr->sg_list->lkey);
+ scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
+ scat++;
+ max_gs--;
+ }
+
for (i = 0; i < wr->num_sge; ++i)
__set_data_seg(scat + i, wr->sg_list + i);
- if (i < qp->rq.max_gs) {
+ if (i < max_gs) {
scat[i].byte_count = 0;
scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
scat[i].addr = 0;
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
new file mode 100644
index 000000000000..5b2a01dfb907
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*#include "core_priv.h"*/
+#include "mlx4_ib.h"
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+
+#include <rdma/ib_mad.h>
+/*show_admin_alias_guid returns the administratively assigned value of that GUID.
+ * Values returned in buf parameter string:
+ * 0 - requests opensm to assign a value.
+ * ffffffffffffffff - delete this entry.
+ * other - value assigned by administrator.
+ */
+static ssize_t show_admin_alias_guid(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int record_num;/*0-15*/
+ int guid_index_in_rec; /*0 - 7*/
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+
+ record_num = mlx4_ib_iov_dentry->entry_num / 8 ;
+ guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ;
+
+ return sprintf(buf, "%llx\n",
+ be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid.
+ ports_guid[port->num - 1].
+ all_rec_per_port[record_num].
+ all_recs[8 * guid_index_in_rec]));
+}
+
+/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID.
+ * Values in buf parameter string:
+ * 0 - requests opensm to assign a value.
+ * 0xffffffffffffffff - delete this entry.
+ * other - guid value assigned by the administrator.
+ */
+static ssize_t store_admin_alias_guid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int record_num;/*0-15*/
+ int guid_index_in_rec; /*0 - 7*/
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ u64 sysadmin_ag_val;
+
+ record_num = mlx4_ib_iov_dentry->entry_num / 8;
+ guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
+ if (0 == record_num && 0 == guid_index_in_rec) {
+ pr_err("GUID 0 block 0 is RO\n");
+ return count;
+ }
+ sscanf(buf, "%llx", &sysadmin_ag_val);
+ *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * guid_index_in_rec] =
+ cpu_to_be64(sysadmin_ag_val);
+
+ /* Change the state to be pending for update */
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status
+ = MLX4_GUID_INFO_STATUS_IDLE ;
+
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
+ = MLX4_GUID_INFO_RECORD_SET;
+
+ switch (sysadmin_ag_val) {
+ case MLX4_GUID_FOR_DELETE_VAL:
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
+ = MLX4_GUID_INFO_RECORD_DELETE;
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+ = MLX4_GUID_SYSADMIN_ASSIGN;
+ break;
+ /* The sysadmin requests the SM to re-assign */
+ case MLX4_NOT_SET_GUID:
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+ = MLX4_GUID_DRIVER_ASSIGN;
+ break;
+ /* The sysadmin requests a specific value.*/
+ default:
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+ = MLX4_GUID_SYSADMIN_ASSIGN;
+ break;
+ }
+
+ /* set the record index */
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes
+ = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+
+ mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
+
+ return count;
+}
+
+static ssize_t show_port_gid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ union ib_gid gid;
+ ssize_t ret;
+
+ ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num,
+ mlx4_ib_iov_dentry->entry_num, &gid, 1);
+ if (ret)
+ return ret;
+ ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) gid.raw)[0]),
+ be16_to_cpu(((__be16 *) gid.raw)[1]),
+ be16_to_cpu(((__be16 *) gid.raw)[2]),
+ be16_to_cpu(((__be16 *) gid.raw)[3]),
+ be16_to_cpu(((__be16 *) gid.raw)[4]),
+ be16_to_cpu(((__be16 *) gid.raw)[5]),
+ be16_to_cpu(((__be16 *) gid.raw)[6]),
+ be16_to_cpu(((__be16 *) gid.raw)[7]));
+ return ret;
+}
+
+static ssize_t show_phys_port_pkey(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ u16 pkey;
+ ssize_t ret;
+
+ ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num,
+ mlx4_ib_iov_dentry->entry_num, &pkey, 1);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define DENTRY_REMOVE(_dentry) \
+do { \
+ sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \
+} while (0);
+
+static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry,
+ char *_name, struct kobject *_kobj,
+ ssize_t (*show)(struct device *dev,
+ struct device_attribute *attr,
+ char *buf),
+ ssize_t (*store)(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+ )
+{
+ int ret = 0;
+ struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry;
+
+ vdentry->ctx = _ctx;
+ vdentry->dentry.show = show;
+ vdentry->dentry.store = store;
+ sysfs_attr_init(&vdentry->dentry.attr);
+ vdentry->dentry.attr.name = vdentry->name;
+ vdentry->dentry.attr.mode = 0;
+ vdentry->kobj = _kobj;
+ snprintf(vdentry->name, 15, "%s", _name);
+
+ if (vdentry->dentry.store)
+ vdentry->dentry.attr.mode |= S_IWUSR;
+
+ if (vdentry->dentry.show)
+ vdentry->dentry.attr.mode |= S_IRUGO;
+
+ ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr);
+ if (ret) {
+ pr_err("failed to create %s\n", vdentry->dentry.attr.name);
+ vdentry->ctx = NULL;
+ return ret;
+ }
+
+ return ret;
+}
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr)
+{
+ struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+ int ret;
+
+ ret = sysfs_create_file(port->mcgs_parent, attr);
+ if (ret)
+ pr_err("failed to create %s\n", attr->name);
+
+ return ret;
+}
+
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr)
+{
+ struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+
+ sysfs_remove_file(port->mcgs_parent, attr);
+}
+
+static int add_port_entries(struct mlx4_ib_dev *device, int port_num)
+{
+ int i;
+ char buff[10];
+ struct mlx4_ib_iov_port *port = NULL;
+ int ret = 0 ;
+ struct ib_port_attr attr;
+
+ /* get the physical gid and pkey table sizes.*/
+ ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1);
+ if (ret)
+ goto err;
+
+ port = &device->iov_ports[port_num - 1];
+ port->dev = device;
+ port->num = port_num;
+ /* Directory structure:
+ * iov -
+ * port num -
+ * admin_guids
+ * gids (operational)
+ * mcg_table
+ */
+ port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar),
+ GFP_KERNEL);
+ if (!port->dentr_ar) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ sprintf(buff, "%d", port_num);
+ port->cur_port = kobject_create_and_add(buff,
+ kobject_get(device->ports_parent));
+ if (!port->cur_port) {
+ ret = -ENOMEM;
+ goto kobj_create_err;
+ }
+ /* admin GUIDs */
+ port->admin_alias_parent = kobject_create_and_add("admin_guids",
+ kobject_get(port->cur_port));
+ if (!port->admin_alias_parent) {
+ ret = -ENOMEM;
+ goto err_admin_guids;
+ }
+ for (i = 0 ; i < attr.gid_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[i].entry_num = i;
+ ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i],
+ buff, port->admin_alias_parent,
+ show_admin_alias_guid, store_admin_alias_guid);
+ if (ret)
+ goto err_admin_alias_parent;
+ }
+
+ /* gids subdirectory (operational gids) */
+ port->gids_parent = kobject_create_and_add("gids",
+ kobject_get(port->cur_port));
+ if (!port->gids_parent) {
+ ret = -ENOMEM;
+ goto err_gids;
+ }
+
+ for (i = 0 ; i < attr.gid_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i;
+ ret = create_sysfs_entry(port,
+ &port->dentr_ar->dentries[attr.gid_tbl_len + i],
+ buff,
+ port->gids_parent, show_port_gid, NULL);
+ if (ret)
+ goto err_gids_parent;
+ }
+
+ /* physical port pkey table */
+ port->pkeys_parent =
+ kobject_create_and_add("pkeys", kobject_get(port->cur_port));
+ if (!port->pkeys_parent) {
+ ret = -ENOMEM;
+ goto err_pkeys;
+ }
+
+ for (i = 0 ; i < attr.pkey_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i;
+ ret = create_sysfs_entry(port,
+ &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i],
+ buff, port->pkeys_parent,
+ show_phys_port_pkey, NULL);
+ if (ret)
+ goto err_pkeys_parent;
+ }
+
+ /* MCGs table */
+ port->mcgs_parent =
+ kobject_create_and_add("mcgs", kobject_get(port->cur_port));
+ if (!port->mcgs_parent) {
+ ret = -ENOMEM;
+ goto err_mcgs;
+ }
+ return 0;
+
+err_mcgs:
+ kobject_put(port->cur_port);
+
+err_pkeys_parent:
+ kobject_put(port->pkeys_parent);
+
+err_pkeys:
+ kobject_put(port->cur_port);
+
+err_gids_parent:
+ kobject_put(port->gids_parent);
+
+err_gids:
+ kobject_put(port->cur_port);
+
+err_admin_alias_parent:
+ kobject_put(port->admin_alias_parent);
+
+err_admin_guids:
+ kobject_put(port->cur_port);
+ kobject_put(port->cur_port); /* once more for create_and_add buff */
+
+kobj_create_err:
+ kobject_put(device->ports_parent);
+ kfree(port->dentr_ar);
+
+err:
+ pr_err("add_port_entries FAILED: for port:%d, error: %d\n",
+ port_num, ret);
+ return ret;
+}
+
+static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
+{
+ char base_name[9];
+
+ /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
+ strlcpy(name, pci_name(dev->dev->pdev), max);
+ strncpy(base_name, name, 8); /*till xxxx:yy:*/
+ base_name[8] = '\0';
+ /* with no ARI only 3 last bits are used so when the fn is higher than 8
+ * need to add it to the dev num, so count in the last number will be
+ * modulo 8 */
+ sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8));
+}
+
+struct mlx4_port {
+ struct kobject kobj;
+ struct mlx4_ib_dev *dev;
+ struct attribute_group pkey_group;
+ struct attribute_group gid_group;
+ u8 port_num;
+ int slave;
+};
+
+
+static void mlx4_port_release(struct kobject *kobj)
+{
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+ struct attribute *a;
+ int i;
+
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+ kfree(p->pkey_group.attrs);
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+ kfree(p->gid_group.attrs);
+ kfree(p);
+}
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+ return port_attr->show(p, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t size)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+ if (!port_attr->store)
+ return -EIO;
+ return port_attr->store(p, port_attr, buf, size);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show,
+ .store = port_attr_store,
+};
+
+static struct kobj_type port_type = {
+ .release = mlx4_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+};
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+};
+
+static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ ssize_t ret = -ENODEV;
+
+ if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >=
+ (p->dev->dev->caps.pkey_table_len[p->port_num]))
+ ret = sprintf(buf, "none\n");
+ else
+ ret = sprintf(buf, "%d\n",
+ p->dev->pkeys.virt2phys_pkey[p->slave]
+ [p->port_num - 1][tab_attr->index]);
+ return ret;
+}
+
+static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int idx;
+ int err;
+
+ /* do not allow remapping Dom0 virtual pkey table */
+ if (p->slave == mlx4_master_func_num(p->dev->dev))
+ return -EINVAL;
+
+ if (!strncasecmp(buf, "no", 2))
+ idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1;
+ else if (sscanf(buf, "%i", &idx) != 1 ||
+ idx >= p->dev->dev->caps.pkey_table_len[p->port_num] ||
+ idx < 0)
+ return -EINVAL;
+
+ p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1]
+ [tab_attr->index] = idx;
+ mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num,
+ tab_attr->index, idx);
+ err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num);
+ if (err) {
+ pr_err("mlx4_gen_pkey_eqe failed for slave %d,"
+ " port %d, index %d\n", p->slave, p->port_num, idx);
+ return err;
+ }
+ return count;
+}
+
+static ssize_t show_port_gid_idx(struct mlx4_port *p,
+ struct port_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", p->slave);
+}
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct mlx4_port *,
+ struct port_attribute *, char *buf),
+ ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+ const char *buf, size_t count),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof (struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+ if (snprintf(element->name, sizeof (element->name),
+ "%d", i) >= sizeof (element->name)) {
+ kfree(element);
+ goto err;
+ }
+ sysfs_attr_init(&element->attr.attr);
+ element->attr.attr.name = element->name;
+ if (store) {
+ element->attr.attr.mode = S_IWUSR | S_IRUGO;
+ element->attr.store = store;
+ } else
+ element->attr.attr.mode = S_IRUGO;
+
+ element->attr.show = show;
+ element->index = i;
+ tab_attr[i] = &element->attr.attr;
+ }
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
+static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
+{
+ struct mlx4_port *p;
+ int i;
+ int ret;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->dev = dev;
+ p->port_num = port_num;
+ p->slave = slave;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ kobject_get(dev->dev_ports_parent[slave]),
+ "%d", port_num);
+ if (ret)
+ goto err_alloc;
+
+ p->pkey_group.name = "pkey_idx";
+ p->pkey_group.attrs =
+ alloc_group_attrs(show_port_pkey, store_port_pkey,
+ dev->dev->caps.pkey_table_len[port_num]);
+ if (!p->pkey_group.attrs)
+ goto err_alloc;
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ p->gid_group.name = "gid_idx";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1);
+ if (!p->gid_group.attrs)
+ goto err_free_pkey;
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);
+ return 0;
+
+err_free_gid:
+ kfree(p->gid_group.attrs[0]);
+ kfree(p->gid_group.attrs);
+
+err_free_pkey:
+ for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i)
+ kfree(p->pkey_group.attrs[i]);
+ kfree(p->pkey_group.attrs);
+
+err_alloc:
+ kobject_put(dev->dev_ports_parent[slave]);
+ kfree(p);
+ return ret;
+}
+
+static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)
+{
+ char name[32];
+ int err;
+ int port;
+ struct kobject *p, *t;
+ struct mlx4_port *mport;
+
+ get_name(dev, name, slave, sizeof name);
+
+ dev->pkeys.device_parent[slave] =
+ kobject_create_and_add(name, kobject_get(dev->iov_parent));
+
+ if (!dev->pkeys.device_parent[slave]) {
+ err = -ENOMEM;
+ goto fail_dev;
+ }
+
+ INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]);
+
+ dev->dev_ports_parent[slave] =
+ kobject_create_and_add("ports",
+ kobject_get(dev->pkeys.device_parent[slave]));
+
+ if (!dev->dev_ports_parent[slave]) {
+ err = -ENOMEM;
+ goto err_ports;
+ }
+
+ for (port = 1; port <= dev->dev->caps.num_ports; ++port) {
+ err = add_port(dev, port, slave);
+ if (err)
+ goto err_add;
+ }
+ return 0;
+
+err_add:
+ list_for_each_entry_safe(p, t,
+ &dev->pkeys.pkey_port_list[slave],
+ entry) {
+ list_del(&p->entry);
+ mport = container_of(p, struct mlx4_port, kobj);
+ sysfs_remove_group(p, &mport->pkey_group);
+ sysfs_remove_group(p, &mport->gid_group);
+ kobject_put(p);
+ }
+ kobject_put(dev->dev_ports_parent[slave]);
+
+err_ports:
+ kobject_put(dev->pkeys.device_parent[slave]);
+ /* extra put for the device_parent create_and_add */
+ kobject_put(dev->pkeys.device_parent[slave]);
+
+fail_dev:
+ kobject_put(dev->iov_parent);
+ return err;
+}
+
+static int register_pkey_tree(struct mlx4_ib_dev *device)
+{
+ int i;
+
+ if (!mlx4_is_master(device->dev))
+ return 0;
+
+ for (i = 0; i <= device->dev->num_vfs; ++i)
+ register_one_pkey_tree(device, i);
+
+ return 0;
+}
+
+static void unregister_pkey_tree(struct mlx4_ib_dev *device)
+{
+ int slave;
+ struct kobject *p, *t;
+ struct mlx4_port *port;
+
+ if (!mlx4_is_master(device->dev))
+ return;
+
+ for (slave = device->dev->num_vfs; slave >= 0; --slave) {
+ list_for_each_entry_safe(p, t,
+ &device->pkeys.pkey_port_list[slave],
+ entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct mlx4_port, kobj);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ kobject_put(p);
+ kobject_put(device->dev_ports_parent[slave]);
+ }
+ kobject_put(device->dev_ports_parent[slave]);
+ kobject_put(device->pkeys.device_parent[slave]);
+ kobject_put(device->pkeys.device_parent[slave]);
+ kobject_put(device->iov_parent);
+ }
+}
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev)
+{
+ int i;
+ int ret = 0;
+
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+
+ dev->iov_parent =
+ kobject_create_and_add("iov",
+ kobject_get(dev->ib_dev.ports_parent->parent));
+ if (!dev->iov_parent) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ dev->ports_parent =
+ kobject_create_and_add("ports",
+ kobject_get(dev->iov_parent));
+ if (!dev->iov_parent) {
+ ret = -ENOMEM;
+ goto err_ports;
+ }
+
+ for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) {
+ ret = add_port_entries(dev, i);
+ if (ret)
+ goto err_add_entries;
+ }
+
+ ret = register_pkey_tree(dev);
+ if (ret)
+ goto err_add_entries;
+ return 0;
+
+err_add_entries:
+ kobject_put(dev->ports_parent);
+
+err_ports:
+ kobject_put(dev->iov_parent);
+err:
+ kobject_put(dev->ib_dev.ports_parent->parent);
+ pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret);
+ return ret;
+}
+
+static void unregister_alias_guid_tree(struct mlx4_ib_dev *device)
+{
+ struct mlx4_ib_iov_port *p;
+ int i;
+
+ if (!mlx4_is_master(device->dev))
+ return;
+
+ for (i = 0; i < device->dev->caps.num_ports; i++) {
+ p = &device->iov_ports[i];
+ kobject_put(p->admin_alias_parent);
+ kobject_put(p->gids_parent);
+ kobject_put(p->pkeys_parent);
+ kobject_put(p->mcgs_parent);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->dev->ports_parent);
+ kfree(p->dentr_ar);
+ }
+}
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device)
+{
+ unregister_alias_guid_tree(device);
+ unregister_pkey_tree(device);
+ kobject_put(device->ports_parent);
+ kobject_put(device->iov_parent);
+ kobject_put(device->iov_parent);
+ kobject_put(device->ib_dev.ports_parent->parent);
+}