From 284555fc7f5d2d4aab34dbf2b5766c6f8da84873 Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Mon, 14 Dec 2015 15:07:02 -0600 Subject: cxlflash: Fix to resolve cmd leak after host reset [ Upstream commit ee91e332a6e6e9b939f60f6e1bd72fb2def5290d ] After a few iterations of resetting the card, either during EEH recovery, or a host_reset the following is seen in the logs. cxlflash 0008:00: cxlflash_queuecommand: could not get a free command At every reset of the card, the commands that are outstanding are being leaked. No effort is being made to reap these commands. A few more resets later, the above error message floods the logs and the card is rendered totally unusable as no free commands are available. Iterated through the 'cmd' queue and printed out the 'free' counter and found that on each reset certain commands were in-use and stayed in-use through subsequent resets. To resolve this issue, when the card is reset, reap all the commands that are active/outstanding. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Andrew Donnellan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'drivers/scsi/cxlflash/main.c') diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 1e5bf0ca81da..3fb0e1ca6809 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -632,15 +632,30 @@ static void free_mem(struct cxlflash_cfg *cfg) * @cfg: Internal structure associated with the host. * * Safe to call with AFU in a partially allocated/initialized state. + * + * Cleans up all state associated with the command queue, and unmaps + * the MMIO space. + * + * - complete() will take care of commands we initiated (they'll be checked + * in as part of the cleanup that occurs after the completion) + * + * - cmd_checkin() will take care of entries that we did not initiate and that + * have not (and will not) complete because they are sitting on a [now stale] + * hardware queue */ static void stop_afu(struct cxlflash_cfg *cfg) { int i; struct afu *afu = cfg->afu; + struct afu_cmd *cmd; if (likely(afu)) { - for (i = 0; i < CXLFLASH_NUM_CMDS; i++) - complete(&afu->cmd[i].cevent); + for (i = 0; i < CXLFLASH_NUM_CMDS; i++) { + cmd = &afu->cmd[i]; + complete(&cmd->cevent); + if (!atomic_read(&cmd->free)) + cmd_checkin(cmd); + } if (likely(afu->afu_map)) { cxl_psa_unmap((void __iomem *)afu->afu_map); -- cgit v1.2.3 From 20ab6948f1c09ad84ec8f5b8fdf02ea86e33fa63 Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Mon, 14 Dec 2015 15:07:23 -0600 Subject: cxlflash: Resolve oops in wait_port_offline [ Upstream commit b45cdbaf9f7f0486847c52f60747fb108724652a ] If an async error interrupt is generated, and the error requires the FC link to be reset, it cannot be performed in the interrupt context. So a work element is scheduled to complete the link reset in a process context. If either an EEH event or an escalation occurs in between when the interrupt is generated and the scheduled work is started, the MMIO space may no longer be available. This will cause an oops in the worker thread. [ 606.806583] NIP kthread_data+0x28/0x40 [ 606.806633] LR wq_worker_sleeping+0x30/0x100 [ 606.806694] Call Trace: [ 606.806721] 0x50 (unreliable) [ 606.806796] wq_worker_sleeping+0x30/0x100 [ 606.806884] __schedule+0x69c/0x8a0 [ 606.806959] schedule+0x44/0xc0 [ 606.807034] do_exit+0x770/0xb90 [ 606.807109] die+0x300/0x460 [ 606.807185] bad_page_fault+0xd8/0x150 [ 606.807259] handle_page_fault+0x2c/0x30 [ 606.807338] wait_port_offline.constprop.12+0x60/0x130 [cxlflash] To prevent the problem space area from being unmapped, when there is pending work, a mapcount (using the kref mechanism) is held. The mapcount is released only when the work is completed. The last reference release is tied to the unmapping service. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'drivers/scsi/cxlflash/main.c') diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 3fb0e1ca6809..463d7e503d88 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -368,6 +368,7 @@ out: no_room: afu->read_room = true; + kref_get(&cfg->afu->mapcount); schedule_work(&cfg->work_q); rc = SCSI_MLQUEUE_HOST_BUSY; goto out; @@ -473,6 +474,16 @@ out: return rc; } +static void afu_unmap(struct kref *ref) +{ + struct afu *afu = container_of(ref, struct afu, mapcount); + + if (likely(afu->afu_map)) { + cxl_psa_unmap((void __iomem *)afu->afu_map); + afu->afu_map = NULL; + } +} + /** * cxlflash_driver_info() - information handler for this host driver * @host: SCSI host associated with device. @@ -503,6 +514,7 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp) ulong lock_flags; short lflag = 0; int rc = 0; + int kref_got = 0; dev_dbg_ratelimited(dev, "%s: (scp=%p) %d/%d/%d/%llu " "cdb=(%08X-%08X-%08X-%08X)\n", @@ -547,6 +559,9 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp) goto out; } + kref_get(&cfg->afu->mapcount); + kref_got = 1; + cmd->rcb.ctx_id = afu->ctx_hndl; cmd->rcb.port_sel = port_sel; cmd->rcb.lun_id = lun_to_lunid(scp->device->lun); @@ -587,6 +602,8 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp) } out: + if (kref_got) + kref_put(&afu->mapcount, afu_unmap); pr_devel("%s: returning rc=%d\n", __func__, rc); return rc; } @@ -661,6 +678,7 @@ static void stop_afu(struct cxlflash_cfg *cfg) cxl_psa_unmap((void __iomem *)afu->afu_map); afu->afu_map = NULL; } + kref_put(&afu->mapcount, afu_unmap); } } @@ -746,8 +764,8 @@ static void cxlflash_remove(struct pci_dev *pdev) scsi_remove_host(cfg->host); /* fall through */ case INIT_STATE_AFU: - term_afu(cfg); cancel_work_sync(&cfg->work_q); + term_afu(cfg); case INIT_STATE_PCI: pci_release_regions(cfg->dev); pci_disable_device(pdev); @@ -1331,6 +1349,7 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data) __func__, port); cfg->lr_state = LINK_RESET_REQUIRED; cfg->lr_port = port; + kref_get(&cfg->afu->mapcount); schedule_work(&cfg->work_q); } @@ -1351,6 +1370,7 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data) if (info->action & SCAN_HOST) { atomic_inc(&cfg->scan_host_needed); + kref_get(&cfg->afu->mapcount); schedule_work(&cfg->work_q); } } @@ -1746,6 +1766,7 @@ static int init_afu(struct cxlflash_cfg *cfg) rc = -ENOMEM; goto err1; } + kref_init(&afu->mapcount); /* No byte reverse on reading afu_version or string will be backwards */ reg = readq(&afu->afu_map->global.regs.afu_version); @@ -1780,8 +1801,7 @@ out: return rc; err2: - cxl_psa_unmap((void __iomem *)afu->afu_map); - afu->afu_map = NULL; + kref_put(&afu->mapcount, afu_unmap); err1: term_mc(cfg, UNDO_START); goto out; @@ -2354,6 +2374,7 @@ static void cxlflash_worker_thread(struct work_struct *work) if (atomic_dec_if_positive(&cfg->scan_host_needed) >= 0) scsi_scan_host(cfg->host); + kref_put(&afu->mapcount, afu_unmap); } /** -- cgit v1.2.3 From 8c67424e2a37005364d0aecd6541d221517a1c4f Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Mon, 14 Dec 2015 15:07:43 -0600 Subject: cxlflash: Enable device id for future IBM CXL adapter [ Upstream commit a2746fb16e41b7c8f02aa4d2605ecce97abbebbd ] This drop enables a future card with a device id of 0x0600 to be recognized by the cxlflash driver. As per the design, the Accelerator Function Unit (AFU) for this new IBM CXL Flash Adapter retains the same host interface as the previous generation. For the early prototypes of the new card, the driver with this change behaves exactly as the driver prior to this behaved with the earlier generation card. Therefore, no card specific programming has been added. These card specific changes can be staged in later if needed. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Andrew Donnellan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/scsi/cxlflash/main.c') diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 463d7e503d88..d1d077420964 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -2309,6 +2309,7 @@ static struct scsi_host_template driver_template = { * Device dependent values */ static struct dev_dependent_vals dev_corsa_vals = { CXLFLASH_MAX_SECTORS }; +static struct dev_dependent_vals dev_flash_gt_vals = { CXLFLASH_MAX_SECTORS }; /* * PCI device binding table @@ -2316,6 +2317,8 @@ static struct dev_dependent_vals dev_corsa_vals = { CXLFLASH_MAX_SECTORS }; static struct pci_device_id cxlflash_pci_table[] = { {PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CORSA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (kernel_ulong_t)&dev_corsa_vals}, + {PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_FLASH_GT, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, (kernel_ulong_t)&dev_flash_gt_vals}, {} }; -- cgit v1.2.3 From 321aeea695a2a9343c20d03f33a475978b55c719 Mon Sep 17 00:00:00 2001 From: "Manoj N. Kumar" Date: Fri, 4 Mar 2016 15:55:19 -0600 Subject: cxlflash: Fix to avoid unnecessary scan with internal LUNs [ Upstream commit 603ecce95f4817074a724a889cd88c3c8210f933 ] When switching to the internal LUN defined on the IBM CXL flash adapter, there is an unnecessary scan occurring on the second port. This scan leads to the following extra lines in the log: Dec 17 10:09:00 tul83p1 kernel: [ 3708.561134] cxlflash 0008:00:00.0: cxlflash_queuecommand: (scp=c0000000fc1f0f00) 11/1/0/0 cdb=(A0000000-00000000-10000000-00000000) Dec 17 10:09:00 tul83p1 kernel: [ 3708.561147] process_cmd_err: cmd failed afu_rc=32 scsi_rc=0 fc_rc=0 afu_extra=0xE, scsi_extra=0x0, fc_extra=0x0 By definition, both of the internal LUNs are on the first port/channel. When the lun_mode is switched to internal LUN the same value for host->max_channel is retained. This causes an unnecessary scan over the second port/channel. This fix alters the host->max_channel to 0 (1 port), if internal LUNs are configured and switches it back to 1 (2 ports) while going back to external LUNs. Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/scsi/cxlflash/main.c') diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index d1d077420964..ad8dc8d4d1c2 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -2149,6 +2149,16 @@ static ssize_t lun_mode_store(struct device *dev, rc = kstrtouint(buf, 10, &lun_mode); if (!rc && (lun_mode < 5) && (lun_mode != afu->internal_lun)) { afu->internal_lun = lun_mode; + + /* + * When configured for internal LUN, there is only one channel, + * channel number 0, else there will be 2 (default). + */ + if (afu->internal_lun) + shost->max_channel = 0; + else + shost->max_channel = NUM_FC_PORTS - 1; + afu_reset(cfg); scsi_scan_host(cfg->host); } -- cgit v1.2.3 From 610f1a8d3700e81d534adb3db07267fa8b20411c Mon Sep 17 00:00:00 2001 From: Manoj Kumar Date: Tue, 30 Aug 2016 00:34:54 -0400 Subject: cxlflash: Fix to escalate LINK_RESET also on port 1 [ Upstream commit a9be294ecb3b9dc82b15625631b153f871181d16 ] The original fix to escalate a 'login timed out' error to a LINK_RESET was only made for one of the two ports on the card. This fix resolves the same issue for the second port (port 1). Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/scsi/cxlflash/main.c') diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index ad8dc8d4d1c2..75ff7bdecca1 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -1141,7 +1141,7 @@ static const struct asyc_intr_info ainfo[] = { {SISL_ASTATUS_FC1_OTHER, "other error", 1, CLR_FC_ERROR | LINK_RESET}, {SISL_ASTATUS_FC1_LOGO, "target initiated LOGO", 1, 0}, {SISL_ASTATUS_FC1_CRC_T, "CRC threshold exceeded", 1, LINK_RESET}, - {SISL_ASTATUS_FC1_LOGI_R, "login timed out, retrying", 1, 0}, + {SISL_ASTATUS_FC1_LOGI_R, "login timed out, retrying", 1, LINK_RESET}, {SISL_ASTATUS_FC1_LOGI_F, "login failed", 1, CLR_FC_ERROR}, {SISL_ASTATUS_FC1_LOGI_S, "login succeeded", 1, SCAN_HOST}, {SISL_ASTATUS_FC1_LINK_DN, "link down", 1, 0}, -- cgit v1.2.3 From d4009e4b6e309e222c5245056f4d6e1a4da88026 Mon Sep 17 00:00:00 2001 From: "Manoj N. Kumar" Date: Tue, 30 Aug 2016 00:35:05 -0400 Subject: cxlflash: Move to exponential back-off when cmd_room is not available [ Upstream commit ea76543127da32dec28af0a13ea1b06625fc085e ] While profiling the cxlflash_queuecommand() path under a heavy load it was found that number of retries to find cmd_room was fairly high. There are two problems with the current back-off: a) It starts with a udelay of 0 b) It backs-off linearly Tried several approaches (a higher multiple 10*n, 100*n, as well as n^2, 2^n) and found that the exponential back-off(2^n) approach had the least overall cost. Cost as being defined as overall time spent waiting. The fix is to change the linear back-off to an exponential back-off. This solution also takes care of the problem with the initial delay (starts with 1 usec). Signed-off-by: Manoj N. Kumar Acked-by: Matthew R. Ochs Reviewed-by: Johannes Thumshirn Signed-off-by: Uma Krishnan Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/cxlflash/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/scsi/cxlflash/main.c') diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index 75ff7bdecca1..c86847c68448 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -289,7 +289,7 @@ static void context_reset(struct afu_cmd *cmd) atomic64_set(&afu->room, room); if (room) goto write_rrin; - udelay(nretry); + udelay(1 << nretry); } while (nretry++ < MC_ROOM_RETRY_CNT); pr_err("%s: no cmd_room to send reset\n", __func__); @@ -303,7 +303,7 @@ write_rrin: if (rrin != 0x1) break; /* Double delay each time */ - udelay(2 << nretry); + udelay(1 << nretry); } while (nretry++ < MC_ROOM_RETRY_CNT); } @@ -338,7 +338,7 @@ retry: atomic64_set(&afu->room, room); if (room) goto write_ioarrin; - udelay(nretry); + udelay(1 << nretry); } while (nretry++ < MC_ROOM_RETRY_CNT); dev_err(dev, "%s: no cmd_room to send 0x%X\n", @@ -352,7 +352,7 @@ retry: * afu->room. */ if (nretry++ < MC_ROOM_RETRY_CNT) { - udelay(nretry); + udelay(1 << nretry); goto retry; } -- cgit v1.2.3