aboutsummaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/DAC960.c18
-rw-r--r--drivers/block/Kconfig15
-rw-r--r--drivers/block/Makefile2
-rw-r--r--drivers/block/brd.c20
-rw-r--r--drivers/block/cciss.c6
-rw-r--r--drivers/block/cciss_scsi.c3
-rw-r--r--drivers/block/drbd/drbd_bitmap.c50
-rw-r--r--drivers/block/drbd/drbd_int.h4
-rw-r--r--drivers/block/drbd/drbd_main.c4
-rw-r--r--drivers/block/drbd/drbd_nl.c6
-rw-r--r--drivers/block/floppy.c56
-rw-r--r--drivers/block/hd.c1
-rw-r--r--drivers/block/loop.c40
-rw-r--r--drivers/block/mtip32xx/Kconfig9
-rw-r--r--drivers/block/mtip32xx/Makefile5
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c4118
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h450
-rw-r--r--drivers/block/nbd.c296
-rw-r--r--drivers/block/nvme.c1740
-rw-r--r--drivers/block/paride/bpck6.c5
-rw-r--r--drivers/block/paride/pcd.c2
-rw-r--r--drivers/block/paride/pd.c3
-rw-r--r--drivers/block/paride/pf.c4
-rw-r--r--drivers/block/paride/pg.c3
-rw-r--r--drivers/block/paride/pt.c4
-rw-r--r--drivers/block/pktcdvd.c8
-rw-r--r--drivers/block/rbd.c731
-rw-r--r--drivers/block/rbd_types.h4
-rw-r--r--drivers/block/sunvdc.c5
-rw-r--r--drivers/block/sx8.c14
-rw-r--r--drivers/block/ub.c42
-rw-r--r--drivers/block/viodasd.c809
-rw-r--r--drivers/block/virtio_blk.c92
-rw-r--r--drivers/block/xd.c3
-rw-r--r--drivers/block/xen-blkback/blkback.c122
-rw-r--r--drivers/block/xen-blkback/common.h73
-rw-r--r--drivers/block/xen-blkback/xenbus.c83
-rw-r--r--drivers/block/xen-blkfront.c120
38 files changed, 7333 insertions, 1637 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index e086fbbbe853..8db9089127c5 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -1177,7 +1177,8 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
int TimeoutCounter;
int i;
-
+ memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
+
if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
return DAC960_Failure(Controller, "DMA mask out of range");
Controller->BounceBufferLimit = DMA_BIT_MASK(32);
@@ -4627,7 +4628,8 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
DAC960_Controller_T *Controller = Command->Controller;
DAC960_CommandType_T CommandType = Command->CommandType;
DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
- DAC960_V2_IOCTL_Opcode_T CommandOpcode = CommandMailbox->Common.IOCTL_Opcode;
+ DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode;
+ DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode;
DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus;
if (CommandType == DAC960_ReadCommand ||
@@ -4699,7 +4701,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
{
if (Controller->ShutdownMonitoringTimer)
return;
- if (CommandOpcode == DAC960_V2_GetControllerInfo)
+ if (IOCTLOpcode == DAC960_V2_GetControllerInfo)
{
DAC960_V2_ControllerInfo_T *NewControllerInfo =
Controller->V2.NewControllerInformation;
@@ -4719,14 +4721,14 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
memcpy(ControllerInfo, NewControllerInfo,
sizeof(DAC960_V2_ControllerInfo_T));
}
- else if (CommandOpcode == DAC960_V2_GetEvent)
+ else if (IOCTLOpcode == DAC960_V2_GetEvent)
{
if (CommandStatus == DAC960_V2_NormalCompletion) {
DAC960_V2_ReportEvent(Controller, Controller->V2.Event);
}
Controller->V2.NextEventSequenceNumber++;
}
- else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
+ else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
CommandStatus == DAC960_V2_NormalCompletion)
{
DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo =
@@ -4915,7 +4917,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
NewPhysicalDeviceInfo->LogicalUnit++;
Controller->V2.PhysicalDeviceIndex++;
}
- else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
+ else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
{
unsigned int DeviceIndex;
for (DeviceIndex = Controller->V2.PhysicalDeviceIndex;
@@ -4938,7 +4940,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
}
Controller->V2.NeedPhysicalDeviceInformation = false;
}
- else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
+ else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
CommandStatus == DAC960_V2_NormalCompletion)
{
DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo =
@@ -5065,7 +5067,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
[LogicalDeviceNumber] = true;
NewLogicalDeviceInfo->LogicalDeviceNumber++;
}
- else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
+ else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
{
int LogicalDriveNumber;
for (LogicalDriveNumber = 0;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 6f07ec1c2f58..a796407123c7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -116,6 +116,8 @@ config PARIDE
source "drivers/block/paride/Kconfig"
+source "drivers/block/mtip32xx/Kconfig"
+
config BLK_CPQ_DA
tristate "Compaq SMART2 support"
depends on PCI && VIRT_TO_BUS
@@ -315,6 +317,17 @@ config BLK_DEV_NBD
If unsure, say N.
+config BLK_DEV_NVME
+ tristate "NVM Express block device"
+ depends on PCI
+ ---help---
+ The NVM Express driver is for solid state drives directly
+ connected to the PCI or PCI Express bus. If you know you
+ don't have one of these, it is safe to answer N.
+
+ To compile this driver as a module, choose M here: the
+ module will be called nvme.
+
config BLK_DEV_OSD
tristate "OSD object-as-blkdev support"
depends on SCSI_OSD_ULD
@@ -341,7 +354,7 @@ config BLK_DEV_SX8
Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
config BLK_DEV_UB
- tristate "Low Performance USB Block driver"
+ tristate "Low Performance USB Block driver (deprecated)"
depends on USB
help
This driver supports certain USB attached storage devices
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 76646e9a1c91..5b795059f8fb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE) += xsysace.o
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
obj-$(CONFIG_MG_DISK) += mg_disk.o
obj-$(CONFIG_SUNVDC) += sunvdc.o
+obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
@@ -39,5 +40,6 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index ec246437f5a4..531ceb31d0ff 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -242,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
- dst = kmap_atomic(page, KM_USER1);
+ dst = kmap_atomic(page);
memcpy(dst + offset, src, copy);
- kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(dst);
if (copy < n) {
src += copy;
@@ -253,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
- dst = kmap_atomic(page, KM_USER1);
+ dst = kmap_atomic(page);
memcpy(dst, src, copy);
- kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(dst);
}
}
@@ -273,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
copy = min_t(size_t, n, PAGE_SIZE - offset);
page = brd_lookup_page(brd, sector);
if (page) {
- src = kmap_atomic(page, KM_USER1);
+ src = kmap_atomic(page);
memcpy(dst, src + offset, copy);
- kunmap_atomic(src, KM_USER1);
+ kunmap_atomic(src);
} else
memset(dst, 0, copy);
@@ -285,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
copy = n - copy;
page = brd_lookup_page(brd, sector);
if (page) {
- src = kmap_atomic(page, KM_USER1);
+ src = kmap_atomic(page);
memcpy(dst, src, copy);
- kunmap_atomic(src, KM_USER1);
+ kunmap_atomic(src);
} else
memset(dst, 0, copy);
}
@@ -309,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
goto out;
}
- mem = kmap_atomic(page, KM_USER0);
+ mem = kmap_atomic(page);
if (rw == READ) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
@@ -317,7 +317,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
flush_dcache_page(page);
copy_to_brd(brd, mem + off, sector, len);
}
- kunmap_atomic(mem, KM_USER0);
+ kunmap_atomic(mem);
out:
return err;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 587cce57adae..b0f553b26d0f 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1735,7 +1735,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
case CCISS_BIG_PASSTHRU:
return cciss_bigpassthru(h, argp);
- /* scsi_cmd_ioctl handles these, below, though some are not */
+ /* scsi_cmd_blk_ioctl handles these, below, though some are not */
/* very meaningful for cciss. SG_IO is the main one people want. */
case SG_GET_VERSION_NUM:
@@ -1746,9 +1746,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
case SG_EMULATED_HOST:
case SG_IO:
case SCSI_IOCTL_SEND_COMMAND:
- return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
+ return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
- /* scsi_cmd_ioctl would normally handle these, below, but */
+ /* scsi_cmd_blk_ioctl would normally handle these, below, but */
/* they aren't a good fit for cciss, as CD-ROMs are */
/* not supported, and we don't have any bus/target/lun */
/* which we present to the kernel. */
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index e820b68d2f6c..acda773b3720 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -866,6 +866,7 @@ cciss_scsi_detect(ctlr_info_t *h)
sh->can_queue = cciss_tape_cmds;
sh->sg_tablesize = h->maxsgentries;
sh->max_cmd_len = MAX_COMMAND_SIZE;
+ sh->max_sectors = h->cciss_max_sectors;
((struct cciss_scsi_adapter_data_t *)
h->scsi_ctlr)->scsi_host = sh;
@@ -1410,7 +1411,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
/* track how many SG entries we are using */
if (request_nsgs > h->maxSG)
h->maxSG = request_nsgs;
- c->Header.SGTotal = (__u8) request_nsgs + chained;
+ c->Header.SGTotal = (u16) request_nsgs + chained;
if (request_nsgs > h->max_cmd_sgentries)
c->Header.SGList = h->max_cmd_sgentries;
else
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 912f585a760f..3030201c69d8 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -289,25 +289,25 @@ static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
return page_nr;
}
-static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
struct page *page = b->bm_pages[idx];
- return (unsigned long *) kmap_atomic(page, km);
+ return (unsigned long *) kmap_atomic(page);
}
static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
{
- return __bm_map_pidx(b, idx, KM_IRQ1);
+ return __bm_map_pidx(b, idx);
}
-static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+static void __bm_unmap(unsigned long *p_addr)
{
- kunmap_atomic(p_addr, km);
+ kunmap_atomic(p_addr);
};
static void bm_unmap(unsigned long *p_addr)
{
- return __bm_unmap(p_addr, KM_IRQ1);
+ return __bm_unmap(p_addr);
}
/* long word offset of _bitmap_ sector */
@@ -543,15 +543,15 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
/* all but last page */
for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
- p_addr = __bm_map_pidx(b, idx, KM_USER0);
+ p_addr = __bm_map_pidx(b, idx);
for (i = 0; i < LWPP; i++)
bits += hweight_long(p_addr[i]);
- __bm_unmap(p_addr, KM_USER0);
+ __bm_unmap(p_addr);
cond_resched();
}
/* last (or only) page */
last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
- p_addr = __bm_map_pidx(b, idx, KM_USER0);
+ p_addr = __bm_map_pidx(b, idx);
for (i = 0; i < last_word; i++)
bits += hweight_long(p_addr[i]);
p_addr[last_word] &= cpu_to_lel(mask);
@@ -559,7 +559,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
/* 32bit arch, may have an unused padding long */
if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
p_addr[last_word+1] = 0;
- __bm_unmap(p_addr, KM_USER0);
+ __bm_unmap(p_addr);
return bits;
}
@@ -970,11 +970,11 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
* to use pre-allocated page pool */
void *src, *dest;
page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
- dest = kmap_atomic(page, KM_USER0);
- src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+ dest = kmap_atomic(page);
+ src = kmap_atomic(b->bm_pages[page_nr]);
memcpy(dest, src, PAGE_SIZE);
- kunmap_atomic(src, KM_USER1);
- kunmap_atomic(dest, KM_USER0);
+ kunmap_atomic(src);
+ kunmap_atomic(dest);
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
@@ -1163,7 +1163,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc
* this returns a bit number, NOT a sector!
*/
static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
- const int find_zero_bit, const enum km_type km)
+ const int find_zero_bit)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr;
@@ -1178,7 +1178,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
while (bm_fo < b->bm_bits) {
/* bit offset of the first bit in the page */
bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
- p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
+ p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
if (find_zero_bit)
i = find_next_zero_bit_le(p_addr,
@@ -1187,7 +1187,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
i = find_next_bit_le(p_addr,
PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
- __bm_unmap(p_addr, km);
+ __bm_unmap(p_addr);
if (i < PAGE_SIZE*8) {
bm_fo = bit_offset + i;
if (bm_fo >= b->bm_bits)
@@ -1215,7 +1215,7 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
if (BM_DONT_TEST & b->bm_flags)
bm_print_lock_info(mdev);
- i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+ i = __bm_find_next(mdev, bm_fo, find_zero_bit);
spin_unlock_irq(&b->bm_lock);
return i;
@@ -1239,13 +1239,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo
unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
- return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+ return __bm_find_next(mdev, bm_fo, 0);
}
unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
- return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+ return __bm_find_next(mdev, bm_fo, 1);
}
/* returns number of bits actually changed.
@@ -1273,14 +1273,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
if (page_nr != last_page_nr) {
if (p_addr)
- __bm_unmap(p_addr, KM_IRQ1);
+ __bm_unmap(p_addr);
if (c < 0)
bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
else if (c > 0)
bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
changed_total += c;
c = 0;
- p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
+ p_addr = __bm_map_pidx(b, page_nr);
last_page_nr = page_nr;
}
if (val)
@@ -1289,7 +1289,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
}
if (p_addr)
- __bm_unmap(p_addr, KM_IRQ1);
+ __bm_unmap(p_addr);
if (c < 0)
bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
else if (c > 0)
@@ -1342,13 +1342,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
- unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
b->bm_set += BITS_PER_LONG - bits;
}
- kunmap_atomic(paddr, KM_IRQ1);
+ kunmap_atomic(paddr);
}
/* Same thing as drbd_bm_set_bits,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 9cf20355ceec..8d680562ba73 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -59,8 +59,8 @@
/* module parameter, defined in drbd_main.c */
extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
+extern bool disable_sendpage;
+extern bool allow_oos;
extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0358e55356c8..211fc44f84be 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -117,8 +117,8 @@ module_param(fault_devs, int, 0644);
/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
-int disable_sendpage;
-int allow_oos;
+bool disable_sendpage;
+bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index af2a25049bce..abfaacaaf346 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -179,7 +179,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
drbd_bcast_ev_helper(mdev, cmd);
- ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret)
dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
@@ -2526,10 +2526,10 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
page = e->pages;
page_chain_for_each(page) {
- void *d = kmap_atomic(page, KM_USER0);
+ void *d = kmap_atomic(page);
unsigned l = min_t(unsigned, len, PAGE_SIZE);
memcpy(tl, d, l);
- kunmap_atomic(d, KM_USER0);
+ kunmap_atomic(d);
tl = (unsigned short*)((char*)tl + l);
len -= l;
if (len == 0)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 510fb10ec45a..b0b00d70c166 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -202,7 +202,6 @@ static int slow_floppy;
#include <asm/dma.h>
#include <asm/irq.h>
-#include <asm/system.h>
static int FLOPPY_IRQ = 6;
static int FLOPPY_DMA = 2;
@@ -1031,37 +1030,6 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
return 0;
}
-static DEFINE_SPINLOCK(floppy_hlt_lock);
-static int hlt_disabled;
-static void floppy_disable_hlt(void)
-{
- unsigned long flags;
-
- WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
- spin_lock_irqsave(&floppy_hlt_lock, flags);
- if (!hlt_disabled) {
- hlt_disabled = 1;
-#ifdef HAVE_DISABLE_HLT
- disable_hlt();
-#endif
- }
- spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
-static void floppy_enable_hlt(void)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&floppy_hlt_lock, flags);
- if (hlt_disabled) {
- hlt_disabled = 0;
-#ifdef HAVE_DISABLE_HLT
- enable_hlt();
-#endif
- }
- spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
static void setup_DMA(void)
{
unsigned long f;
@@ -1106,7 +1074,6 @@ static void setup_DMA(void)
fd_enable_dma();
release_dma_lock(f);
#endif
- floppy_disable_hlt();
}
static void show_floppy(void);
@@ -1708,7 +1675,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
fd_disable_dma();
release_dma_lock(f);
- floppy_enable_hlt();
do_floppy = NULL;
if (fdc >= N_FDC || FDCS->address == -1) {
/* we don't even know which FDC is the culprit */
@@ -1857,8 +1823,6 @@ static void floppy_shutdown(unsigned long data)
show_floppy();
cancel_activity();
- floppy_enable_hlt();
-
flags = claim_dma_lock();
fd_disable_dma();
release_dma_lock(flags);
@@ -3832,7 +3796,7 @@ static int __floppy_read_block_0(struct block_device *bdev)
bio.bi_size = size;
bio.bi_bdev = bdev;
bio.bi_sector = 0;
- bio.bi_flags = BIO_QUIET;
+ bio.bi_flags = (1 << BIO_QUIET);
init_completion(&complete);
bio.bi_private = &complete;
bio.bi_end_io = floppy_rb0_complete;
@@ -4368,8 +4332,14 @@ out_unreg_blkdev:
out_put_disk:
while (dr--) {
del_timer_sync(&motor_off_timer[dr]);
- if (disks[dr]->queue)
+ if (disks[dr]->queue) {
blk_cleanup_queue(disks[dr]->queue);
+ /*
+ * put_disk() is not paired with add_disk() and
+ * will put queue reference one extra time. fix it.
+ */
+ disks[dr]->queue = NULL;
+ }
put_disk(disks[dr]);
}
return err;
@@ -4503,7 +4473,6 @@ static void floppy_release_irq_and_dma(void)
#if N_FDC > 1
set_dor(1, ~8, 0);
#endif
- floppy_enable_hlt();
if (floppy_track_buffer && max_buffer_sectors) {
tmpsize = max_buffer_sectors * 1024;
@@ -4579,6 +4548,15 @@ static void __exit floppy_module_exit(void)
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
+
+ /*
+ * These disks have not called add_disk(). Don't put down
+ * queue reference in put_disk().
+ */
+ if (!(allowed_drive_mask & (1 << drive)) ||
+ fdc_state[FDC(drive)].version == FDC_NONE)
+ disks[drive]->queue = NULL;
+
put_disk(disks[drive]);
}
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index b52c9ca146fc..bf397bf108b7 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -44,7 +44,6 @@
#define HD_IRQ 14
#define REALLY_SLOW_IO
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/uaccess.h>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f00257782fcc..bbca966f8f66 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -93,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd,
struct page *loop_page, unsigned loop_off,
int size, sector_t real_block)
{
- char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
- char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+ char *raw_buf = kmap_atomic(raw_page) + raw_off;
+ char *loop_buf = kmap_atomic(loop_page) + loop_off;
if (cmd == READ)
memcpy(loop_buf, raw_buf, size);
else
memcpy(raw_buf, loop_buf, size);
- kunmap_atomic(loop_buf, KM_USER1);
- kunmap_atomic(raw_buf, KM_USER0);
+ kunmap_atomic(loop_buf);
+ kunmap_atomic(raw_buf);
cond_resched();
return 0;
}
@@ -112,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
struct page *loop_page, unsigned loop_off,
int size, sector_t real_block)
{
- char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
- char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+ char *raw_buf = kmap_atomic(raw_page) + raw_off;
+ char *loop_buf = kmap_atomic(loop_page) + loop_off;
char *in, *out, *key;
int i, keysize;
@@ -130,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
for (i = 0; i < size; i++)
*out++ = *in++ ^ key[(i & 511) % keysize];
- kunmap_atomic(loop_buf, KM_USER1);
- kunmap_atomic(raw_buf, KM_USER0);
+ kunmap_atomic(loop_buf);
+ kunmap_atomic(raw_buf);
cond_resched();
return 0;
}
@@ -356,14 +356,14 @@ lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
return __splice_from_pipe(pipe, sd, lo_splice_actor);
}
-static int
+static ssize_t
do_lo_receive(struct loop_device *lo,
struct bio_vec *bvec, int bsize, loff_t pos)
{
struct lo_read_data cookie;
struct splice_desc sd;
struct file *file;
- long retval;
+ ssize_t retval;
cookie.lo = lo;
cookie.page = bvec->bv_page;
@@ -379,26 +379,28 @@ do_lo_receive(struct loop_device *lo,
file = lo->lo_backing_file;
retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
- if (retval < 0)
- return retval;
- if (retval != bvec->bv_len)
- return -EIO;
- return 0;
+ return retval;
}
static int
lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
{
struct bio_vec *bvec;
- int i, ret = 0;
+ ssize_t s;
+ int i;
bio_for_each_segment(bvec, bio, i) {
- ret = do_lo_receive(lo, bvec, bsize, pos);
- if (ret < 0)
+ s = do_lo_receive(lo, bvec, bsize, pos);
+ if (s < 0)
+ return s;
+
+ if (s != bvec->bv_len) {
+ zero_fill_bio(bio);
break;
+ }
pos += bvec->bv_len;
}
- return ret;
+ return 0;
}
static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 000000000000..0ba837fc62a8
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,9 @@
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+ tristate "Block Device Driver for Micron PCIe SSDs"
+ depends on PCI
+ help
+ This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 000000000000..4fbef8c8329b
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 000000000000..00f9fc992090
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,4118 @@
+/*
+ * Driver for the Micron P320 SSD
+ * Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ * Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include <linux/export.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
+#define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16))
+#define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS)
+#define HW_PORT_PRIV_DMA_SZ \
+ (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
+
+#define HOST_CAP_NZDMA (1 << 19)
+#define HOST_HSORG 0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV 0xFF00
+#define HSORG_STYLE 0x8
+#define HSORG_SLOTGROUPS 0x7
+
+#define PORT_COMMAND_ISSUE 0x38
+#define PORT_SDBV 0x7C
+
+#define PORT_OFFSET 0x100
+#define PORT_MEM_SIZE 0x80
+
+#define PORT_IRQ_ERR \
+ (PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+ PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+ PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+ PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+ (PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+ (PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+ PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+ PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+ (PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN 0x00
+#define MTIP_PRODUCT_ASICFPGA 0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+
+static DEFINE_SPINLOCK(rssd_index_lock);
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+ __u8 io_ports[8];
+ __u8 hob_ports[8];
+ ide_reg_valid_t out_flags;
+ ide_reg_valid_t in_flags;
+ int data_phase;
+ int req_cmd;
+ compat_ulong_t out_size;
+ compat_ulong_t in_size;
+};
+#endif
+
+/*
+ * This function check_for_surprise_removal is called
+ * while card is removed from the system and it will
+ * read the vendor id from the configration space
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ * true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct pci_dev *pdev)
+{
+ u16 vendor_id = 0;
+
+ /* Read the vendorID from the configuration space */
+ pci_read_config_word(pdev, 0x00, &vendor_id);
+ if (vendor_id == 0xFFFF)
+ return true; /* device removed */
+
+ return false; /* device present */
+}
+
+/*
+ * This function is called for clean the pending command in the
+ * command slot during the surprise removal of device and return
+ * error to the upper layer.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_command_cleanup(struct driver_data *dd)
+{
+ int group = 0, commandslot = 0, commandindex = 0;
+ struct mtip_cmd *command;
+ struct mtip_port *port = dd->port;
+ static int in_progress;
+
+ if (in_progress)
+ return;
+
+ in_progress = 1;
+
+ for (group = 0; group < 4; group++) {
+ for (commandslot = 0; commandslot < 32; commandslot++) {
+ if (!(port->allocated[group] & (1 << commandslot)))
+ continue;
+
+ commandindex = group << 5 | commandslot;
+ command = &port->commands[commandindex];
+
+ if (atomic_read(&command->active)
+ && (command->async_callback)) {
+ command->async_callback(command->async_data,
+ -ENODEV);
+ command->async_callback = NULL;
+ command->async_data = NULL;
+ }
+
+ dma_unmap_sg(&port->dd->pdev->dev,
+ command->sg,
+ command->scatter_ents,
+ command->direction);
+ }
+ }
+
+ up(&port->cmd_slot);
+
+ set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+ in_progress = 0;
+}
+
+/*
+ * Obtain an empty command slot.
+ *
+ * This function needs to be reentrant since it could be called
+ * at the same time on multiple CPUs. The allocation of the
+ * command slot must be atomic.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * >= 0 Index of command slot obtained.
+ * -1 No command slots available.
+ */
+static int get_slot(struct mtip_port *port)
+{
+ int slot, i;
+ unsigned int num_command_slots = port->dd->slot_groups * 32;
+
+ /*
+ * Try 10 times, because there is a small race here.
+ * that's ok, because it's still cheaper than a lock.
+ *
+ * Race: Since this section is not protected by lock, same bit
+ * could be chosen by different process contexts running in
+ * different processor. So instead of costly lock, we are going
+ * with loop.
+ */
+ for (i = 0; i < 10; i++) {
+ slot = find_next_zero_bit(port->allocated,
+ num_command_slots, 1);
+ if ((slot < num_command_slots) &&
+ (!test_and_set_bit(slot, port->allocated)))
+ return slot;
+ }
+ dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
+
+ if (mtip_check_surprise_removal(port->dd->pdev)) {
+ /* Device not present, clean outstanding commands */
+ mtip_command_cleanup(port->dd);
+ }
+ return -1;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of command to release
+ *
+ * return value
+ * None
+ */
+static inline void release_slot(struct mtip_port *port, int tag)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(tag, port->allocated);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * Just like hba_reset, except does not call sleep, so can be
+ * run from interrupt/tasklet context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 The reset was successful.
+ * -1 The HBA Reset bit did not clear.
+ */
+static int hba_reset_nosleep(struct driver_data *dd)
+{
+ unsigned long timeout;
+
+ /* Chip quirk: quiesce any chip function */
+ mdelay(10);
+
+ /* Set the reset bit */
+ writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+ /* Flush */
+ readl(dd->mmio + HOST_CTL);
+
+ /*
+ * Wait 10ms then spin for up to 1 second
+ * waiting for reset acknowledgement
+ */
+ timeout = jiffies + msecs_to_jiffies(1000);
+ mdelay(10);
+ while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+ && time_before(jiffies, timeout))
+ mdelay(1);
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+ return -1;
+
+ if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag The tag of the command to be issued.
+ *
+ * return value
+ * None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+ unsigned long flags = 0;
+
+ atomic_set(&port->commands[tag].active, 1);
+
+ spin_lock_irqsave(&port->cmd_issue_lock, flags);
+
+ writel((1 << MTIP_TAG_BIT(tag)),
+ port->s_active[MTIP_TAG_INDEX(tag)]);
+ writel((1 << MTIP_TAG_BIT(tag)),
+ port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+
+ spin_unlock_irqrestore(&port->cmd_issue_lock, flags);
+
+ /* Set the command's timeout value.*/
+ port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
+ MTIP_NCQ_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Enable/disable the reception of FIS
+ *
+ * @port Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ * Previous state: 1 enabled, 0 disabled
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+ u32 tmp;
+
+ /* enable FIS reception */
+ tmp = readl(port->mmio + PORT_CMD);
+ if (enable)
+ writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+ else
+ writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+
+ /* Flush */
+ readl(port->mmio + PORT_CMD);
+
+ return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX));
+}
+
+/*
+ * Enable/disable the DMA engine
+ *
+ * @port Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ * Previous state: 1 enabled, 0 disabled.
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+ u32 tmp;
+
+ /* enable FIS reception */
+ tmp = readl(port->mmio + PORT_CMD);
+ if (enable)
+ writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD);
+ else
+ writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD);
+
+ readl(port->mmio + PORT_CMD);
+ return (((tmp & PORT_CMD_START) == PORT_CMD_START));
+}
+
+/*
+ * Enables the port DMA engine and FIS reception.
+ *
+ * return value
+ * None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+ /* Enable FIS reception */
+ mtip_enable_fis(port, 1);
+
+ /* Enable the DMA engine */
+ mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ * None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+ /* Disable interrupts on this port */
+ writel(0, port->mmio + PORT_IRQ_MASK);
+
+ /* Disable the DMA engine */
+ mtip_enable_engine(port, 0);
+
+ /* Disable FIS reception */
+ mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+ int i;
+ mtip_deinit_port(port);
+
+ /* Program the command list base and FIS base addresses */
+ if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+ writel((port->command_list_dma >> 16) >> 16,
+ port->mmio + PORT_LST_ADDR_HI);
+ writel((port->rxfis_dma >> 16) >> 16,
+ port->mmio + PORT_FIS_ADDR_HI);
+ }
+
+ writel(port->command_list_dma & 0xFFFFFFFF,
+ port->mmio + PORT_LST_ADDR);
+ writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+ /* Clear SError */
+ writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+ /* reset the completed registers.*/
+ for (i = 0; i < port->dd->slot_groups; i++)
+ writel(0xFFFFFFFF, port->completed[i]);
+
+ /* Clear any pending interrupts for this port */
+ writel(readl(port->dd->mmio + PORT_IRQ_STAT),
+ port->dd->mmio + PORT_IRQ_STAT);
+
+ /* Clear any pending interrupts on the HBA. */
+ writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+ port->dd->mmio + HOST_IRQ_STAT);
+
+ /* Enable port interrupts */
+ writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+ unsigned long timeout;
+
+ /* Disable the DMA engine */
+ mtip_enable_engine(port, 0);
+
+ /* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */
+ timeout = jiffies + msecs_to_jiffies(500);
+ while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+ && time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ /*
+ * Chip quirk: escalate to hba reset if
+ * PxCMD.CR not clear after 500 ms
+ */
+ if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+ dev_warn(&port->dd->pdev->dev,
+ "PxCMD.CR not clear, escalating reset\n");
+
+ if (hba_reset_nosleep(port->dd))
+ dev_err(&port->dd->pdev->dev,
+ "HBA reset escalation failed.\n");
+
+ /* 30 ms delay before com reset to quiesce chip */
+ mdelay(30);
+ }
+
+ dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+ /* Set PxSCTL.DET */
+ writel(readl(port->mmio + PORT_SCR_CTL) |
+ 1, port->mmio + PORT_SCR_CTL);
+ readl(port->mmio + PORT_SCR_CTL);
+
+ /* Wait 1 ms to quiesce chip function */
+ timeout = jiffies + msecs_to_jiffies(1);
+ while (time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ /* Clear PxSCTL.DET */
+ writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+ port->mmio + PORT_SCR_CTL);
+ readl(port->mmio + PORT_SCR_CTL);
+
+ /* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */
+ timeout = jiffies + msecs_to_jiffies(500);
+ while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+ && time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+ dev_warn(&port->dd->pdev->dev,
+ "COM reset failed\n");
+
+ mtip_init_port(port);
+ mtip_start_port(port);
+
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+ char *msg,
+ unsigned long *tagbits,
+ int cnt)
+{
+ unsigned char tagmap[128];
+ int group, tagmap_len = 0;
+
+ memset(tagmap, 0, sizeof(tagmap));
+ for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+ tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ",
+ tagbits[group-1]);
+ dev_warn(&dd->pdev->dev,
+ "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
+}
+
+/*
+ * Called periodically to see if any read/write commands are
+ * taking too long to complete.
+ *
+ * @data Pointer to the PORT data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_timeout_function(unsigned long int data)
+{
+ struct mtip_port *port = (struct mtip_port *) data;
+ struct host_to_dev_fis *fis;
+ struct mtip_cmd *command;
+ int tag, cmdto_cnt = 0;
+ unsigned int bit, group;
+ unsigned int num_command_slots = port->dd->slot_groups * 32;
+ unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
+
+ if (unlikely(!port))
+ return;
+
+ if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
+ mod_timer(&port->cmd_timer,
+ jiffies + msecs_to_jiffies(30000));
+ return;
+ }
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ for (tag = 0; tag < num_command_slots; tag++) {
+ /*
+ * Skip internal command slot as it has
+ * its own timeout mechanism
+ */
+ if (tag == MTIP_TAG_INTERNAL)
+ continue;
+
+ if (atomic_read(&port->commands[tag].active) &&
+ (time_after(jiffies, port->commands[tag].comp_time))) {
+ group = tag >> 5;
+ bit = tag & 0x1F;
+
+ command = &port->commands[tag];
+ fis = (struct host_to_dev_fis *) command->command;
+
+ set_bit(tag, tagaccum);
+ cmdto_cnt++;
+ if (cmdto_cnt == 1)
+ set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+ /*
+ * Clear the completed bit. This should prevent
+ * any interrupt handlers from trying to retire
+ * the command.
+ */
+ writel(1 << bit, port->completed[group]);
+
+ /* Call the async completion callback. */
+ if (likely(command->async_callback))
+ command->async_callback(command->async_data,
+ -EIO);
+ command->async_callback = NULL;
+ command->comp_func = NULL;
+
+ /* Unmap the DMA scatter list entries */
+ dma_unmap_sg(&port->dd->pdev->dev,
+ command->sg,
+ command->scatter_ents,
+ command->direction);
+
+ /*
+ * Clear the allocated bit and active tag for the
+ * command.
+ */
+ atomic_set(&port->commands[tag].active, 0);
+ release_slot(port, tag);
+
+ up(&port->cmd_slot);
+ }
+ }
+
+ if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+ print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
+
+ mtip_restart_port(port);
+ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ }
+
+ if (port->ic_pause_timer) {
+ to = port->ic_pause_timer + msecs_to_jiffies(1000);
+ if (time_after(jiffies, to)) {
+ if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+ port->ic_pause_timer = 0;
+ clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ }
+
+
+ }
+ }
+
+ /* Restart the timer */
+ mod_timer(&port->cmd_timer,
+ jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of the command.
+ * @data Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ * None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status)
+{
+ struct mtip_cmd *command;
+ struct driver_data *dd = data;
+ int cb_status = status ? -EIO : 0;
+
+ if (unlikely(!dd) || unlikely(!port))
+ return;
+
+ command = &port->commands[tag];
+
+ if (unlikely(status == PORT_IRQ_TF_ERR)) {
+ dev_warn(&port->dd->pdev->dev,
+ "Command tag %d failed due to TFE\n", tag);
+ }
+
+ /* Upper layer callback */
+ if (likely(command->async_callback))
+ command->async_callback(command->async_data, cb_status);
+
+ command->async_callback = NULL;
+ command->comp_func = NULL;
+
+ /* Unmap the DMA scatter list entries */
+ dma_unmap_sg(&dd->pdev->dev,
+ command->sg,
+ command->scatter_ents,
+ command->direction);
+
+ /* Clear the allocated and active bits for the command */
+ atomic_set(&port->commands[tag].active, 0);
+ release_slot(port, tag);
+
+ up(&port->cmd_slot);
+}
+
+/*
+ * Internal command completion callback function.
+ *
+ * This function is normally called by the driver ISR when an internal
+ * command completed. This function signals the command completion by
+ * calling complete().
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of the command that has completed.
+ * @data Pointer to a completion structure.
+ * @status Completion status.
+ *
+ * return value
+ * None
+ */
+static void mtip_completion(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status)
+{
+ struct mtip_cmd *command = &port->commands[tag];
+ struct completion *waiting = data;
+ if (unlikely(status == PORT_IRQ_TF_ERR))
+ dev_warn(&port->dd->pdev->dev,
+ "Internal command %d completed with TFE\n", tag);
+
+ command->async_callback = NULL;
+ command->comp_func = NULL;
+
+ complete(waiting);
+}
+
+static void mtip_null_completion(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status)
+{
+ return;
+}
+
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+ dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+ struct smart_attr *attrib);
+/*
+ * Handle an error.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+ int group, tag, bit, reissue, rv;
+ struct mtip_port *port;
+ struct mtip_cmd *cmd;
+ u32 completed;
+ struct host_to_dev_fis *fis;
+ unsigned long tagaccum[SLOTBITS_IN_LONGS];
+ unsigned int cmd_cnt = 0;
+ unsigned char *buf;
+ char *fail_reason = NULL;
+ int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+
+ dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+ port = dd->port;
+
+ /* Stop the timer to prevent command timeouts. */
+ del_timer(&port->cmd_timer);
+
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ /* Set eh_active */
+ set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+ /* Loop through all the groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ completed = readl(port->completed[group]);
+
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
+
+ /* Process successfully completed commands */
+ for (bit = 0; bit < 32 && completed; bit++) {
+ if (!(completed & (1<<bit)))
+ continue;
+ tag = (group << 5) + bit;
+
+ /* Skip the internal command slot */
+ if (tag == MTIP_TAG_INTERNAL)
+ continue;
+
+ cmd = &port->commands[tag];
+ if (likely(cmd->comp_func)) {
+ set_bit(tag, tagaccum);
+ cmd_cnt++;
+ atomic_set(&cmd->active, 0);
+ cmd->comp_func(port,
+ tag,
+ cmd->comp_data,
+ 0);
+ } else {
+ dev_err(&port->dd->pdev->dev,
+ "Missing completion func for tag %d",
+ tag);
+ if (mtip_check_surprise_removal(dd->pdev)) {
+ mtip_command_cleanup(dd);
+ /* don't proceed further */
+ return;
+ }
+ }
+ }
+ }
+
+ print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
+
+ /* Restart the port */
+ mdelay(20);
+ mtip_restart_port(port);
+
+ /* Trying to determine the cause of the error */
+ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+ dd->port->log_buf,
+ dd->port->log_buf_dma, 1);
+ if (rv) {
+ dev_warn(&dd->pdev->dev,
+ "Error in READ LOG EXT (10h) command\n");
+ /* non-critical error, don't fail the load */
+ } else {
+ buf = (unsigned char *)dd->port->log_buf;
+ if (buf[259] & 0x1) {
+ dev_info(&dd->pdev->dev,
+ "Write protect bit is set.\n");
+ set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+ fail_all_ncq_write = 1;
+ fail_reason = "write protect";
+ }
+ if (buf[288] == 0xF7) {
+ dev_info(&dd->pdev->dev,
+ "Exceeded Tmax, drive in thermal shutdown.\n");
+ set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+ fail_all_ncq_cmds = 1;
+ fail_reason = "thermal shutdown";
+ }
+ if (buf[288] == 0xBF) {
+ dev_info(&dd->pdev->dev,
+ "Drive indicates rebuild has failed.\n");
+ fail_all_ncq_cmds = 1;
+ fail_reason = "rebuild failed";
+ }
+ }
+
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ /* Loop through all the groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ for (bit = 0; bit < 32; bit++) {
+ reissue = 1;
+ tag = (group << 5) + bit;
+ cmd = &port->commands[tag];
+
+ /* If the active bit is set re-issue the command */
+ if (atomic_read(&cmd->active) == 0)
+ continue;
+
+ fis = (struct host_to_dev_fis *)cmd->command;
+
+ /* Should re-issue? */
+ if (tag == MTIP_TAG_INTERNAL ||
+ fis->command == ATA_CMD_SET_FEATURES)
+ reissue = 0;
+ else {
+ if (fail_all_ncq_cmds ||
+ (fail_all_ncq_write &&
+ fis->command == ATA_CMD_FPDMA_WRITE)) {
+ dev_warn(&dd->pdev->dev,
+ " Fail: %s w/tag %d [%s].\n",
+ fis->command == ATA_CMD_FPDMA_WRITE ?
+ "write" : "read",
+ tag,
+ fail_reason != NULL ?
+ fail_reason : "unknown");
+ atomic_set(&cmd->active, 0);
+ if (cmd->comp_func) {
+ cmd->comp_func(port, tag,
+ cmd->comp_data,
+ -ENODATA);
+ }
+ continue;
+ }
+ }
+
+ /*
+ * First check if this command has
+ * exceeded its retries.
+ */
+ if (reissue && (cmd->retries-- > 0)) {
+
+ set_bit(tag, tagaccum);
+
+ /* Re-issue the command. */
+ mtip_issue_ncq_command(port, tag);
+
+ continue;
+ }
+
+ /* Retire a command that will not be reissued */
+ dev_warn(&port->dd->pdev->dev,
+ "retiring tag %d\n", tag);
+ atomic_set(&cmd->active, 0);
+
+ if (cmd->comp_func)
+ cmd->comp_func(
+ port,
+ tag,
+ cmd->comp_data,
+ PORT_IRQ_TF_ERR);
+ else
+ dev_warn(&port->dd->pdev->dev,
+ "Bad completion for tag %d\n",
+ tag);
+ }
+ }
+ print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
+
+ /* clear eh_active */
+ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+
+ mod_timer(&port->cmd_timer,
+ jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+}
+
+/*
+ * Handle a set device bits interrupt
+ */
+static inline void mtip_process_sdbf(struct driver_data *dd)
+{
+ struct mtip_port *port = dd->port;
+ int group, tag, bit;
+ u32 completed;
+ struct mtip_cmd *command;
+
+ /* walk all bits in all slot groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ completed = readl(port->completed[group]);
+
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
+
+ /* Process completed commands. */
+ for (bit = 0;
+ (bit < 32) && completed;
+ bit++, completed >>= 1) {
+ if (completed & 0x01) {
+ tag = (group << 5) | bit;
+
+ /* skip internal command slot. */
+ if (unlikely(tag == MTIP_TAG_INTERNAL))
+ continue;
+
+ command = &port->commands[tag];
+ /* make internal callback */
+ if (likely(command->comp_func)) {
+ command->comp_func(
+ port,
+ tag,
+ command->comp_data,
+ 0);
+ } else {
+ dev_warn(&dd->pdev->dev,
+ "Null completion "
+ "for tag %d",
+ tag);
+
+ if (mtip_check_surprise_removal(
+ dd->pdev)) {
+ mtip_command_cleanup(dd);
+ return;
+ }
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Process legacy pio and d2h interrupts
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+ struct mtip_port *port = dd->port;
+ struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL];
+
+ if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+ (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL))) {
+ if (cmd->comp_func) {
+ cmd->comp_func(port,
+ MTIP_TAG_INTERNAL,
+ cmd->comp_data,
+ 0);
+ return;
+ }
+ }
+
+ return;
+}
+
+/*
+ * Demux and handle errors
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+ if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)))
+ mtip_handle_tfe(dd);
+
+ if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+ dev_warn(&dd->pdev->dev,
+ "Clearing PxSERR.DIAG.x\n");
+ writel((1 << 26), dd->port->mmio + PORT_SCR_ERR);
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+ dev_warn(&dd->pdev->dev,
+ "Clearing PxSERR.DIAG.n\n");
+ writel((1 << 16), dd->port->mmio + PORT_SCR_ERR);
+ }
+
+ if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) {
+ dev_warn(&dd->pdev->dev,
+ "Port stat errors %x unhandled\n",
+ (port_stat & ~PORT_IRQ_HANDLED));
+ }
+}
+
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+ struct driver_data *dd = (struct driver_data *) data;
+ struct mtip_port *port = dd->port;
+ u32 hba_stat, port_stat;
+ int rv = IRQ_NONE;
+
+ hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+ if (hba_stat) {
+ rv = IRQ_HANDLED;
+
+ /* Acknowledge the interrupt status on the port.*/
+ port_stat = readl(port->mmio + PORT_IRQ_STAT);
+ writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+ /* Demux port status */
+ if (likely(port_stat & PORT_IRQ_SDB_FIS))
+ mtip_process_sdbf(dd);
+
+ if (unlikely(port_stat & PORT_IRQ_ERR)) {
+ if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+ mtip_command_cleanup(dd);
+ /* don't proceed further */
+ return IRQ_HANDLED;
+ }
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag))
+ return rv;
+
+ mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_LEGACY))
+ mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+ }
+
+ /* acknowledge interrupt */
+ writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+ return rv;
+}
+
+/*
+ * Wrapper for mtip_handle_irq
+ * (ignores return code)
+ */
+static void mtip_tasklet(unsigned long data)
+{
+ mtip_handle_irq((struct driver_data *) data);
+}
+
+/*
+ * HBA interrupt subroutine.
+ *
+ * @irq IRQ number.
+ * @instance Pointer to the driver data structure.
+ *
+ * return value
+ * IRQ_HANDLED A HBA interrupt was pending and handled.
+ * IRQ_NONE This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+ struct driver_data *dd = instance;
+ tasklet_schedule(&dd->tasklet);
+ return IRQ_HANDLED;
+}
+
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+ atomic_set(&port->commands[tag].active, 1);
+ writel(1 << MTIP_TAG_BIT(tag),
+ port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+static bool mtip_pause_ncq(struct mtip_port *port,
+ struct host_to_dev_fis *fis)
+{
+ struct host_to_dev_fis *reply;
+ unsigned long task_file_data;
+
+ reply = port->rxfis + RX_FIS_D2H_REG;
+ task_file_data = readl(port->mmio+PORT_TFDATA);
+
+ if ((task_file_data & 1) || (fis->command == ATA_CMD_SEC_ERASE_UNIT))
+ return false;
+
+ if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+ set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = jiffies;
+ return true;
+ } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+ (fis->features == 0x03)) {
+ set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = jiffies;
+ return true;
+ } else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+ ((fis->command == 0xFC) &&
+ (fis->features == 0x27 || fis->features == 0x72 ||
+ fis->features == 0x62 || fis->features == 0x26))) {
+ /* Com reset after secure erase or lowlevel format */
+ mtip_restart_port(port);
+ return false;
+ }
+
+ return false;
+}
+
+/*
+ * Wait for port to quiesce
+ *
+ * @port Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ *
+ * return value
+ * 0 Success
+ * -EBUSY Commands still active
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+ unsigned long to;
+ unsigned int n;
+ unsigned int active = 1;
+
+ to = jiffies + msecs_to_jiffies(timeout);
+ do {
+ if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+ test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+ msleep(20);
+ continue; /* svc thd is actively issuing commands */
+ }
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return -EFAULT;
+ /*
+ * Ignore s_active bit 0 of array element 0.
+ * This bit will always be set
+ */
+ active = readl(port->s_active[0]) & 0xFFFFFFFE;
+ for (n = 1; n < port->dd->slot_groups; n++)
+ active |= readl(port->s_active[n]);
+
+ if (!active)
+ break;
+
+ msleep(20);
+ } while (time_before(jiffies, to));
+
+ return active ? -EBUSY : 0;
+}
+
+/*
+ * Execute an internal command and wait for the completion.
+ *
+ * @port Pointer to the port data structure.
+ * @fis Pointer to the FIS that describes the command.
+ * @fis_len Length in WORDS of the FIS.
+ * @buffer DMA accessible for command data.
+ * @buf_len Length, in bytes, of the data buffer.
+ * @opts Command header options, excluding the FIS length
+ * and the number of PRD entries.
+ * @timeout Time in ms to wait for the command to complete.
+ *
+ * return value
+ * 0 Command completed successfully.
+ * -EFAULT The buffer address is not correctly aligned.
+ * -EBUSY Internal command or other IO in progress.
+ * -EAGAIN Time out waiting for command to complete.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+ struct host_to_dev_fis *fis,
+ int fis_len,
+ dma_addr_t buffer,
+ int buf_len,
+ u32 opts,
+ gfp_t atomic,
+ unsigned long timeout)
+{
+ struct mtip_cmd_sg *command_sg;
+ DECLARE_COMPLETION_ONSTACK(wait);
+ int rv = 0, ready2go = 1;
+ struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
+ unsigned long to;
+
+ /* Make sure the buffer is 8 byte aligned. This is asic specific. */
+ if (buffer & 0x00000007) {
+ dev_err(&port->dd->pdev->dev,
+ "SG buffer is not 8 byte aligned\n");
+ return -EFAULT;
+ }
+
+ to = jiffies + msecs_to_jiffies(timeout);
+ do {
+ ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL,
+ port->allocated);
+ if (ready2go)
+ break;
+ mdelay(100);
+ } while (time_before(jiffies, to));
+ if (!ready2go) {
+ dev_warn(&port->dd->pdev->dev,
+ "Internal cmd active. new cmd [%02X]\n", fis->command);
+ return -EBUSY;
+ }
+ set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = 0;
+
+ if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+ clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ else if (fis->command == ATA_CMD_DOWNLOAD_MICRO)
+ clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+
+ if (atomic == GFP_KERNEL) {
+ if (fis->command != ATA_CMD_STANDBYNOW1) {
+ /* wait for io to complete if non atomic */
+ if (mtip_quiesce_io(port, 5000) < 0) {
+ dev_warn(&port->dd->pdev->dev,
+ "Failed to quiesce IO\n");
+ release_slot(port, MTIP_TAG_INTERNAL);
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ return -EBUSY;
+ }
+ }
+
+ /* Set the completion function and data for the command. */
+ int_cmd->comp_data = &wait;
+ int_cmd->comp_func = mtip_completion;
+
+ } else {
+ /* Clear completion - we're going to poll */
+ int_cmd->comp_data = NULL;
+ int_cmd->comp_func = mtip_null_completion;
+ }
+
+ /* Copy the command to the command table */
+ memcpy(int_cmd->command, fis, fis_len*4);
+
+ /* Populate the SG list */
+ int_cmd->command_header->opts =
+ __force_bit2int cpu_to_le32(opts | fis_len);
+ if (buf_len) {
+ command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+ command_sg->info =
+ __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF);
+ command_sg->dba =
+ __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF);
+ command_sg->dba_upper =
+ __force_bit2int cpu_to_le32((buffer >> 16) >> 16);
+
+ int_cmd->command_header->opts |=
+ __force_bit2int cpu_to_le32((1 << 16));
+ }
+
+ /* Populate the command header */
+ int_cmd->command_header->byte_count = 0;
+
+ /* Issue the command to the hardware */
+ mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL);
+
+ /* Poll if atomic, wait_for_completion otherwise */
+ if (atomic == GFP_KERNEL) {
+ /* Wait for the command to complete or timeout. */
+ if (wait_for_completion_timeout(
+ &wait,
+ msecs_to_jiffies(timeout)) == 0) {
+ dev_err(&port->dd->pdev->dev,
+ "Internal command did not complete [%d] "
+ "within timeout of %lu ms\n",
+ atomic, timeout);
+ if (mtip_check_surprise_removal(port->dd->pdev) ||
+ test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ rv = -EAGAIN;
+ }
+
+ if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL)) {
+ dev_warn(&port->dd->pdev->dev,
+ "Retiring internal command but CI is 1.\n");
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ hba_reset_nosleep(port->dd);
+ rv = -ENXIO;
+ } else {
+ mtip_restart_port(port);
+ rv = -EAGAIN;
+ }
+ goto exec_ic_exit;
+ }
+
+ } else {
+ /* Spin for <timeout> checking if command still outstanding */
+ timeout = jiffies + msecs_to_jiffies(timeout);
+ while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL))
+ && time_before(jiffies, timeout)) {
+ if (mtip_check_surprise_removal(port->dd->pdev)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ if ((fis->command != ATA_CMD_STANDBYNOW1) &&
+ test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ }
+
+ if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL)) {
+ dev_err(&port->dd->pdev->dev,
+ "Internal command did not complete [atomic]\n");
+ rv = -EAGAIN;
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &port->dd->dd_flag)) {
+ hba_reset_nosleep(port->dd);
+ rv = -ENXIO;
+ } else {
+ mtip_restart_port(port);
+ rv = -EAGAIN;
+ }
+ }
+ }
+exec_ic_exit:
+ /* Clear the allocated and active bits for the internal command. */
+ atomic_set(&int_cmd->active, 0);
+ release_slot(port, MTIP_TAG_INTERNAL);
+ if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+ /* NCQ paused */
+ return rv;
+ }
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+
+ return rv;
+}
+
+/*
+ * Byte-swap ATA ID strings.
+ *
+ * ATA identify data contains strings in byte-swapped 16-bit words.
+ * They must be swapped (on all architectures) to be usable as C strings.
+ * This function swaps bytes in-place.
+ *
+ * @buf The buffer location of the string
+ * @len The number of bytes to swap
+ *
+ * return value
+ * None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+ int i;
+ for (i = 0; i < (len/2); i++)
+ be16_to_cpus(&buf[i]);
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port Pointer to the port structure.
+ * @user_buffer A user space buffer where the identify data should be
+ * copied.
+ *
+ * return value
+ * 0 Command completed successfully.
+ * -EFAULT An error occurred while coping data to the user buffer.
+ * -1 Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+ int rv = 0;
+ struct host_to_dev_fis fis;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return -EFAULT;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_ID_ATA;
+
+ /* Set the identify information as invalid. */
+ port->identify_valid = 0;
+
+ /* Clear the identify information. */
+ memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ port->identify_dma,
+ sizeof(u16) * ATA_ID_WORDS,
+ 0,
+ GFP_KERNEL,
+ MTIP_INTERNAL_COMMAND_TIMEOUT_MS)
+ < 0) {
+ rv = -1;
+ goto out;
+ }
+
+ /*
+ * Perform any necessary byte-swapping. Yes, the kernel does in fact
+ * perform field-sensitive swapping on the string fields.
+ * See the kernel use of ata_id_string() for proof of this.
+ */
+#ifdef __LITTLE_ENDIAN
+ ata_swap_string(port->identify + 27, 40); /* model string*/
+ ata_swap_string(port->identify + 23, 8); /* firmware string*/
+ ata_swap_string(port->identify + 10, 20); /* serial# string*/
+#else
+ {
+ int i;
+ for (i = 0; i < ATA_ID_WORDS; i++)
+ port->identify[i] = le16_to_cpu(port->identify[i]);
+ }
+#endif
+
+ /* Set the identify buffer as valid. */
+ port->identify_valid = 1;
+
+ if (user_buffer) {
+ if (copy_to_user(
+ user_buffer,
+ port->identify,
+ ATA_ID_WORDS * sizeof(u16))) {
+ rv = -EFAULT;
+ goto out;
+ }
+ }
+
+out:
+ return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ * 0 Command was executed successfully.
+ * -1 An error occurred while executing the command.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+ int rv;
+ struct host_to_dev_fis fis;
+ unsigned long start;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_STANDBYNOW1;
+
+ start = jiffies;
+ rv = mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ 0,
+ 0,
+ 0,
+ GFP_ATOMIC,
+ 15000);
+ dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+ jiffies_to_msecs(jiffies - start));
+ if (rv)
+ dev_warn(&port->dd->pdev->dev,
+ "STANDBY IMMEDIATE command failed.\n");
+
+ return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ *
+ * @port pointer to the port structure.
+ * @page page number to fetch
+ * @buffer pointer to buffer
+ * @buffer_dma dma address corresponding to @buffer
+ * @sectors page length to fetch, in sectors
+ *
+ * return value
+ * @rv return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+ dma_addr_t buffer_dma, unsigned int sectors)
+{
+ struct host_to_dev_fis fis;
+
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_READ_LOG_EXT;
+ fis.sect_count = sectors & 0xFF;
+ fis.sect_cnt_ex = (sectors >> 8) & 0xFF;
+ fis.lba_low = page;
+ fis.lba_mid = 0;
+ fis.device = ATA_DEVICE_OBS;
+
+ memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+ return mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ buffer_dma,
+ sectors * ATA_SECT_SIZE,
+ 0,
+ GFP_ATOMIC,
+ MTIP_INTERNAL_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ *
+ * @port pointer to the port structure.
+ * @buffer pointer to buffer
+ * @buffer_dma dma address corresponding to @buffer
+ *
+ * return value
+ * @rv return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+ dma_addr_t buffer_dma)
+{
+ struct host_to_dev_fis fis;
+
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_SMART;
+ fis.features = 0xD0;
+ fis.sect_count = 1;
+ fis.lba_mid = 0x4F;
+ fis.lba_hi = 0xC2;
+ fis.device = ATA_DEVICE_OBS;
+
+ return mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ buffer_dma,
+ ATA_SECT_SIZE,
+ 0,
+ GFP_ATOMIC,
+ 15000);
+}
+
+/*
+ * Get the value of a smart attribute
+ *
+ * @port pointer to the port structure
+ * @id attribute number
+ * @attrib pointer to return attrib information corresponding to @id
+ *
+ * return value
+ * -EINVAL NULL buffer passed or unsupported attribute @id.
+ * -EPERM Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+ struct smart_attr *attrib)
+{
+ int rv, i;
+ struct smart_attr *pattr;
+
+ if (!attrib)
+ return -EINVAL;
+
+ if (!port->identify_valid) {
+ dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+ return -EPERM;
+ }
+ if (!(port->identify[82] & 0x1)) {
+ dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+ return -EPERM;
+ }
+ if (!(port->identify[85] & 0x1)) {
+ dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+ return -EPERM;
+ }
+
+ memset(port->smart_buf, 0, ATA_SECT_SIZE);
+ rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+ if (rv) {
+ dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n");
+ return rv;
+ }
+
+ pattr = (struct smart_attr *)(port->smart_buf + 2);
+ for (i = 0; i < 29; i++, pattr++)
+ if (pattr->attr_id == id) {
+ memcpy(attrib, pattr, sizeof(struct smart_attr));
+ break;
+ }
+
+ if (i == 29) {
+ dev_warn(&port->dd->pdev->dev,
+ "Query for invalid SMART attribute ID\n");
+ rv = -EINVAL;
+ }
+
+ return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd Pointer to the device data structure.
+ * @sectors Pointer to the variable that will receive the sector count.
+ *
+ * return value
+ * 1 Capacity was returned successfully.
+ * 0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+ struct mtip_port *port = dd->port;
+ u64 total, raw0, raw1, raw2, raw3;
+ raw0 = port->identify[100];
+ raw1 = port->identify[101];
+ raw2 = port->identify[102];
+ raw3 = port->identify[103];
+ total = raw0 | raw1<<16 | raw2<<32 | raw3<<48;
+ *sectors = total;
+ return (bool) !!port->identify_valid;
+}
+
+/*
+ * Reset the HBA.
+ *
+ * Resets the HBA by setting the HBA Reset bit in the Global
+ * HBA Control register. After setting the HBA Reset bit the
+ * function waits for 1 second before reading the HBA Reset
+ * bit to make sure it has cleared. If HBA Reset is not clear
+ * an error is returned. Cannot be used in non-blockable
+ * context.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 The reset was successful.
+ * -1 The HBA Reset bit did not clear.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+ mtip_deinit_port(dd->port);
+
+ /* Set the reset bit */
+ writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+ /* Flush */
+ readl(dd->mmio + HOST_CTL);
+
+ /* Wait for reset to clear */
+ ssleep(1);
+
+ /* Check the bit has cleared */
+ if (readl(dd->mmio + HOST_CTL) & HOST_RESET) {
+ dev_err(&dd->pdev->dev,
+ "Reset bit did not clear.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Display the identify command data.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+ sector_t sectors;
+ unsigned short revid;
+ char cbuf[42];
+
+ if (!port->identify_valid)
+ return;
+
+ strlcpy(cbuf, (char *)(port->identify+10), 21);
+ dev_info(&port->dd->pdev->dev,
+ "Serial No.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+23), 9);
+ dev_info(&port->dd->pdev->dev,
+ "Firmware Ver.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+27), 41);
+ dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+ if (mtip_hw_get_capacity(port->dd, &sectors))
+ dev_info(&port->dd->pdev->dev,
+ "Capacity: %llu sectors (%llu MB)\n",
+ (u64)sectors,
+ ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+ pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+ switch (revid & 0xFF) {
+ case 0x1:
+ strlcpy(cbuf, "A0", 3);
+ break;
+ case 0x3:
+ strlcpy(cbuf, "A2", 3);
+ break;
+ default:
+ strlcpy(cbuf, "?", 2);
+ break;
+ }
+ dev_info(&port->dd->pdev->dev,
+ "Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ * None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+ struct mtip_cmd *command,
+ int nents)
+{
+ int n;
+ unsigned int dma_len;
+ struct mtip_cmd_sg *command_sg;
+ struct scatterlist *sg = command->sg;
+
+ command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+ for (n = 0; n < nents; n++) {
+ dma_len = sg_dma_len(sg);
+ if (dma_len > 0x400000)
+ dev_err(&dd->pdev->dev,
+ "DMA segment length truncated\n");
+ command_sg->info = __force_bit2int
+ cpu_to_le32((dma_len-1) & 0x3FFFFF);
+ command_sg->dba = __force_bit2int
+ cpu_to_le32(sg_dma_address(sg));
+ command_sg->dba_upper = __force_bit2int
+ cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+ command_sg++;
+ sg++;
+ }
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[1];
+ fis.sect_count = command[2];
+ fis.sector = command[3];
+ fis.cyl_low = command[4];
+ fis.cyl_hi = command[5];
+ fis.device = command[6] & ~0x10; /* Clear the dev bit*/
+
+ dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3],
+ command[4],
+ command[5],
+ command[6]);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ 0,
+ 0,
+ 0,
+ GFP_KERNEL,
+ MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) {
+ return -1;
+ }
+
+ command[0] = reply->command; /* Status*/
+ command[1] = reply->features; /* Error*/
+ command[4] = reply->cyl_low;
+ command[5] = reply->cyl_hi;
+
+ dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[4],
+ command[5]);
+
+ return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ * data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT An error occurred while copying the completion
+ * data to the user space buffer.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+ void __user *user_buffer)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[2];
+ fis.sect_count = command[3];
+ if (fis.command == ATA_CMD_SMART) {
+ fis.sector = command[1];
+ fis.cyl_low = 0x4F;
+ fis.cyl_hi = 0xC2;
+ }
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: User Command: cmd %x, sect %x, "
+ "feat %x, sectcnt %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3]);
+
+ memset(port->sector_buffer, 0x00, ATA_SECT_SIZE);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ port->sector_buffer_dma,
+ (command[3] != 0) ? ATA_SECT_SIZE : 0,
+ 0,
+ GFP_KERNEL,
+ MTIP_IOCTL_COMMAND_TIMEOUT_MS)
+ < 0) {
+ return -1;
+ }
+
+ /* Collect the completion status. */
+ command[0] = reply->command; /* Status*/
+ command[1] = reply->features; /* Error*/
+ command[2] = command[3];
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: Completion Status: stat %x, "
+ "err %x, cmd %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2]);
+
+ if (user_buffer && command[3]) {
+ if (copy_to_user(user_buffer,
+ port->sector_buffer,
+ ATA_SECT_SIZE * command[3])) {
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Indicates whether a command has a single sector payload.
+ *
+ * @command passed to the device to perform the certain event.
+ * @features passed to the device to perform the certain event.
+ *
+ * return value
+ * 1 command is one that always has a single sector payload,
+ * regardless of the value in the Sector Count field.
+ * 0 otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+ unsigned char features)
+{
+ unsigned int rv = 0;
+
+ /* list of commands that have an implicit sector count of 1 */
+ switch (command) {
+ case ATA_CMD_SEC_SET_PASS:
+ case ATA_CMD_SEC_UNLOCK:
+ case ATA_CMD_SEC_ERASE_PREP:
+ case ATA_CMD_SEC_ERASE_UNIT:
+ case ATA_CMD_SEC_FREEZE_LOCK:
+ case ATA_CMD_SEC_DISABLE_PASS:
+ case ATA_CMD_PMP_READ:
+ case ATA_CMD_PMP_WRITE:
+ rv = 1;
+ break;
+ case ATA_CMD_SET_MAX:
+ if (features == ATA_SET_MAX_UNLOCK)
+ rv = 1;
+ break;
+ case ATA_CMD_SMART:
+ if ((features == ATA_SMART_READ_VALUES) ||
+ (features == ATA_SMART_READ_THRESHOLDS))
+ rv = 1;
+ break;
+ case ATA_CMD_CONF_OVERLAY:
+ if ((features == ATA_DCO_IDENTIFY) ||
+ (features == ATA_DCO_SET))
+ rv = 1;
+ break;
+ }
+ return rv;
+}
+
+/*
+ * Executes a taskfile
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+ void __user *buf,
+ ide_task_request_t *req_task,
+ int outtotal)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply;
+ u8 *outbuf = NULL;
+ u8 *inbuf = NULL;
+ dma_addr_t outbuf_dma = 0;
+ dma_addr_t inbuf_dma = 0;
+ dma_addr_t dma_buffer = 0;
+ int err = 0;
+ unsigned int taskin = 0;
+ unsigned int taskout = 0;
+ u8 nsect = 0;
+ unsigned int timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+ unsigned int force_single_sector;
+ unsigned int transfer_size;
+ unsigned long task_file_data;
+ int intotal = outtotal + req_task->out_size;
+
+ taskout = req_task->out_size;
+ taskin = req_task->in_size;
+ /* 130560 = 512 * 0xFF*/
+ if (taskin > 130560 || taskout > 130560) {
+ err = -EINVAL;
+ goto abort;
+ }
+
+ if (taskout) {
+ outbuf = kzalloc(taskout, GFP_KERNEL);
+ if (outbuf == NULL) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ if (copy_from_user(outbuf, buf + outtotal, taskout)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ outbuf_dma = pci_map_single(dd->pdev,
+ outbuf,
+ taskout,
+ DMA_TO_DEVICE);
+ if (outbuf_dma == 0) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ dma_buffer = outbuf_dma;
+ }
+
+ if (taskin) {
+ inbuf = kzalloc(taskin, GFP_KERNEL);
+ if (inbuf == NULL) {
+ err = -ENOMEM;
+ goto abort;
+ }
+
+ if (copy_from_user(inbuf, buf + intotal, taskin)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ inbuf_dma = pci_map_single(dd->pdev,
+ inbuf,
+ taskin, DMA_FROM_DEVICE);
+ if (inbuf_dma == 0) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ dma_buffer = inbuf_dma;
+ }
+
+ /* only supports PIO and non-data commands from this ioctl. */
+ switch (req_task->data_phase) {
+ case TASKFILE_OUT:
+ nsect = taskout / ATA_SECT_SIZE;
+ reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+ break;
+ case TASKFILE_IN:
+ reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+ break;
+ case TASKFILE_NO_DATA:
+ reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+ break;
+ default:
+ err = -EINVAL;
+ goto abort;
+ }
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = req_task->io_ports[7];
+ fis.features = req_task->io_ports[1];
+ fis.sect_count = req_task->io_ports[2];
+ fis.lba_low = req_task->io_ports[3];
+ fis.lba_mid = req_task->io_ports[4];
+ fis.lba_hi = req_task->io_ports[5];
+ /* Clear the dev bit*/
+ fis.device = req_task->io_ports[6] & ~0x10;
+
+ if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+ req_task->in_flags.all =
+ IDE_TASKFILE_STD_IN_FLAGS |
+ (IDE_HOB_STD_IN_FLAGS << 8);
+ fis.lba_low_ex = req_task->hob_ports[3];
+ fis.lba_mid_ex = req_task->hob_ports[4];
+ fis.lba_hi_ex = req_task->hob_ports[5];
+ fis.features_ex = req_task->hob_ports[1];
+ fis.sect_cnt_ex = req_task->hob_ports[2];
+
+ } else {
+ req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+ }
+
+ force_single_sector = implicit_sector(fis.command, fis.features);
+
+ if ((taskin || taskout) && (!fis.sect_count)) {
+ if (nsect)
+ fis.sect_count = nsect;
+ else {
+ if (!force_single_sector) {
+ dev_warn(&dd->pdev->dev,
+ "data movement but "
+ "sect_count is 0\n");
+ err = -EINVAL;
+ goto abort;
+ }
+ }
+ }
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: cmd %x, feat %x, nsect %x,"
+ " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+ " head/dev %x\n",
+ __func__,
+ fis.command,
+ fis.features,
+ fis.sect_count,
+ fis.lba_low,
+ fis.lba_mid,
+ fis.lba_hi,
+ fis.device);
+
+ switch (fis.command) {
+ case ATA_CMD_DOWNLOAD_MICRO:
+ /* Change timeout for Download Microcode to 2 minutes */
+ timeout = 120000;
+ break;
+ case ATA_CMD_SEC_ERASE_UNIT:
+ /* Change timeout for Security Erase Unit to 4 minutes.*/
+ timeout = 240000;
+ break;
+ case ATA_CMD_STANDBYNOW1:
+ /* Change timeout for standby immediate to 10 seconds.*/
+ timeout = 10000;
+ break;
+ case 0xF7:
+ case 0xFA:
+ /* Change timeout for vendor unique command to 10 secs */
+ timeout = 10000;
+ break;
+ case ATA_CMD_SMART:
+ /* Change timeout for vendor unique command to 15 secs */
+ timeout = 15000;
+ break;
+ default:
+ timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+ break;
+ }
+
+ /* Determine the correct transfer size.*/
+ if (force_single_sector)
+ transfer_size = ATA_SECT_SIZE;
+ else
+ transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+ /* Execute the command.*/
+ if (mtip_exec_internal_command(dd->port,
+ &fis,
+ 5,
+ dma_buffer,
+ transfer_size,
+ 0,
+ GFP_KERNEL,
+ timeout) < 0) {
+ err = -EIO;
+ goto abort;
+ }
+
+ task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+ if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+ reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+ req_task->io_ports[7] = reply->control;
+ } else {
+ reply = dd->port->rxfis + RX_FIS_D2H_REG;
+ req_task->io_ports[7] = reply->command;
+ }
+
+ /* reclaim the DMA buffers.*/
+ if (inbuf_dma)
+ pci_unmap_single(dd->pdev, inbuf_dma,
+ taskin, DMA_FROM_DEVICE);
+ if (outbuf_dma)
+ pci_unmap_single(dd->pdev, outbuf_dma,
+ taskout, DMA_TO_DEVICE);
+ inbuf_dma = 0;
+ outbuf_dma = 0;
+
+ /* return the ATA registers to the caller.*/
+ req_task->io_ports[1] = reply->features;
+ req_task->io_ports[2] = reply->sect_count;
+ req_task->io_ports[3] = reply->lba_low;
+ req_task->io_ports[4] = reply->lba_mid;
+ req_task->io_ports[5] = reply->lba_hi;
+ req_task->io_ports[6] = reply->device;
+
+ if (req_task->out_flags.all & 1) {
+
+ req_task->hob_ports[3] = reply->lba_low_ex;
+ req_task->hob_ports[4] = reply->lba_mid_ex;
+ req_task->hob_ports[5] = reply->lba_hi_ex;
+ req_task->hob_ports[1] = reply->features_ex;
+ req_task->hob_ports[2] = reply->sect_cnt_ex;
+ }
+ dbg_printk(MTIP_DRV_NAME
+ " %s: Completion: stat %x,"
+ "err %x, sect_cnt %x, lbalo %x,"
+ "lbamid %x, lbahi %x, dev %x\n",
+ __func__,
+ req_task->io_ports[7],
+ req_task->io_ports[1],
+ req_task->io_ports[2],
+ req_task->io_ports[3],
+ req_task->io_ports[4],
+ req_task->io_ports[5],
+ req_task->io_ports[6]);
+
+ if (taskout) {
+ if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ }
+ if (taskin) {
+ if (copy_to_user(buf + intotal, inbuf, taskin)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ }
+abort:
+ if (inbuf_dma)
+ pci_unmap_single(dd->pdev, inbuf_dma,
+ taskin, DMA_FROM_DEVICE);
+ if (outbuf_dma)
+ pci_unmap_single(dd->pdev, outbuf_dma,
+ taskout, DMA_TO_DEVICE);
+ kfree(outbuf);
+ kfree(inbuf);
+
+ return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -ENOTTY.
+ *
+ * @dd Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ * 0 The IOCTL completed successfully.
+ * -ENOTTY The specified command is not supported.
+ * -EFAULT An error occurred copying data to a user space buffer.
+ * -EIO An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case HDIO_GET_IDENTITY:
+ if (mtip_get_identify(dd->port, (void __user *) arg) < 0) {
+ dev_warn(&dd->pdev->dev,
+ "Unable to read identity\n");
+ return -EIO;
+ }
+
+ break;
+ case HDIO_DRIVE_CMD:
+ {
+ u8 drive_command[4];
+
+ /* Copy the user command info to our buffer. */
+ if (copy_from_user(drive_command,
+ (void __user *) arg,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ /* Execute the drive command. */
+ if (exec_drive_command(dd->port,
+ drive_command,
+ (void __user *) (arg+4)))
+ return -EIO;
+
+ /* Copy the status back to the users buffer. */
+ if (copy_to_user((void __user *) arg,
+ drive_command,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ break;
+ }
+ case HDIO_DRIVE_TASK:
+ {
+ u8 drive_command[7];
+
+ /* Copy the user command info to our buffer. */
+ if (copy_from_user(drive_command,
+ (void __user *) arg,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ /* Execute the drive command. */
+ if (exec_drive_task(dd->port, drive_command))
+ return -EIO;
+
+ /* Copy the status back to the users buffer. */
+ if (copy_to_user((void __user *) arg,
+ drive_command,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ break;
+ }
+ case HDIO_DRIVE_TASKFILE: {
+ ide_task_request_t req_task;
+ int ret, outtotal;
+
+ if (copy_from_user(&req_task, (void __user *) arg,
+ sizeof(req_task)))
+ return -EFAULT;
+
+ outtotal = sizeof(req_task);
+
+ ret = exec_drive_taskfile(dd, (void __user *) arg,
+ &req_task, outtotal);
+
+ if (copy_to_user((void __user *) arg, &req_task,
+ sizeof(req_task)))
+ return -EFAULT;
+
+ return ret;
+ }
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * This function is called by the block layer to issue an io
+ * to the device. Upon completion, the callback function will
+ * be called with the data parameter passed as the callback data.
+ *
+ * @dd Pointer to the driver data structure.
+ * @start First sector to read.
+ * @nsect Number of sectors to read.
+ * @nents Number of entries in scatter list for the read command.
+ * @tag The tag of this read command.
+ * @callback Pointer to the function that should be called
+ * when the read completes.
+ * @data Callback data passed to the callback function
+ * when the read completes.
+ * @dir Direction (read or write)
+ *
+ * return value
+ * None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
+ int nsect, int nents, int tag, void *callback,
+ void *data, int dir)
+{
+ struct host_to_dev_fis *fis;
+ struct mtip_port *port = dd->port;
+ struct mtip_cmd *command = &port->commands[tag];
+ int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+ /* Map the scatter list for DMA access */
+ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+
+ command->scatter_ents = nents;
+
+ /*
+ * The number of retries for this command before it is
+ * reported as a failure to the upper layers.
+ */
+ command->retries = MTIP_MAX_RETRIES;
+
+ /* Fill out fis */
+ fis = command->command;
+ fis->type = 0x27;
+ fis->opts = 1 << 7;
+ fis->command =
+ (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE);
+ *((unsigned int *) &fis->lba_low) = (start & 0xFFFFFF);
+ *((unsigned int *) &fis->lba_low_ex) = ((start >> 24) & 0xFFFFFF);
+ fis->device = 1 << 6;
+ fis->features = nsect & 0xFF;
+ fis->features_ex = (nsect >> 8) & 0xFF;
+ fis->sect_count = ((tag << 3) | (tag >> 5));
+ fis->sect_cnt_ex = 0;
+ fis->control = 0;
+ fis->res2 = 0;
+ fis->res3 = 0;
+ fill_command_sg(dd, command, nents);
+
+ /* Populate the command header */
+ command->command_header->opts =
+ __force_bit2int cpu_to_le32(
+ (nents << 16) | 5 | AHCI_CMD_PREFETCH);
+ command->command_header->byte_count = 0;
+
+ /*
+ * Set the completion function and data for the command
+ * within this layer.
+ */
+ command->comp_data = dd;
+ command->comp_func = mtip_async_complete;
+ command->direction = dma_dir;
+
+ /*
+ * Set the completion function and data for the command passed
+ * from the upper layer.
+ */
+ command->async_data = data;
+ command->async_callback = callback;
+
+ /*
+ * To prevent this command from being issued
+ * if an internal command is in progress or error handling is active.
+ */
+ if (port->flags & MTIP_PF_PAUSE_IO) {
+ set_bit(tag, port->cmds_to_issue);
+ set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+ return;
+ }
+
+ /* Issue the command to the hardware */
+ mtip_issue_ncq_command(port, tag);
+
+ return;
+}
+
+/*
+ * Release a command slot.
+ *
+ * @dd Pointer to the driver data structure.
+ * @tag Slot tag
+ *
+ * return value
+ * None
+ */
+static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
+{
+ release_slot(dd->port, tag);
+}
+
+/*
+ * Obtain a command slot and return its associated scatter list.
+ *
+ * @dd Pointer to the driver data structure.
+ * @tag Pointer to an int that will receive the allocated command
+ * slot tag.
+ *
+ * return value
+ * Pointer to the scatter list for the allocated command slot
+ * or NULL if no command slots are available.
+ */
+static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
+ int *tag)
+{
+ /*
+ * It is possible that, even with this semaphore, a thread
+ * may think that no command slots are available. Therefore, we
+ * need to make an attempt to get_slot().
+ */
+ down(&dd->port->cmd_slot);
+ *tag = get_slot(dd->port);
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+ up(&dd->port->cmd_slot);
+ return NULL;
+ }
+ if (unlikely(*tag < 0))
+ return NULL;
+
+ return dd->port->commands[*tag].sg;
+}
+
+/*
+ * Sysfs register/status dump.
+ *
+ * @dev Pointer to the device structure, passed by the kernrel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf Pointer to the char buffer that will receive the stats info.
+ *
+ * return value
+ * The size, in bytes, of the data copied into buf.
+ */
+static ssize_t mtip_hw_show_registers(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ u32 group_allocated;
+ struct driver_data *dd = dev_to_disk(dev)->private_data;
+ int size = 0;
+ int n;
+
+ size += sprintf(&buf[size], "S ACTive:\n");
+
+ for (n = 0; n < dd->slot_groups; n++)
+ size += sprintf(&buf[size], "0x%08x\n",
+ readl(dd->port->s_active[n]));
+
+ size += sprintf(&buf[size], "Command Issue:\n");
+
+ for (n = 0; n < dd->slot_groups; n++)
+ size += sprintf(&buf[size], "0x%08x\n",
+ readl(dd->port->cmd_issue[n]));
+
+ size += sprintf(&buf[size], "Allocated:\n");
+
+ for (n = 0; n < dd->slot_groups; n++) {
+ if (sizeof(long) > sizeof(u32))
+ group_allocated =
+ dd->port->allocated[n/2] >> (32*(n&1));
+ else
+ group_allocated = dd->port->allocated[n];
+ size += sprintf(&buf[size], "0x%08x\n",
+ group_allocated);
+ }
+
+ size += sprintf(&buf[size], "Completed:\n");
+
+ for (n = 0; n < dd->slot_groups; n++)
+ size += sprintf(&buf[size], "0x%08x\n",
+ readl(dd->port->completed[n]));
+
+ size += sprintf(&buf[size], "PORT IRQ STAT : 0x%08x\n",
+ readl(dd->port->mmio + PORT_IRQ_STAT));
+ size += sprintf(&buf[size], "HOST IRQ STAT : 0x%08x\n",
+ readl(dd->mmio + HOST_IRQ_STAT));
+
+ return size;
+}
+
+static ssize_t mtip_hw_show_status(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct driver_data *dd = dev_to_disk(dev)->private_data;
+ int size = 0;
+
+ if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+ size += sprintf(buf, "%s", "thermal_shutdown\n");
+ else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+ size += sprintf(buf, "%s", "write_protect\n");
+ else
+ size += sprintf(buf, "%s", "online\n");
+
+ return size;
+}
+
+static DEVICE_ATTR(registers, S_IRUGO, mtip_hw_show_registers, NULL);
+static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+
+/*
+ * Create the sysfs related attributes.
+ *
+ * @dd Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
+{
+ if (!kobj || !dd)
+ return -EINVAL;
+
+ if (sysfs_create_file(kobj, &dev_attr_registers.attr))
+ dev_warn(&dd->pdev->dev,
+ "Error creating 'registers' sysfs entry\n");
+ if (sysfs_create_file(kobj, &dev_attr_status.attr))
+ dev_warn(&dd->pdev->dev,
+ "Error creating 'status' sysfs entry\n");
+ return 0;
+}
+
+/*
+ * Remove the sysfs related attributes.
+ *
+ * @dd Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
+{
+ if (!kobj || !dd)
+ return -EINVAL;
+
+ sysfs_remove_file(kobj, &dev_attr_registers.attr);
+ sysfs_remove_file(kobj, &dev_attr_status.attr);
+
+ return 0;
+}
+
+/*
+ * Perform any init/resume time hardware setup
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+ u32 hwdata;
+ hwdata = readl(dd->mmio + HOST_HSORG);
+
+ /* interrupt bug workaround: use only 1 IS bit.*/
+ writel(hwdata |
+ HSORG_DISABLE_SLOTGRP_INTR |
+ HSORG_DISABLE_SLOTGRP_PXIS,
+ dd->mmio + HOST_HSORG);
+}
+
+/*
+ * Detect the details of the product, and store anything needed
+ * into the driver data structure. This includes product type and
+ * version and number of slot groups.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+ u32 hwdata;
+ unsigned int rev, slotgroups;
+
+ /*
+ * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+ * info register:
+ * [15:8] hardware/software interface rev#
+ * [ 3] asic-style interface
+ * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+ */
+ hwdata = readl(dd->mmio + HOST_HSORG);
+
+ dd->product_type = MTIP_PRODUCT_UNKNOWN;
+ dd->slot_groups = 1;
+
+ if (hwdata & 0x8) {
+ dd->product_type = MTIP_PRODUCT_ASICFPGA;
+ rev = (hwdata & HSORG_HWREV) >> 8;
+ slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1;
+ dev_info(&dd->pdev->dev,
+ "ASIC-FPGA design, HS rev 0x%x, "
+ "%i slot groups [%i slots]\n",
+ rev,
+ slotgroups,
+ slotgroups * 32);
+
+ if (slotgroups > MTIP_MAX_SLOT_GROUPS) {
+ dev_warn(&dd->pdev->dev,
+ "Warning: driver only supports "
+ "%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+ slotgroups = MTIP_MAX_SLOT_GROUPS;
+ }
+ dd->slot_groups = slotgroups;
+ return;
+ }
+
+ dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * 0 FTL rebuild completed successfully
+ * -EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+ unsigned long timeout, cnt = 0, start;
+
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild in progress. Polling for completion.\n");
+
+ start = jiffies;
+ timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+ do {
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)))
+ return -EFAULT;
+ if (mtip_check_surprise_removal(dd->pdev))
+ return -EFAULT;
+
+ if (mtip_get_identify(dd->port, NULL) < 0)
+ return -EFAULT;
+
+ if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+ MTIP_FTL_REBUILD_MAGIC) {
+ ssleep(1);
+ /* Print message every 3 minutes */
+ if (cnt++ >= 180) {
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild in progress (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ cnt = 0;
+ }
+ } else {
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild complete (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ mtip_block_initialize(dd);
+ return 0;
+ }
+ ssleep(10);
+ } while (time_before(jiffies, timeout));
+
+ /* Check for timeout */
+ dev_err(&dd->pdev->dev,
+ "Timed out waiting for FTL rebuild to complete (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ return -EFAULT;
+}
+
+/*
+ * service thread to issue queued commands
+ *
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+
+static int mtip_service_thread(void *data)
+{
+ struct driver_data *dd = (struct driver_data *)data;
+ unsigned long slot, slot_start, slot_wrap;
+ unsigned int num_cmd_slots = dd->slot_groups * 32;
+ struct mtip_port *port = dd->port;
+
+ while (1) {
+ /*
+ * the condition is to check neither an internal command is
+ * is in progress nor error handling is active
+ */
+ wait_event_interruptible(port->svc_wait, (port->flags) &&
+ !(port->flags & MTIP_PF_PAUSE_IO));
+
+ if (kthread_should_stop())
+ break;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)))
+ break;
+
+ set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+ if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+ slot = 1;
+ /* used to restrict the loop to one iteration */
+ slot_start = num_cmd_slots;
+ slot_wrap = 0;
+ while (1) {
+ slot = find_next_bit(port->cmds_to_issue,
+ num_cmd_slots, slot);
+ if (slot_wrap == 1) {
+ if ((slot_start >= slot) ||
+ (slot >= num_cmd_slots))
+ break;
+ }
+ if (unlikely(slot_start == num_cmd_slots))
+ slot_start = slot;
+
+ if (unlikely(slot == num_cmd_slots)) {
+ slot = 1;
+ slot_wrap = 1;
+ continue;
+ }
+
+ /* Issue the command to the hardware */
+ mtip_issue_ncq_command(port, slot);
+
+ clear_bit(slot, port->cmds_to_issue);
+ }
+
+ clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+ } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+ if (!mtip_ftl_rebuild_poll(dd))
+ set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
+ &dd->dd_flag);
+ clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
+ }
+ clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+ if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Called once for each card.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+ int i;
+ int rv;
+ unsigned int num_command_slots;
+ unsigned long timeout, timetaken;
+ unsigned char *buf;
+ struct smart_attr attr242;
+
+ dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+ mtip_detect_product(dd);
+ if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+ rv = -EIO;
+ goto out1;
+ }
+ num_command_slots = dd->slot_groups * 32;
+
+ hba_setup(dd);
+
+ tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd);
+
+ dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL);
+ if (!dd->port) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: port structure\n");
+ return -ENOMEM;
+ }
+
+ /* Counting semaphore to track command slot usage */
+ sema_init(&dd->port->cmd_slot, num_command_slots - 1);
+
+ /* Spinlock to prevent concurrent issue */
+ spin_lock_init(&dd->port->cmd_issue_lock);
+
+ /* Set the port mmio base address. */
+ dd->port->mmio = dd->mmio + PORT_OFFSET;
+ dd->port->dd = dd;
+
+ /* Allocate memory for the command list. */
+ dd->port->command_list =
+ dmam_alloc_coherent(&dd->pdev->dev,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+ &dd->port->command_list_dma,
+ GFP_KERNEL);
+ if (!dd->port->command_list) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: command list\n");
+ rv = -ENOMEM;
+ goto out1;
+ }
+
+ /* Clear the memory we have allocated. */
+ memset(dd->port->command_list,
+ 0,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
+
+ /* Setup the addresse of the RX FIS. */
+ dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ;
+ dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ;
+
+ /* Setup the address of the command tables. */
+ dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ;
+ dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ;
+
+ /* Setup the address of the identify data. */
+ dd->port->identify = dd->port->command_table +
+ HW_CMD_TBL_AR_SZ;
+ dd->port->identify_dma = dd->port->command_tbl_dma +
+ HW_CMD_TBL_AR_SZ;
+
+ /* Setup the address of the sector buffer - for some non-ncq cmds */
+ dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE;
+ dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
+
+ /* Setup the address of the log buf - for read log command */
+ dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE;
+ dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
+
+ /* Setup the address of the smart buf - for smart read data command */
+ dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE;
+ dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
+
+
+ /* Point the command headers at the command tables. */
+ for (i = 0; i < num_command_slots; i++) {
+ dd->port->commands[i].command_header =
+ dd->port->command_list +
+ (sizeof(struct mtip_cmd_hdr) * i);
+ dd->port->commands[i].command_header_dma =
+ dd->port->command_list_dma +
+ (sizeof(struct mtip_cmd_hdr) * i);
+
+ dd->port->commands[i].command =
+ dd->port->command_table + (HW_CMD_TBL_SZ * i);
+ dd->port->commands[i].command_dma =
+ dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i);
+
+ if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64)
+ dd->port->commands[i].command_header->ctbau =
+ __force_bit2int cpu_to_le32(
+ (dd->port->commands[i].command_dma >> 16) >> 16);
+ dd->port->commands[i].command_header->ctba =
+ __force_bit2int cpu_to_le32(
+ dd->port->commands[i].command_dma & 0xFFFFFFFF);
+
+ /*
+ * If this is not done, a bug is reported by the stock
+ * FC11 i386. Due to the fact that it has lots of kernel
+ * debugging enabled.
+ */
+ sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG);
+
+ /* Mark all commands as currently inactive.*/
+ atomic_set(&dd->port->commands[i].active, 0);
+ }
+
+ /* Setup the pointers to the extended s_active and CI registers. */
+ for (i = 0; i < dd->slot_groups; i++) {
+ dd->port->s_active[i] =
+ dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+ dd->port->cmd_issue[i] =
+ dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+ dd->port->completed[i] =
+ dd->port->mmio + i*0x80 + PORT_SDBV;
+ }
+
+ timetaken = jiffies;
+ timeout = jiffies + msecs_to_jiffies(30000);
+ while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+ time_before(jiffies, timeout)) {
+ mdelay(100);
+ }
+ if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+ timetaken = jiffies - timetaken;
+ dev_warn(&dd->pdev->dev,
+ "Surprise removal detected at %u ms\n",
+ jiffies_to_msecs(timetaken));
+ rv = -ENODEV;
+ goto out2 ;
+ }
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+ timetaken = jiffies - timetaken;
+ dev_warn(&dd->pdev->dev,
+ "Removal detected at %u ms\n",
+ jiffies_to_msecs(timetaken));
+ rv = -EFAULT;
+ goto out2;
+ }
+
+ /* Conditionally reset the HBA. */
+ if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+ if (mtip_hba_reset(dd) < 0) {
+ dev_err(&dd->pdev->dev,
+ "Card did not reset within timeout\n");
+ rv = -EIO;
+ goto out2;
+ }
+ } else {
+ /* Clear any pending interrupts on the HBA */
+ writel(readl(dd->mmio + HOST_IRQ_STAT),
+ dd->mmio + HOST_IRQ_STAT);
+ }
+
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Setup the ISR and enable interrupts. */
+ rv = devm_request_irq(&dd->pdev->dev,
+ dd->pdev->irq,
+ mtip_irq_handler,
+ IRQF_SHARED,
+ dev_driver_string(&dd->pdev->dev),
+ dd);
+
+ if (rv) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate IRQ %d\n", dd->pdev->irq);
+ goto out2;
+ }
+
+ /* Enable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ init_timer(&dd->port->cmd_timer);
+ init_waitqueue_head(&dd->port->svc_wait);
+
+ dd->port->cmd_timer.data = (unsigned long int) dd->port;
+ dd->port->cmd_timer.function = mtip_timeout_function;
+ mod_timer(&dd->port->cmd_timer,
+ jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
+
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+ rv = -EFAULT;
+ goto out3;
+ }
+
+ if (mtip_get_identify(dd->port, NULL) < 0) {
+ rv = -EFAULT;
+ goto out3;
+ }
+
+ if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+ MTIP_FTL_REBUILD_MAGIC) {
+ set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+ return MTIP_FTL_REBUILD_MAGIC;
+ }
+ mtip_dump_identify(dd->port);
+
+ /* check write protect, over temp and rebuild statuses */
+ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+ dd->port->log_buf,
+ dd->port->log_buf_dma, 1);
+ if (rv) {
+ dev_warn(&dd->pdev->dev,
+ "Error in READ LOG EXT (10h) command\n");
+ /* non-critical error, don't fail the load */
+ } else {
+ buf = (unsigned char *)dd->port->log_buf;
+ if (buf[259] & 0x1) {
+ dev_info(&dd->pdev->dev,
+ "Write protect bit is set.\n");
+ set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+ }
+ if (buf[288] == 0xF7) {
+ dev_info(&dd->pdev->dev,
+ "Exceeded Tmax, drive in thermal shutdown.\n");
+ set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+ }
+ if (buf[288] == 0xBF) {
+ dev_info(&dd->pdev->dev,
+ "Drive indicates rebuild has failed.\n");
+ /* TODO */
+ }
+ }
+
+ /* get write protect progess */
+ memset(&attr242, 0, sizeof(struct smart_attr));
+ if (mtip_get_smart_attr(dd->port, 242, &attr242))
+ dev_warn(&dd->pdev->dev,
+ "Unable to check write protect progress\n");
+ else
+ dev_info(&dd->pdev->dev,
+ "Write protect progress: %d%% (%d blocks)\n",
+ attr242.cur, attr242.data);
+ return rv;
+
+out3:
+ del_timer_sync(&dd->port->cmd_timer);
+
+ /* Disable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ /*Release the IRQ. */
+ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+out2:
+ mtip_deinit_port(dd->port);
+
+ /* Free the command/command header memory. */
+ dmam_free_coherent(&dd->pdev->dev,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+ dd->port->command_list,
+ dd->port->command_list_dma);
+out1:
+ /* Free the memory allocated for the for structure. */
+ kfree(dd->port);
+
+ return rv;
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
+
+ if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
+ if (mtip_standby_immediate(dd->port))
+ dev_warn(&dd->pdev->dev,
+ "STANDBY IMMEDIATE failed\n");
+
+ /* de-initialize the port. */
+ mtip_deinit_port(dd->port);
+
+ /* Disable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ }
+
+ del_timer_sync(&dd->port->cmd_timer);
+
+ /* Release the IRQ. */
+ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+ /* Stop the bottom half tasklet. */
+ tasklet_kill(&dd->tasklet);
+
+ /* Free the command/command header memory. */
+ dmam_free_coherent(&dd->pdev->dev,
+ HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
+ dd->port->command_list,
+ dd->port->command_list_dma);
+ /* Free the memory allocated for the for structure. */
+ kfree(dd->port);
+
+ return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ mtip_standby_immediate(dd->port);
+
+ return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 Suspend was successful
+ * -EFAULT Suspend was not successful
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive
+ * so that it saves its state.
+ */
+ if (mtip_standby_immediate(dd->port) != 0) {
+ dev_err(&dd->pdev->dev,
+ "Failed standby-immediate command\n");
+ return -EFAULT;
+ }
+
+ /* Disable interrupts on the HBA.*/
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ mtip_deinit_port(dd->port);
+
+ return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 Resume was successful
+ * -EFAULT Resume was not successful
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+ /* Perform any needed hardware setup steps */
+ hba_setup(dd);
+
+ /* Reset the HBA */
+ if (mtip_hba_reset(dd) != 0) {
+ dev_err(&dd->pdev->dev,
+ "Unable to reset the HBA\n");
+ return -EFAULT;
+ }
+
+ /*
+ * Enable the port, DMA engine, and FIS reception specific
+ * h/w in controller.
+ */
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Enable interrupts on the HBA.*/
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ return 0;
+}
+
+/*
+ * Helper function for reusing disk name
+ * upon hot insertion.
+ */
+static int rssd_disk_name_format(char *prefix,
+ int index,
+ char *buf,
+ int buflen)
+{
+ const int base = 'z' - 'a' + 1;
+ char *begin = buf + strlen(prefix);
+ char *end = buf + buflen;
+ char *p;
+ int unit;
+
+ p = end - 1;
+ *p = '\0';
+ unit = base;
+ do {
+ if (p == begin)
+ return -EINVAL;
+ *--p = 'a' + (index % unit);
+ index = (index / unit) - 1;
+ } while (index >= 0);
+
+ memmove(begin, p, end - p);
+ memcpy(buf, prefix, strlen(prefix));
+
+ return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ * 0 IOCTL completed successfully.
+ * -ENOTTY IOCTL not supported or invalid driver data
+ * structure pointer.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+ fmode_t mode,
+ unsigned cmd,
+ unsigned long arg)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ return -ENOTTY;
+ default:
+ return mtip_hw_ioctl(dd, cmd, arg);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ * 0 IOCTL completed successfully.
+ * -ENOTTY IOCTL not supported or invalid driver data
+ * structure pointer.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+ fmode_t mode,
+ unsigned cmd,
+ unsigned long arg)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ return -ENOTTY;
+ case HDIO_DRIVE_TASKFILE: {
+ struct mtip_compat_ide_task_request_s __user *compat_req_task;
+ ide_task_request_t req_task;
+ int compat_tasksize, outtotal, ret;
+
+ compat_tasksize =
+ sizeof(struct mtip_compat_ide_task_request_s);
+
+ compat_req_task =
+ (struct mtip_compat_ide_task_request_s __user *) arg;
+
+ if (copy_from_user(&req_task, (void __user *) arg,
+ compat_tasksize - (2 * sizeof(compat_long_t))))
+ return -EFAULT;
+
+ if (get_user(req_task.out_size, &compat_req_task->out_size))
+ return -EFAULT;
+
+ if (get_user(req_task.in_size, &compat_req_task->in_size))
+ return -EFAULT;
+
+ outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+ ret = exec_drive_taskfile(dd, (void __user *) arg,
+ &req_task, outtotal);
+
+ if (copy_to_user((void __user *) arg, &req_task,
+ compat_tasksize -
+ (2 * sizeof(compat_long_t))))
+ return -EFAULT;
+
+ if (put_user(req_task.out_size, &compat_req_task->out_size))
+ return -EFAULT;
+
+ if (put_user(req_task.in_size, &compat_req_task->in_size))
+ return -EFAULT;
+
+ return ret;
+ }
+ default:
+ return mtip_hw_ioctl(dd, cmd, arg);
+ }
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example still used CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affects performance.
+ *
+ * @dev Pointer to the block_device strucutre.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -ENOTTY An error occurred while reading the drive capacity.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+ struct hd_geometry *geo)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+ sector_t capacity;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (!(mtip_hw_get_capacity(dd, &capacity))) {
+ dev_warn(&dd->pdev->dev,
+ "Could not get drive capacity.\n");
+ return -ENOTTY;
+ }
+
+ geo->heads = 224;
+ geo->sectors = 56;
+ sector_div(capacity, (geo->heads * geo->sectors));
+ geo->cylinders = capacity;
+ return 0;
+}
+
+/*
+ * Block device operation function.
+ *
+ * This structure contains pointers to the functions required by the block
+ * layer.
+ */
+static const struct block_device_operations mtip_block_ops = {
+ .ioctl = mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = mtip_block_compat_ioctl,
+#endif
+ .getgeo = mtip_block_getgeo,
+ .owner = THIS_MODULE
+};
+
+/*
+ * Block layer make request function.
+ *
+ * This function is called by the kernel to process a BIO for
+ * the P320 device.
+ *
+ * @queue Pointer to the request queue. Unused other than to obtain
+ * the driver data structure.
+ * @bio Pointer to the BIO.
+ *
+ */
+static void mtip_make_request(struct request_queue *queue, struct bio *bio)
+{
+ struct driver_data *dd = queue->queuedata;
+ struct scatterlist *sg;
+ struct bio_vec *bvec;
+ int nents = 0;
+ int tag = 0;
+
+ if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag))) {
+ bio_endio(bio, -ENXIO);
+ return;
+ }
+ if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
+ bio_endio(bio, -ENODATA);
+ return;
+ }
+ if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
+ &dd->dd_flag) &&
+ bio_data_dir(bio))) {
+ bio_endio(bio, -ENODATA);
+ return;
+ }
+ }
+
+ if (unlikely(!bio_has_data(bio))) {
+ blk_queue_flush(queue, 0);
+ bio_endio(bio, 0);
+ return;
+ }
+
+ sg = mtip_hw_get_scatterlist(dd, &tag);
+ if (likely(sg != NULL)) {
+ blk_queue_bounce(queue, &bio);
+
+ if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) {
+ dev_warn(&dd->pdev->dev,
+ "Maximum number of SGL entries exceeded\n");
+ bio_io_error(bio);
+ mtip_hw_release_scatterlist(dd, tag);
+ return;
+ }
+
+ /* Create the scatter list for this bio. */
+ bio_for_each_segment(bvec, bio, nents) {
+ sg_set_page(&sg[nents],
+ bvec->bv_page,
+ bvec->bv_len,
+ bvec->bv_offset);
+ }
+
+ /* Issue the read/write. */
+ mtip_hw_submit_io(dd,
+ bio->bi_sector,
+ bio_sectors(bio),
+ nents,
+ tag,
+ bio_endio,
+ bio,
+ bio_data_dir(bio));
+ } else
+ bio_io_error(bio);
+}
+
+/*
+ * Block layer initialization function.
+ *
+ * This function is called once by the PCI layer for each P320
+ * device that is connected to the system.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 on success else an error code.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+ int rv = 0, wait_for_rebuild = 0;
+ sector_t capacity;
+ unsigned int index = 0;
+ struct kobject *kobj;
+ unsigned char thd_name[16];
+
+ if (dd->disk)
+ goto skip_create_disk; /* hw init done, before rebuild */
+
+ /* Initialize the protocol layer. */
+ wait_for_rebuild = mtip_hw_init(dd);
+ if (wait_for_rebuild < 0) {
+ dev_err(&dd->pdev->dev,
+ "Protocol layer initialization failed\n");
+ rv = -EINVAL;
+ goto protocol_init_error;
+ }
+
+ dd->disk = alloc_disk(MTIP_MAX_MINORS);
+ if (dd->disk == NULL) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate gendisk structure\n");
+ rv = -EINVAL;
+ goto alloc_disk_error;
+ }
+
+ /* Generate the disk name, implemented same as in sd.c */
+ do {
+ if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL))
+ goto ida_get_error;
+
+ spin_lock(&rssd_index_lock);
+ rv = ida_get_new(&rssd_index_ida, &index);
+ spin_unlock(&rssd_index_lock);
+ } while (rv == -EAGAIN);
+
+ if (rv)
+ goto ida_get_error;
+
+ rv = rssd_disk_name_format("rssd",
+ index,
+ dd->disk->disk_name,
+ DISK_NAME_LEN);
+ if (rv)
+ goto disk_index_error;
+
+ dd->disk->driverfs_dev = &dd->pdev->dev;
+ dd->disk->major = dd->major;
+ dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS;
+ dd->disk->fops = &mtip_block_ops;
+ dd->disk->private_data = dd;
+ dd->index = index;
+
+ /*
+ * if rebuild pending, start the service thread, and delay the block
+ * queue creation and add_disk()
+ */
+ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+ goto start_service_thread;
+
+skip_create_disk:
+ /* Allocate the request queue. */
+ dd->queue = blk_alloc_queue(GFP_KERNEL);
+ if (dd->queue == NULL) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate request queue\n");
+ rv = -ENOMEM;
+ goto block_queue_alloc_init_error;
+ }
+
+ /* Attach our request function to the request queue. */
+ blk_queue_make_request(dd->queue, mtip_make_request);
+
+ dd->disk->queue = dd->queue;
+ dd->queue->queuedata = dd;
+
+ /* Set device limits. */
+ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+ blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+ blk_queue_physical_block_size(dd->queue, 4096);
+ blk_queue_io_min(dd->queue, 4096);
+ /*
+ * write back cache is not supported in the device. FUA depends on
+ * write back cache support, hence setting flush support to zero.
+ */
+ blk_queue_flush(dd->queue, 0);
+
+ /* Set the capacity of the device in 512 byte sectors. */
+ if (!(mtip_hw_get_capacity(dd, &capacity))) {
+ dev_warn(&dd->pdev->dev,
+ "Could not read drive capacity\n");
+ rv = -EIO;
+ goto read_capacity_error;
+ }
+ set_capacity(dd->disk, capacity);
+
+ /* Enable the block device and add it to /dev */
+ add_disk(dd->disk);
+
+ /*
+ * Now that the disk is active, initialize any sysfs attributes
+ * managed by the protocol layer.
+ */
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_init(dd, kobj);
+ kobject_put(kobj);
+ }
+
+ if (dd->mtip_svc_handler) {
+ set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+ return rv; /* service thread created for handling rebuild */
+ }
+
+start_service_thread:
+ sprintf(thd_name, "mtip_svc_thd_%02d", index);
+
+ dd->mtip_svc_handler = kthread_run(mtip_service_thread,
+ dd, thd_name);
+
+ if (IS_ERR(dd->mtip_svc_handler)) {
+ dev_err(&dd->pdev->dev, "service thread failed to start\n");
+ dd->mtip_svc_handler = NULL;
+ rv = -EFAULT;
+ goto kthread_run_error;
+ }
+
+ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+ rv = wait_for_rebuild;
+
+ return rv;
+
+kthread_run_error:
+ /* Delete our gendisk. This also removes the device from /dev */
+ del_gendisk(dd->disk);
+
+read_capacity_error:
+ blk_cleanup_queue(dd->queue);
+
+block_queue_alloc_init_error:
+disk_index_error:
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, index);
+ spin_unlock(&rssd_index_lock);
+
+ida_get_error:
+ put_disk(dd->disk);
+
+alloc_disk_error:
+ mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+
+protocol_init_error:
+ return rv;
+}
+
+/*
+ * Block layer deinitialization function.
+ *
+ * Called by the PCI layer as each P320 device is removed.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_block_remove(struct driver_data *dd)
+{
+ struct kobject *kobj;
+
+ if (dd->mtip_svc_handler) {
+ set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+ wake_up_interruptible(&dd->port->svc_wait);
+ kthread_stop(dd->mtip_svc_handler);
+ }
+
+ /* Clean up the sysfs attributes, if created */
+ if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_exit(dd, kobj);
+ kobject_put(kobj);
+ }
+ }
+
+ /*
+ * Delete our gendisk structure. This also removes the device
+ * from /dev
+ */
+ del_gendisk(dd->disk);
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+
+ blk_cleanup_queue(dd->queue);
+ dd->disk = NULL;
+ dd->queue = NULL;
+
+ /* De-initialize the protocol layer. */
+ mtip_hw_exit(dd);
+
+ return 0;
+}
+
+/*
+ * Function called by the PCI layer when just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev,
+ "Shutting down %s ...\n", dd->disk->disk_name);
+
+ /* Delete our gendisk structure, and cleanup the blk queue. */
+ del_gendisk(dd->disk);
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+
+ blk_cleanup_queue(dd->queue);
+ dd->disk = NULL;
+ dd->queue = NULL;
+
+ mtip_hw_shutdown(dd);
+ return 0;
+}
+
+static int mtip_block_suspend(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev,
+ "Suspending %s ...\n", dd->disk->disk_name);
+ mtip_hw_suspend(dd);
+ return 0;
+}
+
+static int mtip_block_resume(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev, "Resuming %s ...\n",
+ dd->disk->disk_name);
+ mtip_hw_resume(dd);
+ return 0;
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ * 0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int rv = 0;
+ struct driver_data *dd = NULL;
+
+ /* Allocate memory for this devices private data. */
+ dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL);
+ if (dd == NULL) {
+ dev_err(&pdev->dev,
+ "Unable to allocate memory for driver data\n");
+ return -ENOMEM;
+ }
+
+ /* Attach the private data to this PCI device. */
+ pci_set_drvdata(pdev, dd);
+
+ rv = pcim_enable_device(pdev);
+ if (rv < 0) {
+ dev_err(&pdev->dev, "Unable to enable device\n");
+ goto iomap_err;
+ }
+
+ /* Map BAR5 to memory. */
+ rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+ if (rv < 0) {
+ dev_err(&pdev->dev, "Unable to map regions\n");
+ goto iomap_err;
+ }
+
+ if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+ rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+
+ if (rv) {
+ rv = pci_set_consistent_dma_mask(pdev,
+ DMA_BIT_MASK(32));
+ if (rv) {
+ dev_warn(&pdev->dev,
+ "64-bit DMA enable failed\n");
+ goto setmask_err;
+ }
+ }
+ }
+
+ pci_set_master(pdev);
+
+ if (pci_enable_msi(pdev)) {
+ dev_warn(&pdev->dev,
+ "Unable to enable MSI interrupt.\n");
+ goto block_initialize_err;
+ }
+
+ /* Copy the info we may need later into the private data structure. */
+ dd->major = mtip_major;
+ dd->instance = instance;
+ dd->pdev = pdev;
+
+ /* Initialize the block layer. */
+ rv = mtip_block_initialize(dd);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Unable to initialize block layer\n");
+ goto block_initialize_err;
+ }
+
+ /*
+ * Increment the instance count so that each device has a unique
+ * instance number.
+ */
+ instance++;
+ if (rv != MTIP_FTL_REBUILD_MAGIC)
+ set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+ goto done;
+
+block_initialize_err:
+ pci_disable_msi(pdev);
+
+setmask_err:
+ pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+ kfree(dd);
+ pci_set_drvdata(pdev, NULL);
+ return rv;
+done:
+ return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ * None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+ struct driver_data *dd = pci_get_drvdata(pdev);
+ int counter = 0;
+
+ set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
+ if (mtip_check_surprise_removal(pdev)) {
+ while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
+ counter++;
+ msleep(20);
+ if (counter == 10) {
+ /* Cleanup the outstanding commands */
+ mtip_command_cleanup(dd);
+ break;
+ }
+ }
+ }
+
+ /* Clean up the block layer. */
+ mtip_block_remove(dd);
+
+ pci_disable_msi(pdev);
+
+ kfree(dd);
+ pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ * 0 Success
+ * <0 Error
+ */
+static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
+{
+ int rv = 0;
+ struct driver_data *dd = pci_get_drvdata(pdev);
+
+ if (!dd) {
+ dev_err(&pdev->dev,
+ "Driver private datastructure is NULL\n");
+ return -EFAULT;
+ }
+
+ set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+ /* Disable ports & interrupts then send standby immediate */
+ rv = mtip_block_suspend(dd);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Failed to suspend controller\n");
+ return rv;
+ }
+
+ /*
+ * Save the pci config space to pdev structure &
+ * disable the device
+ */
+ pci_save_state(pdev);
+ pci_disable_device(pdev);
+
+ /* Move to Low power state*/
+ pci_set_power_state(pdev, PCI_D3hot);
+
+ return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ * 0 Success
+ * <0 Error
+ */
+static int mtip_pci_resume(struct pci_dev *pdev)
+{
+ int rv = 0;
+ struct driver_data *dd;
+
+ dd = pci_get_drvdata(pdev);
+ if (!dd) {
+ dev_err(&pdev->dev,
+ "Driver private datastructure is NULL\n");
+ return -EFAULT;
+ }
+
+ /* Move the device to active State */
+ pci_set_power_state(pdev, PCI_D0);
+
+ /* Restore PCI configuration space */
+ pci_restore_state(pdev);
+
+ /* Enable the PCI device*/
+ rv = pcim_enable_device(pdev);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Failed to enable card during resume\n");
+ goto err;
+ }
+ pci_set_master(pdev);
+
+ /*
+ * Calls hbaReset, initPort, & startPort function
+ * then enables interrupts
+ */
+ rv = mtip_block_resume(dd);
+ if (rv < 0)
+ dev_err(&pdev->dev, "Unable to resume\n");
+
+err:
+ clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+ return rv;
+}
+
+/*
+ * Shutdown routine
+ *
+ * return value
+ * None
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+ struct driver_data *dd = pci_get_drvdata(pdev);
+ if (dd)
+ mtip_block_shutdown(dd);
+}
+
+/* Table of device ids supported by this driver. */
+static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = {
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320_DEVICE_ID) },
+ { 0 }
+};
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+ .name = MTIP_DRV_NAME,
+ .id_table = mtip_pci_tbl,
+ .probe = mtip_pci_probe,
+ .remove = mtip_pci_remove,
+ .suspend = mtip_pci_suspend,
+ .resume = mtip_pci_resume,
+ .shutdown = mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ * 0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+ int error;
+
+ printk(KERN_INFO MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+ /* Allocate a major block device number to use with this driver. */
+ error = register_blkdev(0, MTIP_DRV_NAME);
+ if (error <= 0) {
+ printk(KERN_ERR "Unable to register block device (%d)\n",
+ error);
+ return -EBUSY;
+ }
+ mtip_major = error;
+
+ /* Register our PCI operations. */
+ error = pci_register_driver(&mtip_pci_driver);
+ if (error)
+ unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+ return error;
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ * none
+ */
+static void __exit mtip_exit(void)
+{
+ /* Release the allocated major block device number. */
+ unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+ /* Unregister the PCI driver. */
+ pci_unregister_driver(&mtip_pci_driver);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 000000000000..4ef58336310a
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,450 @@
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ * Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ * Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/genhd.h>
+#include <linux/version.h>
+
+/* Offset of Subsystem Device ID in pci confoguration space */
+#define PCI_SUBSYSTEM_DEVICEID 0x2E
+
+/* offset of Device Control register in PCIe extended capabilites space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48
+
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES 2
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000
+#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000
+#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD 500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET 142
+#define MTIP_FTL_REBUILD_MAGIC 0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000
+
+/* Macro to extract the tag bit number from a tag value. */
+#define MTIP_TAG_BIT(tag) (tag & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag) (tag >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG 128
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS 8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL 0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON 0x1344
+#define P320_DEVICE_ID 0x5150
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME "mtip32xx"
+#define MTIP_DRV_VERSION "1.2.6os3"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS 16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS (MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG (sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+ (U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR 5
+
+#ifdef DEBUG
+ #define dbg_printk(format, arg...) \
+ printk(pr_fmt(format), ##arg);
+#else
+ #define dbg_printk(format, arg...)
+#endif
+
+#define __force_bit2int (unsigned int __force)
+
+/* below are bit numbers in 'flags' defined in mtip_port */
+#define MTIP_PF_IC_ACTIVE_BIT 0 /* pio/ioctl */
+#define MTIP_PF_EH_ACTIVE_BIT 1 /* error handling */
+#define MTIP_PF_SE_ACTIVE_BIT 2 /* secure erase */
+#define MTIP_PF_DM_ACTIVE_BIT 3 /* download microcde */
+#define MTIP_PF_PAUSE_IO ((1 << MTIP_PF_IC_ACTIVE_BIT) | \
+ (1 << MTIP_PF_EH_ACTIVE_BIT) | \
+ (1 << MTIP_PF_SE_ACTIVE_BIT) | \
+ (1 << MTIP_PF_DM_ACTIVE_BIT))
+
+#define MTIP_PF_SVC_THD_ACTIVE_BIT 4
+#define MTIP_PF_ISSUE_CMDS_BIT 5
+#define MTIP_PF_REBUILD_BIT 6
+#define MTIP_PF_SVC_THD_STOP_BIT 8
+
+/* below are bit numbers in 'dd_flag' defined in driver_data */
+#define MTIP_DDF_REMOVE_PENDING_BIT 1
+#define MTIP_DDF_OVER_TEMP_BIT 2
+#define MTIP_DDF_WRITE_PROTECT_BIT 3
+#define MTIP_DDF_STOP_IO ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \
+ (1 << MTIP_DDF_OVER_TEMP_BIT) | \
+ (1 << MTIP_DDF_WRITE_PROTECT_BIT))
+
+#define MTIP_DDF_CLEANUP_BIT 5
+#define MTIP_DDF_RESUME_BIT 6
+#define MTIP_DDF_INIT_DONE_BIT 7
+#define MTIP_DDF_REBUILD_FAILED_BIT 8
+
+__packed struct smart_attr{
+ u8 attr_id;
+ u16 flags;
+ u8 cur;
+ u8 worst;
+ u32 data;
+ u8 res[3];
+};
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+ /*
+ * FIS type.
+ * - 27h Register FIS, host to device.
+ * - 34h Register FIS, device to host.
+ * - 39h DMA Activate FIS, device to host.
+ * - 41h DMA Setup FIS, bi-directional.
+ * - 46h Data FIS, bi-directional.
+ * - 58h BIST Activate FIS, bi-directional.
+ * - 5Fh PIO Setup FIS, device to host.
+ * - A1h Set Device Bits FIS, device to host.
+ */
+ unsigned char type;
+ unsigned char opts;
+ unsigned char command;
+ unsigned char features;
+
+ union {
+ unsigned char lba_low;
+ unsigned char sector;
+ };
+ union {
+ unsigned char lba_mid;
+ unsigned char cyl_low;
+ };
+ union {
+ unsigned char lba_hi;
+ unsigned char cyl_hi;
+ };
+ union {
+ unsigned char device;
+ unsigned char head;
+ };
+
+ union {
+ unsigned char lba_low_ex;
+ unsigned char sector_ex;
+ };
+ union {
+ unsigned char lba_mid_ex;
+ unsigned char cyl_low_ex;
+ };
+ union {
+ unsigned char lba_hi_ex;
+ unsigned char cyl_hi_ex;
+ };
+ unsigned char features_ex;
+
+ unsigned char sect_count;
+ unsigned char sect_cnt_ex;
+ unsigned char res2;
+ unsigned char control;
+
+ unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+ /*
+ * Command options.
+ * - Bits 31:16 Number of PRD entries.
+ * - Bits 15:8 Unused in this implementation.
+ * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+ * - Bit 6 Write bit, should be set when writing data to the device.
+ * - Bit 5 Unused in this implementation.
+ * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+ */
+ unsigned int opts;
+ /* This field is unsed when using NCQ. */
+ union {
+ unsigned int byte_count;
+ unsigned int status;
+ };
+ /*
+ * Lower 32 bits of the command table address associated with this
+ * header. The command table addresses must be 128 byte aligned.
+ */
+ unsigned int ctba;
+ /*
+ * If 64 bit addressing is used this field is the upper 32 bits
+ * of the command table address associated with this command.
+ */
+ unsigned int ctbau;
+ /* Reserved and unused. */
+ unsigned int res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+ /*
+ * Low 32 bits of the data buffer address. For P320 this
+ * address must be 8 byte aligned signified by bits 2:0 being
+ * set to 0.
+ */
+ unsigned int dba;
+ /*
+ * When 64 bit addressing is used this field is the upper
+ * 32 bits of the data buffer address.
+ */
+ unsigned int dba_upper;
+ /* Unused. */
+ unsigned int reserved;
+ /*
+ * Bit 31: interrupt when this data block has been transferred.
+ * Bits 30..22: reserved
+ * Bits 21..0: byte count (minus 1). For P320 the byte count must be
+ * 8 byte aligned signified by bits 2:0 being set to 1.
+ */
+ unsigned int info;
+};
+struct mtip_port;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+
+ struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
+
+ dma_addr_t command_header_dma; /* corresponding physical address */
+
+ void *command; /* ptr to command table entry */
+
+ dma_addr_t command_dma; /* corresponding physical address */
+
+ void *comp_data; /* data passed to completion function comp_func() */
+ /*
+ * Completion function called by the ISR upon completion of
+ * a command.
+ */
+ void (*comp_func)(struct mtip_port *port,
+ int tag,
+ void *data,
+ int status);
+ /* Additional callback function that may be called by comp_func() */
+ void (*async_callback)(void *data, int status);
+
+ void *async_data; /* Addl. data passed to async_callback() */
+
+ int scatter_ents; /* Number of scatter list entries used */
+
+ struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+
+ int retries; /* The number of retries left for this command. */
+
+ int direction; /* Data transfer direction */
+
+ unsigned long comp_time; /* command completion time, in jiffies */
+
+ atomic_t active; /* declares if this command sent to the drive. */
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+ /* Pointer back to the driver data for this port. */
+ struct driver_data *dd;
+ /*
+ * Used to determine if the data pointed to by the
+ * identify field is valid.
+ */
+ unsigned long identify_valid;
+ /* Base address of the memory mapped IO for the port. */
+ void __iomem *mmio;
+ /* Array of pointers to the memory mapped s_active registers. */
+ void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+ /* Array of pointers to the memory mapped completed registers. */
+ void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+ /* Array of pointers to the memory mapped Command Issue registers. */
+ void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+ /*
+ * Pointer to the beginning of the command header memory as used
+ * by the driver.
+ */
+ void *command_list;
+ /*
+ * Pointer to the beginning of the command header memory as used
+ * by the DMA.
+ */
+ dma_addr_t command_list_dma;
+ /*
+ * Pointer to the beginning of the RX FIS memory as used
+ * by the driver.
+ */
+ void *rxfis;
+ /*
+ * Pointer to the beginning of the RX FIS memory as used
+ * by the DMA.
+ */
+ dma_addr_t rxfis_dma;
+ /*
+ * Pointer to the beginning of the command table memory as used
+ * by the driver.
+ */
+ void *command_table;
+ /*
+ * Pointer to the beginning of the command table memory as used
+ * by the DMA.
+ */
+ dma_addr_t command_tbl_dma;
+ /*
+ * Pointer to the beginning of the identify data memory as used
+ * by the driver.
+ */
+ u16 *identify;
+ /*
+ * Pointer to the beginning of the identify data memory as used
+ * by the DMA.
+ */
+ dma_addr_t identify_dma;
+ /*
+ * Pointer to the beginning of a sector buffer that is used
+ * by the driver when issuing internal commands.
+ */
+ u16 *sector_buffer;
+ /*
+ * Pointer to the beginning of a sector buffer that is used
+ * by the DMA when the driver issues internal commands.
+ */
+ dma_addr_t sector_buffer_dma;
+ /*
+ * Bit significant, used to determine if a command slot has
+ * been allocated. i.e. the slot is in use. Bits are cleared
+ * when the command slot and all associated data structures
+ * are no longer needed.
+ */
+ u16 *log_buf;
+ dma_addr_t log_buf_dma;
+
+ u8 *smart_buf;
+ dma_addr_t smart_buf_dma;
+
+ unsigned long allocated[SLOTBITS_IN_LONGS];
+ /*
+ * used to queue commands when an internal command is in progress
+ * or error handling is active
+ */
+ unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+ /*
+ * Array of command slots. Structure includes pointers to the
+ * command header and command table, and completion function and data
+ * pointers.
+ */
+ struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
+ /* Used by mtip_service_thread to wait for an event */
+ wait_queue_head_t svc_wait;
+ /*
+ * indicates the state of the port. Also, helps the service thread
+ * to determine its action on wake up.
+ */
+ unsigned long flags;
+ /*
+ * Timer used to complete commands that have been active for too long.
+ */
+ struct timer_list cmd_timer;
+ unsigned long ic_pause_timer;
+ /*
+ * Semaphore used to block threads if there are no
+ * command slots available.
+ */
+ struct semaphore cmd_slot;
+ /* Spinlock for working around command-issue bug. */
+ spinlock_t cmd_issue_lock;
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+ void __iomem *mmio; /* Base address of the HBA registers. */
+
+ int major; /* Major device number. */
+
+ int instance; /* Instance number. First device probed is 0, ... */
+
+ struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+ struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+ struct request_queue *queue; /* Our request queue. */
+
+ struct mtip_port *port; /* Pointer to the port data structure. */
+
+ /* Tasklet used to process the bottom half of the ISR. */
+ struct tasklet_struct tasklet;
+
+ unsigned product_type; /* magic value declaring the product type */
+
+ unsigned slot_groups; /* number of slot groups the product supports */
+
+ unsigned long index; /* Index to determine the disk name */
+
+ unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
+
+ struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+};
+
+#endif
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index c3f0ee16594d..061427a75d37 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -34,12 +34,11 @@
#include <linux/kthread.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/types.h>
#include <linux/nbd.h>
-#define LO_MAGIC 0x68797548
+#define NBD_MAGIC 0x68797548
#ifdef NDEBUG
#define dprintk(flags, fmt...)
@@ -116,7 +115,7 @@ static void nbd_end_request(struct request *req)
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void sock_shutdown(struct nbd_device *lo, int lock)
+static void sock_shutdown(struct nbd_device *nbd, int lock)
{
/* Forcibly shutdown the socket causing all listeners
* to error
@@ -125,14 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock)
* there should be a more generic interface rather than
* calling socket ops directly here */
if (lock)
- mutex_lock(&lo->tx_lock);
- if (lo->sock) {
- dev_warn(disk_to_dev(lo->disk), "shutting down socket\n");
- kernel_sock_shutdown(lo->sock, SHUT_RDWR);
- lo->sock = NULL;
+ mutex_lock(&nbd->tx_lock);
+ if (nbd->sock) {
+ dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+ kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+ nbd->sock = NULL;
}
if (lock)
- mutex_unlock(&lo->tx_lock);
+ mutex_unlock(&nbd->tx_lock);
}
static void nbd_xmit_timeout(unsigned long arg)
@@ -147,17 +146,17 @@ static void nbd_xmit_timeout(unsigned long arg)
/*
* Send or receive packet.
*/
-static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
int msg_flags)
{
- struct socket *sock = lo->sock;
+ struct socket *sock = nbd->sock;
int result;
struct msghdr msg;
struct kvec iov;
sigset_t blocked, oldset;
if (unlikely(!sock)) {
- dev_err(disk_to_dev(lo->disk),
+ dev_err(disk_to_dev(nbd->disk),
"Attempted %s on closed socket in sock_xmit\n",
(send ? "send" : "recv"));
return -EINVAL;
@@ -181,15 +180,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
if (send) {
struct timer_list ti;
- if (lo->xmit_timeout) {
+ if (nbd->xmit_timeout) {
init_timer(&ti);
ti.function = nbd_xmit_timeout;
ti.data = (unsigned long)current;
- ti.expires = jiffies + lo->xmit_timeout;
+ ti.expires = jiffies + nbd->xmit_timeout;
add_timer(&ti);
}
result = kernel_sendmsg(sock, &msg, &iov, 1, size);
- if (lo->xmit_timeout)
+ if (nbd->xmit_timeout)
del_timer_sync(&ti);
} else
result = kernel_recvmsg(sock, &msg, &iov, 1, size,
@@ -201,7 +200,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
task_pid_nr(current), current->comm,
dequeue_signal_lock(current, &current->blocked, &info));
result = -EINTR;
- sock_shutdown(lo, !send);
+ sock_shutdown(nbd, !send);
break;
}
@@ -219,18 +218,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
return result;
}
-static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec,
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
int flags)
{
int result;
void *kaddr = kmap(bvec->bv_page);
- result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags);
+ result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+ bvec->bv_len, flags);
kunmap(bvec->bv_page);
return result;
}
/* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *lo, struct request *req)
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
{
int result, flags;
struct nbd_request request;
@@ -243,14 +243,14 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
memcpy(request.handle, &req, sizeof(req));
dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
- lo->disk->disk_name, req,
+ nbd->disk->disk_name, req,
nbdcmd_to_ascii(nbd_cmd(req)),
(unsigned long long)blk_rq_pos(req) << 9,
blk_rq_bytes(req));
- result = sock_xmit(lo, 1, &request, sizeof(request),
+ result = sock_xmit(nbd, 1, &request, sizeof(request),
(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
if (result <= 0) {
- dev_err(disk_to_dev(lo->disk),
+ dev_err(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
goto error_out;
}
@@ -267,10 +267,10 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
if (!rq_iter_last(req, iter))
flags = MSG_MORE;
dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
- lo->disk->disk_name, req, bvec->bv_len);
- result = sock_send_bvec(lo, bvec, flags);
+ nbd->disk->disk_name, req, bvec->bv_len);
+ result = sock_send_bvec(nbd, bvec, flags);
if (result <= 0) {
- dev_err(disk_to_dev(lo->disk),
+ dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
goto error_out;
@@ -283,25 +283,25 @@ error_out:
return -EIO;
}
-static struct request *nbd_find_request(struct nbd_device *lo,
+static struct request *nbd_find_request(struct nbd_device *nbd,
struct request *xreq)
{
struct request *req, *tmp;
int err;
- err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+ err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
if (unlikely(err))
goto out;
- spin_lock(&lo->queue_lock);
- list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) {
+ spin_lock(&nbd->queue_lock);
+ list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
if (req != xreq)
continue;
list_del_init(&req->queuelist);
- spin_unlock(&lo->queue_lock);
+ spin_unlock(&nbd->queue_lock);
return req;
}
- spin_unlock(&lo->queue_lock);
+ spin_unlock(&nbd->queue_lock);
err = -ENOENT;
@@ -309,78 +309,78 @@ out:
return ERR_PTR(err);
}
-static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
{
int result;
void *kaddr = kmap(bvec->bv_page);
- result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+ result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
MSG_WAITALL);
kunmap(bvec->bv_page);
return result;
}
/* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *lo)
+static struct request *nbd_read_stat(struct nbd_device *nbd)
{
int result;
struct nbd_reply reply;
struct request *req;
reply.magic = 0;
- result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL);
+ result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
if (result <= 0) {
- dev_err(disk_to_dev(lo->disk),
+ dev_err(disk_to_dev(nbd->disk),
"Receive control failed (result %d)\n", result);
goto harderror;
}
if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
- dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n",
+ dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
(unsigned long)ntohl(reply.magic));
result = -EPROTO;
goto harderror;
}
- req = nbd_find_request(lo, *(struct request **)reply.handle);
+ req = nbd_find_request(nbd, *(struct request **)reply.handle);
if (IS_ERR(req)) {
result = PTR_ERR(req);
if (result != -ENOENT)
goto harderror;
- dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n",
+ dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
reply.handle);
result = -EBADR;
goto harderror;
}
if (ntohl(reply.error)) {
- dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n",
+ dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
ntohl(reply.error));
req->errors++;
return req;
}
dprintk(DBG_RX, "%s: request %p: got reply\n",
- lo->disk->disk_name, req);
+ nbd->disk->disk_name, req);
if (nbd_cmd(req) == NBD_CMD_READ) {
struct req_iterator iter;
struct bio_vec *bvec;
rq_for_each_segment(bvec, req, iter) {
- result = sock_recv_bvec(lo, bvec);
+ result = sock_recv_bvec(nbd, bvec);
if (result <= 0) {
- dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n",
+ dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
result);
req->errors++;
return req;
}
dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
- lo->disk->disk_name, req, bvec->bv_len);
+ nbd->disk->disk_name, req, bvec->bv_len);
}
}
return req;
harderror:
- lo->harderror = result;
+ nbd->harderror = result;
return NULL;
}
@@ -398,48 +398,48 @@ static struct device_attribute pid_attr = {
.show = pid_show,
};
-static int nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *nbd)
{
struct request *req;
int ret;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
- lo->pid = task_pid_nr(current);
- ret = device_create_file(disk_to_dev(lo->disk), &pid_attr);
+ nbd->pid = task_pid_nr(current);
+ ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (ret) {
- dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n");
- lo->pid = 0;
+ dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+ nbd->pid = 0;
return ret;
}
- while ((req = nbd_read_stat(lo)) != NULL)
+ while ((req = nbd_read_stat(nbd)) != NULL)
nbd_end_request(req);
- device_remove_file(disk_to_dev(lo->disk), &pid_attr);
- lo->pid = 0;
+ device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+ nbd->pid = 0;
return 0;
}
-static void nbd_clear_que(struct nbd_device *lo)
+static void nbd_clear_que(struct nbd_device *nbd)
{
struct request *req;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
/*
- * Because we have set lo->sock to NULL under the tx_lock, all
+ * Because we have set nbd->sock to NULL under the tx_lock, all
* modifications to the list must have completed by now. For
* the same reason, the active_req must be NULL.
*
* As a consequence, we don't need to take the spin lock while
* purging the list here.
*/
- BUG_ON(lo->sock);
- BUG_ON(lo->active_req);
+ BUG_ON(nbd->sock);
+ BUG_ON(nbd->active_req);
- while (!list_empty(&lo->queue_head)) {
- req = list_entry(lo->queue_head.next, struct request,
+ while (!list_empty(&nbd->queue_head)) {
+ req = list_entry(nbd->queue_head.next, struct request,
queuelist);
list_del_init(&req->queuelist);
req->errors++;
@@ -448,7 +448,7 @@ static void nbd_clear_que(struct nbd_device *lo)
}
-static void nbd_handle_req(struct nbd_device *lo, struct request *req)
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
{
if (req->cmd_type != REQ_TYPE_FS)
goto error_out;
@@ -456,8 +456,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
nbd_cmd(req) = NBD_CMD_READ;
if (rq_data_dir(req) == WRITE) {
nbd_cmd(req) = NBD_CMD_WRITE;
- if (lo->flags & NBD_READ_ONLY) {
- dev_err(disk_to_dev(lo->disk),
+ if (nbd->flags & NBD_READ_ONLY) {
+ dev_err(disk_to_dev(nbd->disk),
"Write on read-only\n");
goto error_out;
}
@@ -465,29 +465,29 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
req->errors = 0;
- mutex_lock(&lo->tx_lock);
- if (unlikely(!lo->sock)) {
- mutex_unlock(&lo->tx_lock);
- dev_err(disk_to_dev(lo->disk),
+ mutex_lock(&nbd->tx_lock);
+ if (unlikely(!nbd->sock)) {
+ mutex_unlock(&nbd->tx_lock);
+ dev_err(disk_to_dev(nbd->disk),
"Attempted send on closed socket\n");
goto error_out;
}
- lo->active_req = req;
+ nbd->active_req = req;
- if (nbd_send_req(lo, req) != 0) {
- dev_err(disk_to_dev(lo->disk), "Request send failed\n");
+ if (nbd_send_req(nbd, req) != 0) {
+ dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
req->errors++;
nbd_end_request(req);
} else {
- spin_lock(&lo->queue_lock);
- list_add(&req->queuelist, &lo->queue_head);
- spin_unlock(&lo->queue_lock);
+ spin_lock(&nbd->queue_lock);
+ list_add(&req->queuelist, &nbd->queue_head);
+ spin_unlock(&nbd->queue_lock);
}
- lo->active_req = NULL;
- mutex_unlock(&lo->tx_lock);
- wake_up_all(&lo->active_wq);
+ nbd->active_req = NULL;
+ mutex_unlock(&nbd->tx_lock);
+ wake_up_all(&nbd->active_wq);
return;
@@ -498,28 +498,28 @@ error_out:
static int nbd_thread(void *data)
{
- struct nbd_device *lo = data;
+ struct nbd_device *nbd = data;
struct request *req;
set_user_nice(current, -20);
- while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) {
+ while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
/* wait for something to do */
- wait_event_interruptible(lo->waiting_wq,
+ wait_event_interruptible(nbd->waiting_wq,
kthread_should_stop() ||
- !list_empty(&lo->waiting_queue));
+ !list_empty(&nbd->waiting_queue));
/* extract request */
- if (list_empty(&lo->waiting_queue))
+ if (list_empty(&nbd->waiting_queue))
continue;
- spin_lock_irq(&lo->queue_lock);
- req = list_entry(lo->waiting_queue.next, struct request,
+ spin_lock_irq(&nbd->queue_lock);
+ req = list_entry(nbd->waiting_queue.next, struct request,
queuelist);
list_del_init(&req->queuelist);
- spin_unlock_irq(&lo->queue_lock);
+ spin_unlock_irq(&nbd->queue_lock);
/* handle request */
- nbd_handle_req(lo, req);
+ nbd_handle_req(nbd, req);
}
return 0;
}
@@ -527,7 +527,7 @@ static int nbd_thread(void *data)
/*
* We always wait for result of write, for now. It would be nice to make it optional
* in future
- * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
* { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
*/
@@ -536,19 +536,19 @@ static void do_nbd_request(struct request_queue *q)
struct request *req;
while ((req = blk_fetch_request(q)) != NULL) {
- struct nbd_device *lo;
+ struct nbd_device *nbd;
spin_unlock_irq(q->queue_lock);
dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
req->rq_disk->disk_name, req, req->cmd_type);
- lo = req->rq_disk->private_data;
+ nbd = req->rq_disk->private_data;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
- if (unlikely(!lo->sock)) {
- dev_err(disk_to_dev(lo->disk),
+ if (unlikely(!nbd->sock)) {
+ dev_err(disk_to_dev(nbd->disk),
"Attempted send on closed socket\n");
req->errors++;
nbd_end_request(req);
@@ -556,11 +556,11 @@ static void do_nbd_request(struct request_queue *q)
continue;
}
- spin_lock_irq(&lo->queue_lock);
- list_add_tail(&req->queuelist, &lo->waiting_queue);
- spin_unlock_irq(&lo->queue_lock);
+ spin_lock_irq(&nbd->queue_lock);
+ list_add_tail(&req->queuelist, &nbd->waiting_queue);
+ spin_unlock_irq(&nbd->queue_lock);
- wake_up(&lo->waiting_wq);
+ wake_up(&nbd->waiting_wq);
spin_lock_irq(q->queue_lock);
}
@@ -568,32 +568,32 @@ static void do_nbd_request(struct request_queue *q)
/* Must be called with tx_lock held */
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
switch (cmd) {
case NBD_DISCONNECT: {
struct request sreq;
- dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n");
+ dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
blk_rq_init(NULL, &sreq);
sreq.cmd_type = REQ_TYPE_SPECIAL;
nbd_cmd(&sreq) = NBD_CMD_DISC;
- if (!lo->sock)
+ if (!nbd->sock)
return -EINVAL;
- nbd_send_req(lo, &sreq);
+ nbd_send_req(nbd, &sreq);
return 0;
}
case NBD_CLEAR_SOCK: {
struct file *file;
- lo->sock = NULL;
- file = lo->file;
- lo->file = NULL;
- nbd_clear_que(lo);
- BUG_ON(!list_empty(&lo->queue_head));
+ nbd->sock = NULL;
+ file = nbd->file;
+ nbd->file = NULL;
+ nbd_clear_que(nbd);
+ BUG_ON(!list_empty(&nbd->queue_head));
if (file)
fput(file);
return 0;
@@ -601,14 +601,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
case NBD_SET_SOCK: {
struct file *file;
- if (lo->file)
+ if (nbd->file)
return -EBUSY;
file = fget(arg);
if (file) {
struct inode *inode = file->f_path.dentry->d_inode;
if (S_ISSOCK(inode->i_mode)) {
- lo->file = file;
- lo->sock = SOCKET_I(inode);
+ nbd->file = file;
+ nbd->sock = SOCKET_I(inode);
if (max_part > 0)
bdev->bd_invalidated = 1;
return 0;
@@ -620,29 +620,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
}
case NBD_SET_BLKSIZE:
- lo->blksize = arg;
- lo->bytesize &= ~(lo->blksize-1);
- bdev->bd_inode->i_size = lo->bytesize;
- set_blocksize(bdev, lo->blksize);
- set_capacity(lo->disk, lo->bytesize >> 9);
+ nbd->blksize = arg;
+ nbd->bytesize &= ~(nbd->blksize-1);
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
return 0;
case NBD_SET_SIZE:
- lo->bytesize = arg & ~(lo->blksize-1);
- bdev->bd_inode->i_size = lo->bytesize;
- set_blocksize(bdev, lo->blksize);
- set_capacity(lo->disk, lo->bytesize >> 9);
+ nbd->bytesize = arg & ~(nbd->blksize-1);
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
return 0;
case NBD_SET_TIMEOUT:
- lo->xmit_timeout = arg * HZ;
+ nbd->xmit_timeout = arg * HZ;
return 0;
case NBD_SET_SIZE_BLOCKS:
- lo->bytesize = ((u64) arg) * lo->blksize;
- bdev->bd_inode->i_size = lo->bytesize;
- set_blocksize(bdev, lo->blksize);
- set_capacity(lo->disk, lo->bytesize >> 9);
+ nbd->bytesize = ((u64) arg) * nbd->blksize;
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
return 0;
case NBD_DO_IT: {
@@ -650,38 +650,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
struct file *file;
int error;
- if (lo->pid)
+ if (nbd->pid)
return -EBUSY;
- if (!lo->file)
+ if (!nbd->file)
return -EINVAL;
- mutex_unlock(&lo->tx_lock);
+ mutex_unlock(&nbd->tx_lock);
- thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+ thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
if (IS_ERR(thread)) {
- mutex_lock(&lo->tx_lock);
+ mutex_lock(&nbd->tx_lock);
return PTR_ERR(thread);
}
wake_up_process(thread);
- error = nbd_do_it(lo);
+ error = nbd_do_it(nbd);
kthread_stop(thread);
- mutex_lock(&lo->tx_lock);
+ mutex_lock(&nbd->tx_lock);
if (error)
return error;
- sock_shutdown(lo, 0);
- file = lo->file;
- lo->file = NULL;
- nbd_clear_que(lo);
- dev_warn(disk_to_dev(lo->disk), "queue cleared\n");
+ sock_shutdown(nbd, 0);
+ file = nbd->file;
+ nbd->file = NULL;
+ nbd_clear_que(nbd);
+ dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
if (file)
fput(file);
- lo->bytesize = 0;
+ nbd->bytesize = 0;
bdev->bd_inode->i_size = 0;
- set_capacity(lo->disk, 0);
+ set_capacity(nbd->disk, 0);
if (max_part > 0)
ioctl_by_bdev(bdev, BLKRRPART, 0);
- return lo->harderror;
+ return nbd->harderror;
}
case NBD_CLEAR_QUE:
@@ -689,14 +689,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
* This is for compatibility only. The queue is always cleared
* by NBD_DO_IT or NBD_CLEAR_SOCK.
*/
- BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
+ BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
return 0;
case NBD_PRINT_DEBUG:
- dev_info(disk_to_dev(lo->disk),
+ dev_info(disk_to_dev(nbd->disk),
"next = %p, prev = %p, head = %p\n",
- lo->queue_head.next, lo->queue_head.prev,
- &lo->queue_head);
+ nbd->queue_head.next, nbd->queue_head.prev,
+ &nbd->queue_head);
return 0;
}
return -ENOTTY;
@@ -705,21 +705,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- struct nbd_device *lo = bdev->bd_disk->private_data;
+ struct nbd_device *nbd = bdev->bd_disk->private_data;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- BUG_ON(lo->magic != LO_MAGIC);
+ BUG_ON(nbd->magic != NBD_MAGIC);
/* Anyone capable of this syscall can do *real bad* things */
dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
- lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+ nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
- mutex_lock(&lo->tx_lock);
- error = __nbd_ioctl(bdev, lo, cmd, arg);
- mutex_unlock(&lo->tx_lock);
+ mutex_lock(&nbd->tx_lock);
+ error = __nbd_ioctl(bdev, nbd, cmd, arg);
+ mutex_unlock(&nbd->tx_lock);
return error;
}
@@ -805,7 +805,7 @@ static int __init nbd_init(void)
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].file = NULL;
- nbd_dev[i].magic = LO_MAGIC;
+ nbd_dev[i].magic = NBD_MAGIC;
nbd_dev[i].flags = 0;
INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
spin_lock_init(&nbd_dev[i].queue_lock);
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
new file mode 100644
index 000000000000..38a2d0631882
--- /dev/null
+++ b/drivers/block/nvme.c
@@ -0,0 +1,1740 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/nvme.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
+
+#define NVME_Q_DEPTH 1024
+#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
+#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
+#define NVME_MINORS 64
+#define NVME_IO_TIMEOUT (5 * HZ)
+#define ADMIN_TIMEOUT (60 * HZ)
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+static struct task_struct *nvme_thread;
+
+/*
+ * Represents an NVM Express device. Each nvme_dev is a PCI function.
+ */
+struct nvme_dev {
+ struct list_head node;
+ struct nvme_queue **queues;
+ u32 __iomem *dbs;
+ struct pci_dev *pci_dev;
+ struct dma_pool *prp_page_pool;
+ struct dma_pool *prp_small_pool;
+ int instance;
+ int queue_count;
+ int db_stride;
+ u32 ctrl_config;
+ struct msix_entry *entry;
+ struct nvme_bar __iomem *bar;
+ struct list_head namespaces;
+ char serial[20];
+ char model[40];
+ char firmware_rev[8];
+};
+
+/*
+ * An NVM Express namespace is equivalent to a SCSI LUN
+ */
+struct nvme_ns {
+ struct list_head list;
+
+ struct nvme_dev *dev;
+ struct request_queue *queue;
+ struct gendisk *disk;
+
+ int ns_id;
+ int lba_shift;
+};
+
+/*
+ * An NVM Express queue. Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+ struct device *q_dmadev;
+ struct nvme_dev *dev;
+ spinlock_t q_lock;
+ struct nvme_command *sq_cmds;
+ volatile struct nvme_completion *cqes;
+ dma_addr_t sq_dma_addr;
+ dma_addr_t cq_dma_addr;
+ wait_queue_head_t sq_full;
+ wait_queue_t sq_cong_wait;
+ struct bio_list sq_cong;
+ u32 __iomem *q_db;
+ u16 q_depth;
+ u16 cq_vector;
+ u16 sq_head;
+ u16 sq_tail;
+ u16 cq_head;
+ u16 cq_phase;
+ unsigned long cmdid_data[];
+};
+
+/*
+ * Check we didin't inadvertently grow the command struct
+ */
+static inline void _nvme_check_size(void)
+{
+ BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
+ BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+}
+
+typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
+ struct nvme_completion *);
+
+struct nvme_cmd_info {
+ nvme_completion_fn fn;
+ void *ctx;
+ unsigned long timeout;
+};
+
+static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+{
+ return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+}
+
+/**
+ * alloc_cmdid() - Allocate a Command ID
+ * @nvmeq: The queue that will be used for this command
+ * @ctx: A pointer that will be passed to the handler
+ * @handler: The function to call on completion
+ *
+ * Allocate a Command ID for a queue. The data passed in will
+ * be passed to the completion handler. This is implemented by using
+ * the bottom two bits of the ctx pointer to store the handler ID.
+ * Passing in a pointer that's not 4-byte aligned will cause a BUG.
+ * We can change this if it becomes a problem.
+ *
+ * May be called with local interrupts disabled and the q_lock held,
+ * or with interrupts enabled and no locks held.
+ */
+static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
+ nvme_completion_fn handler, unsigned timeout)
+{
+ int depth = nvmeq->q_depth - 1;
+ struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+ int cmdid;
+
+ do {
+ cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
+ if (cmdid >= depth)
+ return -EBUSY;
+ } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
+
+ info[cmdid].fn = handler;
+ info[cmdid].ctx = ctx;
+ info[cmdid].timeout = jiffies + timeout;
+ return cmdid;
+}
+
+static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
+ nvme_completion_fn handler, unsigned timeout)
+{
+ int cmdid;
+ wait_event_killable(nvmeq->sq_full,
+ (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
+ return (cmdid < 0) ? -EINTR : cmdid;
+}
+
+/* Special values must be less than 0x1000 */
+#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
+#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
+#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
+#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
+#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
+
+static void special_completion(struct nvme_dev *dev, void *ctx,
+ struct nvme_completion *cqe)
+{
+ if (ctx == CMD_CTX_CANCELLED)
+ return;
+ if (ctx == CMD_CTX_FLUSH)
+ return;
+ if (ctx == CMD_CTX_COMPLETED) {
+ dev_warn(&dev->pci_dev->dev,
+ "completed id %d twice on queue %d\n",
+ cqe->command_id, le16_to_cpup(&cqe->sq_id));
+ return;
+ }
+ if (ctx == CMD_CTX_INVALID) {
+ dev_warn(&dev->pci_dev->dev,
+ "invalid id %d completed on queue %d\n",
+ cqe->command_id, le16_to_cpup(&cqe->sq_id));
+ return;
+ }
+
+ dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held. May not sleep.
+ */
+static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
+ nvme_completion_fn *fn)
+{
+ void *ctx;
+ struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+
+ if (cmdid >= nvmeq->q_depth) {
+ *fn = special_completion;
+ return CMD_CTX_INVALID;
+ }
+ *fn = info[cmdid].fn;
+ ctx = info[cmdid].ctx;
+ info[cmdid].fn = special_completion;
+ info[cmdid].ctx = CMD_CTX_COMPLETED;
+ clear_bit(cmdid, nvmeq->cmdid_data);
+ wake_up(&nvmeq->sq_full);
+ return ctx;
+}
+
+static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
+ nvme_completion_fn *fn)
+{
+ void *ctx;
+ struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+ if (fn)
+ *fn = info[cmdid].fn;
+ ctx = info[cmdid].ctx;
+ info[cmdid].fn = special_completion;
+ info[cmdid].ctx = CMD_CTX_CANCELLED;
+ return ctx;
+}
+
+static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
+{
+ return dev->queues[get_cpu() + 1];
+}
+
+static void put_nvmeq(struct nvme_queue *nvmeq)
+{
+ put_cpu();
+}
+
+/**
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * @nvmeq: The queue to use
+ * @cmd: The command to send
+ *
+ * Safe to use from interrupt context
+ */
+static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+{
+ unsigned long flags;
+ u16 tail;
+ spin_lock_irqsave(&nvmeq->q_lock, flags);
+ tail = nvmeq->sq_tail;
+ memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+ if (++tail == nvmeq->q_depth)
+ tail = 0;
+ writel(tail, nvmeq->q_db);
+ nvmeq->sq_tail = tail;
+ spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+
+ return 0;
+}
+
+/*
+ * The nvme_iod describes the data in an I/O, including the list of PRP
+ * entries. You can't see it in this data structure because C doesn't let
+ * me express that. Use nvme_alloc_iod to ensure there's enough space
+ * allocated to store the PRP list.
+ */
+struct nvme_iod {
+ void *private; /* For the use of the submitter of the I/O */
+ int npages; /* In the PRP list. 0 means small pool in use */
+ int offset; /* Of PRP list */
+ int nents; /* Used in scatterlist */
+ int length; /* Of data, in bytes */
+ dma_addr_t first_dma;
+ struct scatterlist sg[0];
+};
+
+static __le64 **iod_list(struct nvme_iod *iod)
+{
+ return ((void *)iod) + iod->offset;
+}
+
+/*
+ * Will slightly overestimate the number of pages needed. This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size)
+{
+ unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
+ return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static struct nvme_iod *
+nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
+{
+ struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
+ sizeof(__le64 *) * nvme_npages(nbytes) +
+ sizeof(struct scatterlist) * nseg, gfp);
+
+ if (iod) {
+ iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+ iod->npages = -1;
+ iod->length = nbytes;
+ }
+
+ return iod;
+}
+
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+{
+ const int last_prp = PAGE_SIZE / 8 - 1;
+ int i;
+ __le64 **list = iod_list(iod);
+ dma_addr_t prp_dma = iod->first_dma;
+
+ if (iod->npages == 0)
+ dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+ for (i = 0; i < iod->npages; i++) {
+ __le64 *prp_list = list[i];
+ dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
+ dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
+ prp_dma = next_prp_dma;
+ }
+ kfree(iod);
+}
+
+static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
+{
+ struct nvme_queue *nvmeq = get_nvmeq(dev);
+ if (bio_list_empty(&nvmeq->sq_cong))
+ add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+ bio_list_add(&nvmeq->sq_cong, bio);
+ put_nvmeq(nvmeq);
+ wake_up_process(nvme_thread);
+}
+
+static void bio_completion(struct nvme_dev *dev, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct nvme_iod *iod = ctx;
+ struct bio *bio = iod->private;
+ u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+ dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+ bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ nvme_free_iod(dev, iod);
+ if (status) {
+ bio_endio(bio, -EIO);
+ } else if (bio->bi_vcnt > bio->bi_idx) {
+ requeue_bio(dev, bio);
+ } else {
+ bio_endio(bio, 0);
+ }
+}
+
+/* length is in bytes. gfp flags indicates whether we may sleep. */
+static int nvme_setup_prps(struct nvme_dev *dev,
+ struct nvme_common_command *cmd, struct nvme_iod *iod,
+ int total_len, gfp_t gfp)
+{
+ struct dma_pool *pool;
+ int length = total_len;
+ struct scatterlist *sg = iod->sg;
+ int dma_len = sg_dma_len(sg);
+ u64 dma_addr = sg_dma_address(sg);
+ int offset = offset_in_page(dma_addr);
+ __le64 *prp_list;
+ __le64 **list = iod_list(iod);
+ dma_addr_t prp_dma;
+ int nprps, i;
+
+ cmd->prp1 = cpu_to_le64(dma_addr);
+ length -= (PAGE_SIZE - offset);
+ if (length <= 0)
+ return total_len;
+
+ dma_len -= (PAGE_SIZE - offset);
+ if (dma_len) {
+ dma_addr += (PAGE_SIZE - offset);
+ } else {
+ sg = sg_next(sg);
+ dma_addr = sg_dma_address(sg);
+ dma_len = sg_dma_len(sg);
+ }
+
+ if (length <= PAGE_SIZE) {
+ cmd->prp2 = cpu_to_le64(dma_addr);
+ return total_len;
+ }
+
+ nprps = DIV_ROUND_UP(length, PAGE_SIZE);
+ if (nprps <= (256 / 8)) {
+ pool = dev->prp_small_pool;
+ iod->npages = 0;
+ } else {
+ pool = dev->prp_page_pool;
+ iod->npages = 1;
+ }
+
+ prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+ if (!prp_list) {
+ cmd->prp2 = cpu_to_le64(dma_addr);
+ iod->npages = -1;
+ return (total_len - length) + PAGE_SIZE;
+ }
+ list[0] = prp_list;
+ iod->first_dma = prp_dma;
+ cmd->prp2 = cpu_to_le64(prp_dma);
+ i = 0;
+ for (;;) {
+ if (i == PAGE_SIZE / 8) {
+ __le64 *old_prp_list = prp_list;
+ prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+ if (!prp_list)
+ return total_len - length;
+ list[iod->npages++] = prp_list;
+ prp_list[0] = old_prp_list[i - 1];
+ old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+ i = 1;
+ }
+ prp_list[i++] = cpu_to_le64(dma_addr);
+ dma_len -= PAGE_SIZE;
+ dma_addr += PAGE_SIZE;
+ length -= PAGE_SIZE;
+ if (length <= 0)
+ break;
+ if (dma_len > 0)
+ continue;
+ BUG_ON(dma_len < 0);
+ sg = sg_next(sg);
+ dma_addr = sg_dma_address(sg);
+ dma_len = sg_dma_len(sg);
+ }
+
+ return total_len;
+}
+
+/* NVMe scatterlists require no holes in the virtual address */
+#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
+ (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
+
+static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
+ struct bio *bio, enum dma_data_direction dma_dir, int psegs)
+{
+ struct bio_vec *bvec, *bvprv = NULL;
+ struct scatterlist *sg = NULL;
+ int i, old_idx, length = 0, nsegs = 0;
+
+ sg_init_table(iod->sg, psegs);
+ old_idx = bio->bi_idx;
+ bio_for_each_segment(bvec, bio, i) {
+ if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
+ sg->length += bvec->bv_len;
+ } else {
+ if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
+ break;
+ sg = sg ? sg + 1 : iod->sg;
+ sg_set_page(sg, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset);
+ nsegs++;
+ }
+ length += bvec->bv_len;
+ bvprv = bvec;
+ }
+ bio->bi_idx = i;
+ iod->nents = nsegs;
+ sg_mark_end(sg);
+ if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
+ bio->bi_idx = old_idx;
+ return -ENOMEM;
+ }
+ return length;
+}
+
+static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ int cmdid)
+{
+ struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+ memset(cmnd, 0, sizeof(*cmnd));
+ cmnd->common.opcode = nvme_cmd_flush;
+ cmnd->common.command_id = cmdid;
+ cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+
+ return 0;
+}
+
+static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
+{
+ int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
+ special_completion, NVME_IO_TIMEOUT);
+ if (unlikely(cmdid < 0))
+ return cmdid;
+
+ return nvme_submit_flush(nvmeq, ns, cmdid);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held. May not sleep.
+ */
+static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct bio *bio)
+{
+ struct nvme_command *cmnd;
+ struct nvme_iod *iod;
+ enum dma_data_direction dma_dir;
+ int cmdid, length, result = -ENOMEM;
+ u16 control;
+ u32 dsmgmt;
+ int psegs = bio_phys_segments(ns->queue, bio);
+
+ if ((bio->bi_rw & REQ_FLUSH) && psegs) {
+ result = nvme_submit_flush_data(nvmeq, ns);
+ if (result)
+ return result;
+ }
+
+ iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
+ if (!iod)
+ goto nomem;
+ iod->private = bio;
+
+ result = -EBUSY;
+ cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
+ if (unlikely(cmdid < 0))
+ goto free_iod;
+
+ if ((bio->bi_rw & REQ_FLUSH) && !psegs)
+ return nvme_submit_flush(nvmeq, ns, cmdid);
+
+ control = 0;
+ if (bio->bi_rw & REQ_FUA)
+ control |= NVME_RW_FUA;
+ if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+ control |= NVME_RW_LR;
+
+ dsmgmt = 0;
+ if (bio->bi_rw & REQ_RAHEAD)
+ dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+ cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+ memset(cmnd, 0, sizeof(*cmnd));
+ if (bio_data_dir(bio)) {
+ cmnd->rw.opcode = nvme_cmd_write;
+ dma_dir = DMA_TO_DEVICE;
+ } else {
+ cmnd->rw.opcode = nvme_cmd_read;
+ dma_dir = DMA_FROM_DEVICE;
+ }
+
+ result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
+ if (result < 0)
+ goto free_iod;
+ length = result;
+
+ cmnd->rw.command_id = cmdid;
+ cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+ length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
+ GFP_ATOMIC);
+ cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
+ cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
+ cmnd->rw.control = cpu_to_le16(control);
+ cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+ bio->bi_sector += length >> 9;
+
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+
+ return 0;
+
+ free_iod:
+ nvme_free_iod(nvmeq->dev, iod);
+ nomem:
+ return result;
+}
+
+static void nvme_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct nvme_ns *ns = q->queuedata;
+ struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
+ int result = -EBUSY;
+
+ spin_lock_irq(&nvmeq->q_lock);
+ if (bio_list_empty(&nvmeq->sq_cong))
+ result = nvme_submit_bio_queue(nvmeq, ns, bio);
+ if (unlikely(result)) {
+ if (bio_list_empty(&nvmeq->sq_cong))
+ add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+ bio_list_add(&nvmeq->sq_cong, bio);
+ }
+
+ spin_unlock_irq(&nvmeq->q_lock);
+ put_nvmeq(nvmeq);
+}
+
+static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
+{
+ u16 head, phase;
+
+ head = nvmeq->cq_head;
+ phase = nvmeq->cq_phase;
+
+ for (;;) {
+ void *ctx;
+ nvme_completion_fn fn;
+ struct nvme_completion cqe = nvmeq->cqes[head];
+ if ((le16_to_cpu(cqe.status) & 1) != phase)
+ break;
+ nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
+ if (++head == nvmeq->q_depth) {
+ head = 0;
+ phase = !phase;
+ }
+
+ ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
+ fn(nvmeq->dev, ctx, &cqe);
+ }
+
+ /* If the controller ignores the cq head doorbell and continuously
+ * writes to the queue, it is theoretically possible to wrap around
+ * the queue twice and mistakenly return IRQ_NONE. Linux only
+ * requires that 0.1% of your interrupts are handled, so this isn't
+ * a big problem.
+ */
+ if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
+ return IRQ_NONE;
+
+ writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+ nvmeq->cq_head = head;
+ nvmeq->cq_phase = phase;
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t nvme_irq(int irq, void *data)
+{
+ irqreturn_t result;
+ struct nvme_queue *nvmeq = data;
+ spin_lock(&nvmeq->q_lock);
+ result = nvme_process_cq(nvmeq);
+ spin_unlock(&nvmeq->q_lock);
+ return result;
+}
+
+static irqreturn_t nvme_irq_check(int irq, void *data)
+{
+ struct nvme_queue *nvmeq = data;
+ struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
+ if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
+ return IRQ_NONE;
+ return IRQ_WAKE_THREAD;
+}
+
+static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
+{
+ spin_lock_irq(&nvmeq->q_lock);
+ cancel_cmdid(nvmeq, cmdid, NULL);
+ spin_unlock_irq(&nvmeq->q_lock);
+}
+
+struct sync_cmd_info {
+ struct task_struct *task;
+ u32 result;
+ int status;
+};
+
+static void sync_completion(struct nvme_dev *dev, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct sync_cmd_info *cmdinfo = ctx;
+ cmdinfo->result = le32_to_cpup(&cqe->result);
+ cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+ wake_up_process(cmdinfo->task);
+}
+
+/*
+ * Returns 0 on success. If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
+ struct nvme_command *cmd, u32 *result, unsigned timeout)
+{
+ int cmdid;
+ struct sync_cmd_info cmdinfo;
+
+ cmdinfo.task = current;
+ cmdinfo.status = -EINTR;
+
+ cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
+ timeout);
+ if (cmdid < 0)
+ return cmdid;
+ cmd->common.command_id = cmdid;
+
+ set_current_state(TASK_KILLABLE);
+ nvme_submit_cmd(nvmeq, cmd);
+ schedule();
+
+ if (cmdinfo.status == -EINTR) {
+ nvme_abort_command(nvmeq, cmdid);
+ return -EINTR;
+ }
+
+ if (result)
+ *result = cmdinfo.result;
+
+ return cmdinfo.status;
+}
+
+static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+ u32 *result)
+{
+ return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
+}
+
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
+{
+ int status;
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.delete_queue.opcode = opcode;
+ c.delete_queue.qid = cpu_to_le16(id);
+
+ status = nvme_submit_admin_cmd(dev, &c, NULL);
+ if (status)
+ return -EIO;
+ return 0;
+}
+
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+ struct nvme_queue *nvmeq)
+{
+ int status;
+ struct nvme_command c;
+ int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+
+ memset(&c, 0, sizeof(c));
+ c.create_cq.opcode = nvme_admin_create_cq;
+ c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
+ c.create_cq.cqid = cpu_to_le16(qid);
+ c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+ c.create_cq.cq_flags = cpu_to_le16(flags);
+ c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+
+ status = nvme_submit_admin_cmd(dev, &c, NULL);
+ if (status)
+ return -EIO;
+ return 0;
+}
+
+static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+ struct nvme_queue *nvmeq)
+{
+ int status;
+ struct nvme_command c;
+ int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+
+ memset(&c, 0, sizeof(c));
+ c.create_sq.opcode = nvme_admin_create_sq;
+ c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
+ c.create_sq.sqid = cpu_to_le16(qid);
+ c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+ c.create_sq.sq_flags = cpu_to_le16(flags);
+ c.create_sq.cqid = cpu_to_le16(qid);
+
+ status = nvme_submit_admin_cmd(dev, &c, NULL);
+ if (status)
+ return -EIO;
+ return 0;
+}
+
+static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
+{
+ return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
+}
+
+static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
+{
+ return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
+}
+
+static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
+ dma_addr_t dma_addr)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.nsid = cpu_to_le32(nsid);
+ c.identify.prp1 = cpu_to_le64(dma_addr);
+ c.identify.cns = cpu_to_le32(cns);
+
+ return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
+ unsigned dword11, dma_addr_t dma_addr)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.features.opcode = nvme_admin_get_features;
+ c.features.prp1 = cpu_to_le64(dma_addr);
+ c.features.fid = cpu_to_le32(fid);
+ c.features.dword11 = cpu_to_le32(dword11);
+
+ return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
+ unsigned dword11, dma_addr_t dma_addr, u32 *result)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.features.opcode = nvme_admin_set_features;
+ c.features.prp1 = cpu_to_le64(dma_addr);
+ c.features.fid = cpu_to_le32(fid);
+ c.features.dword11 = cpu_to_le32(dword11);
+
+ return nvme_submit_admin_cmd(dev, &c, result);
+}
+
+static void nvme_free_queue(struct nvme_dev *dev, int qid)
+{
+ struct nvme_queue *nvmeq = dev->queues[qid];
+ int vector = dev->entry[nvmeq->cq_vector].vector;
+
+ irq_set_affinity_hint(vector, NULL);
+ free_irq(vector, nvmeq);
+
+ /* Don't tell the adapter to delete the admin queue */
+ if (qid) {
+ adapter_delete_sq(dev, qid);
+ adapter_delete_cq(dev, qid);
+ }
+
+ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+ (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ kfree(nvmeq);
+}
+
+static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
+ int depth, int vector)
+{
+ struct device *dmadev = &dev->pci_dev->dev;
+ unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
+ struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+ if (!nvmeq)
+ return NULL;
+
+ nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
+ &nvmeq->cq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->cqes)
+ goto free_nvmeq;
+ memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
+
+ nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+ &nvmeq->sq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->sq_cmds)
+ goto free_cqdma;
+
+ nvmeq->q_dmadev = dmadev;
+ nvmeq->dev = dev;
+ spin_lock_init(&nvmeq->q_lock);
+ nvmeq->cq_head = 0;
+ nvmeq->cq_phase = 1;
+ init_waitqueue_head(&nvmeq->sq_full);
+ init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
+ bio_list_init(&nvmeq->sq_cong);
+ nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+ nvmeq->q_depth = depth;
+ nvmeq->cq_vector = vector;
+
+ return nvmeq;
+
+ free_cqdma:
+ dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
+ nvmeq->cq_dma_addr);
+ free_nvmeq:
+ kfree(nvmeq);
+ return NULL;
+}
+
+static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ const char *name)
+{
+ if (use_threaded_interrupts)
+ return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
+ nvme_irq_check, nvme_irq,
+ IRQF_DISABLED | IRQF_SHARED,
+ name, nvmeq);
+ return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
+ IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
+}
+
+static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
+ int qid, int cq_size, int vector)
+{
+ int result;
+ struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
+
+ if (!nvmeq)
+ return ERR_PTR(-ENOMEM);
+
+ result = adapter_alloc_cq(dev, qid, nvmeq);
+ if (result < 0)
+ goto free_nvmeq;
+
+ result = adapter_alloc_sq(dev, qid, nvmeq);
+ if (result < 0)
+ goto release_cq;
+
+ result = queue_request_irq(dev, nvmeq, "nvme");
+ if (result < 0)
+ goto release_sq;
+
+ return nvmeq;
+
+ release_sq:
+ adapter_delete_sq(dev, qid);
+ release_cq:
+ adapter_delete_cq(dev, qid);
+ free_nvmeq:
+ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+ (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ kfree(nvmeq);
+ return ERR_PTR(result);
+}
+
+static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
+{
+ int result;
+ u32 aqa;
+ u64 cap;
+ unsigned long timeout;
+ struct nvme_queue *nvmeq;
+
+ dev->dbs = ((void __iomem *)dev->bar) + 4096;
+
+ nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+ if (!nvmeq)
+ return -ENOMEM;
+
+ aqa = nvmeq->q_depth - 1;
+ aqa |= aqa << 16;
+
+ dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
+ dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
+ dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+ dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+
+ writel(0, &dev->bar->cc);
+ writel(aqa, &dev->bar->aqa);
+ writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
+ writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
+ writel(dev->ctrl_config, &dev->bar->cc);
+
+ cap = readq(&dev->bar->cap);
+ timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+ dev->db_stride = NVME_CAP_STRIDE(cap);
+
+ while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
+ msleep(100);
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ if (time_after(jiffies, timeout)) {
+ dev_err(&dev->pci_dev->dev,
+ "Device not ready; aborting initialisation\n");
+ return -ENODEV;
+ }
+ }
+
+ result = queue_request_irq(dev, nvmeq, "nvme admin");
+ dev->queues[0] = nvmeq;
+ return result;
+}
+
+static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
+ unsigned long addr, unsigned length)
+{
+ int i, err, count, nents, offset;
+ struct scatterlist *sg;
+ struct page **pages;
+ struct nvme_iod *iod;
+
+ if (addr & 3)
+ return ERR_PTR(-EINVAL);
+ if (!length)
+ return ERR_PTR(-EINVAL);
+
+ offset = offset_in_page(addr);
+ count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+
+ err = get_user_pages_fast(addr, count, 1, pages);
+ if (err < count) {
+ count = err;
+ err = -EFAULT;
+ goto put_pages;
+ }
+
+ iod = nvme_alloc_iod(count, length, GFP_KERNEL);
+ sg = iod->sg;
+ sg_init_table(sg, count);
+ for (i = 0; i < count; i++) {
+ sg_set_page(&sg[i], pages[i],
+ min_t(int, length, PAGE_SIZE - offset), offset);
+ length -= (PAGE_SIZE - offset);
+ offset = 0;
+ }
+ sg_mark_end(&sg[i - 1]);
+ iod->nents = count;
+
+ err = -ENOMEM;
+ nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
+ write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ if (!nents)
+ goto free_iod;
+
+ kfree(pages);
+ return iod;
+
+ free_iod:
+ kfree(iod);
+ put_pages:
+ for (i = 0; i < count; i++)
+ put_page(pages[i]);
+ kfree(pages);
+ return ERR_PTR(err);
+}
+
+static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
+ struct nvme_iod *iod)
+{
+ int i;
+
+ dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+ write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+ for (i = 0; i < iod->nents; i++)
+ put_page(sg_page(&iod->sg[i]));
+}
+
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_queue *nvmeq;
+ struct nvme_user_io io;
+ struct nvme_command c;
+ unsigned length;
+ int status;
+ struct nvme_iod *iod;
+
+ if (copy_from_user(&io, uio, sizeof(io)))
+ return -EFAULT;
+ length = (io.nblocks + 1) << ns->lba_shift;
+
+ switch (io.opcode) {
+ case nvme_cmd_write:
+ case nvme_cmd_read:
+ case nvme_cmd_compare:
+ iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (IS_ERR(iod))
+ return PTR_ERR(iod);
+
+ memset(&c, 0, sizeof(c));
+ c.rw.opcode = io.opcode;
+ c.rw.flags = io.flags;
+ c.rw.nsid = cpu_to_le32(ns->ns_id);
+ c.rw.slba = cpu_to_le64(io.slba);
+ c.rw.length = cpu_to_le16(io.nblocks);
+ c.rw.control = cpu_to_le16(io.control);
+ c.rw.dsmgmt = cpu_to_le16(io.dsmgmt);
+ c.rw.reftag = io.reftag;
+ c.rw.apptag = io.apptag;
+ c.rw.appmask = io.appmask;
+ /* XXX: metadata */
+ length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
+
+ nvmeq = get_nvmeq(dev);
+ /*
+ * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
+ * disabled. We may be preempted at any point, and be rescheduled
+ * to a different CPU. That will cause cacheline bouncing, but no
+ * additional races since q_lock already protects against other CPUs.
+ */
+ put_nvmeq(nvmeq);
+ if (length != (io.nblocks + 1) << ns->lba_shift)
+ status = -ENOMEM;
+ else
+ status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+
+ nvme_unmap_user_pages(dev, io.opcode & 1, iod);
+ nvme_free_iod(dev, iod);
+ return status;
+}
+
+static int nvme_user_admin_cmd(struct nvme_ns *ns,
+ struct nvme_admin_cmd __user *ucmd)
+{
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_admin_cmd cmd;
+ struct nvme_command c;
+ int status, length;
+ struct nvme_iod *iod;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+ return -EFAULT;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = cmd.opcode;
+ c.common.flags = cmd.flags;
+ c.common.nsid = cpu_to_le32(cmd.nsid);
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+ c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+ c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+ c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+ c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+ c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+ length = cmd.data_len;
+ if (cmd.data_len) {
+ iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
+ length);
+ if (IS_ERR(iod))
+ return PTR_ERR(iod);
+ length = nvme_setup_prps(dev, &c.common, iod, length,
+ GFP_KERNEL);
+ }
+
+ if (length != cmd.data_len)
+ status = -ENOMEM;
+ else
+ status = nvme_submit_admin_cmd(dev, &c, NULL);
+
+ if (cmd.data_len) {
+ nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
+ nvme_free_iod(dev, iod);
+ }
+ return status;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+ unsigned long arg)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+ switch (cmd) {
+ case NVME_IOCTL_ID:
+ return ns->ns_id;
+ case NVME_IOCTL_ADMIN_CMD:
+ return nvme_user_admin_cmd(ns, (void __user *)arg);
+ case NVME_IOCTL_SUBMIT_IO:
+ return nvme_submit_io(ns, (void __user *)arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct block_device_operations nvme_fops = {
+ .owner = THIS_MODULE,
+ .ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_ioctl,
+};
+
+static void nvme_timeout_ios(struct nvme_queue *nvmeq)
+{
+ int depth = nvmeq->q_depth - 1;
+ struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+ unsigned long now = jiffies;
+ int cmdid;
+
+ for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+ void *ctx;
+ nvme_completion_fn fn;
+ static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
+
+ if (!time_after(now, info[cmdid].timeout))
+ continue;
+ dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
+ ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+ fn(nvmeq->dev, ctx, &cqe);
+ }
+}
+
+static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
+{
+ while (bio_list_peek(&nvmeq->sq_cong)) {
+ struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+ struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+ if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
+ bio_list_add_head(&nvmeq->sq_cong, bio);
+ break;
+ }
+ if (bio_list_empty(&nvmeq->sq_cong))
+ remove_wait_queue(&nvmeq->sq_full,
+ &nvmeq->sq_cong_wait);
+ }
+}
+
+static int nvme_kthread(void *data)
+{
+ struct nvme_dev *dev;
+
+ while (!kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ spin_lock(&dev_list_lock);
+ list_for_each_entry(dev, &dev_list, node) {
+ int i;
+ for (i = 0; i < dev->queue_count; i++) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+ if (!nvmeq)
+ continue;
+ spin_lock_irq(&nvmeq->q_lock);
+ if (nvme_process_cq(nvmeq))
+ printk("process_cq did something\n");
+ nvme_timeout_ios(nvmeq);
+ nvme_resubmit_bios(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
+ }
+ }
+ spin_unlock(&dev_list_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ }
+ return 0;
+}
+
+static DEFINE_IDA(nvme_index_ida);
+
+static int nvme_get_ns_idx(void)
+{
+ int index, error;
+
+ do {
+ if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
+ return -1;
+
+ spin_lock(&dev_list_lock);
+ error = ida_get_new(&nvme_index_ida, &index);
+ spin_unlock(&dev_list_lock);
+ } while (error == -EAGAIN);
+
+ if (error)
+ index = -1;
+ return index;
+}
+
+static void nvme_put_ns_idx(int index)
+{
+ spin_lock(&dev_list_lock);
+ ida_remove(&nvme_index_ida, index);
+ spin_unlock(&dev_list_lock);
+}
+
+static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
+ struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
+{
+ struct nvme_ns *ns;
+ struct gendisk *disk;
+ int lbaf;
+
+ if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
+ return NULL;
+
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns)
+ return NULL;
+ ns->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!ns->queue)
+ goto out_free_ns;
+ ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */
+ blk_queue_make_request(ns->queue, nvme_make_request);
+ ns->dev = dev;
+ ns->queue->queuedata = ns;
+
+ disk = alloc_disk(NVME_MINORS);
+ if (!disk)
+ goto out_free_queue;
+ ns->ns_id = nsid;
+ ns->disk = disk;
+ lbaf = id->flbas & 0xf;
+ ns->lba_shift = id->lbaf[lbaf].ds;
+
+ disk->major = nvme_major;
+ disk->minors = NVME_MINORS;
+ disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
+ disk->fops = &nvme_fops;
+ disk->private_data = ns;
+ disk->queue = ns->queue;
+ disk->driverfs_dev = &dev->pci_dev->dev;
+ sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
+ set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+ return ns;
+
+ out_free_queue:
+ blk_cleanup_queue(ns->queue);
+ out_free_ns:
+ kfree(ns);
+ return NULL;
+}
+
+static void nvme_ns_free(struct nvme_ns *ns)
+{
+ int index = ns->disk->first_minor / NVME_MINORS;
+ put_disk(ns->disk);
+ nvme_put_ns_idx(index);
+ blk_cleanup_queue(ns->queue);
+ kfree(ns);
+}
+
+static int set_queue_count(struct nvme_dev *dev, int count)
+{
+ int status;
+ u32 result;
+ u32 q_count = (count - 1) | ((count - 1) << 16);
+
+ status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
+ &result);
+ if (status)
+ return -EIO;
+ return min(result & 0xffff, result >> 16) + 1;
+}
+
+static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
+{
+ int result, cpu, i, nr_io_queues, db_bar_size;
+
+ nr_io_queues = num_online_cpus();
+ result = set_queue_count(dev, nr_io_queues);
+ if (result < 0)
+ return result;
+ if (result < nr_io_queues)
+ nr_io_queues = result;
+
+ /* Deregister the admin queue's interrupt */
+ free_irq(dev->entry[0].vector, dev->queues[0]);
+
+ db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
+ if (db_bar_size > 8192) {
+ iounmap(dev->bar);
+ dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0),
+ db_bar_size);
+ dev->dbs = ((void __iomem *)dev->bar) + 4096;
+ dev->queues[0]->q_db = dev->dbs;
+ }
+
+ for (i = 0; i < nr_io_queues; i++)
+ dev->entry[i].entry = i;
+ for (;;) {
+ result = pci_enable_msix(dev->pci_dev, dev->entry,
+ nr_io_queues);
+ if (result == 0) {
+ break;
+ } else if (result > 0) {
+ nr_io_queues = result;
+ continue;
+ } else {
+ nr_io_queues = 1;
+ break;
+ }
+ }
+
+ result = queue_request_irq(dev, dev->queues[0], "nvme admin");
+ /* XXX: handle failure here */
+
+ cpu = cpumask_first(cpu_online_mask);
+ for (i = 0; i < nr_io_queues; i++) {
+ irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ }
+
+ for (i = 0; i < nr_io_queues; i++) {
+ dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
+ NVME_Q_DEPTH, i);
+ if (IS_ERR(dev->queues[i + 1]))
+ return PTR_ERR(dev->queues[i + 1]);
+ dev->queue_count++;
+ }
+
+ for (; i < num_possible_cpus(); i++) {
+ int target = i % rounddown_pow_of_two(dev->queue_count - 1);
+ dev->queues[i + 1] = dev->queues[target + 1];
+ }
+
+ return 0;
+}
+
+static void nvme_free_queues(struct nvme_dev *dev)
+{
+ int i;
+
+ for (i = dev->queue_count - 1; i >= 0; i--)
+ nvme_free_queue(dev, i);
+}
+
+static int __devinit nvme_dev_add(struct nvme_dev *dev)
+{
+ int res, nn, i;
+ struct nvme_ns *ns, *next;
+ struct nvme_id_ctrl *ctrl;
+ struct nvme_id_ns *id_ns;
+ void *mem;
+ dma_addr_t dma_addr;
+
+ res = nvme_setup_io_queues(dev);
+ if (res)
+ return res;
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
+ GFP_KERNEL);
+
+ res = nvme_identify(dev, 0, 1, dma_addr);
+ if (res) {
+ res = -EIO;
+ goto out_free;
+ }
+
+ ctrl = mem;
+ nn = le32_to_cpup(&ctrl->nn);
+ memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
+ memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
+ memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+
+ id_ns = mem;
+ for (i = 1; i <= nn; i++) {
+ res = nvme_identify(dev, i, 0, dma_addr);
+ if (res)
+ continue;
+
+ if (id_ns->ncap == 0)
+ continue;
+
+ res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
+ dma_addr + 4096);
+ if (res)
+ continue;
+
+ ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
+ if (ns)
+ list_add_tail(&ns->list, &dev->namespaces);
+ }
+ list_for_each_entry(ns, &dev->namespaces, list)
+ add_disk(ns->disk);
+
+ goto out;
+
+ out_free:
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+ list_del(&ns->list);
+ nvme_ns_free(ns);
+ }
+
+ out:
+ dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
+ return res;
+}
+
+static int nvme_dev_remove(struct nvme_dev *dev)
+{
+ struct nvme_ns *ns, *next;
+
+ spin_lock(&dev_list_lock);
+ list_del(&dev->node);
+ spin_unlock(&dev_list_lock);
+
+ /* TODO: wait all I/O finished or cancel them */
+
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+ list_del(&ns->list);
+ del_gendisk(ns->disk);
+ nvme_ns_free(ns);
+ }
+
+ nvme_free_queues(dev);
+
+ return 0;
+}
+
+static int nvme_setup_prp_pools(struct nvme_dev *dev)
+{
+ struct device *dmadev = &dev->pci_dev->dev;
+ dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+ PAGE_SIZE, PAGE_SIZE, 0);
+ if (!dev->prp_page_pool)
+ return -ENOMEM;
+
+ /* Optimisation for I/Os between 4k and 128k */
+ dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+ 256, 256, 0);
+ if (!dev->prp_small_pool) {
+ dma_pool_destroy(dev->prp_page_pool);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+ dma_pool_destroy(dev->prp_page_pool);
+ dma_pool_destroy(dev->prp_small_pool);
+}
+
+/* XXX: Use an ida or something to let remove / add work correctly */
+static void nvme_set_instance(struct nvme_dev *dev)
+{
+ static int instance;
+ dev->instance = instance++;
+}
+
+static void nvme_release_instance(struct nvme_dev *dev)
+{
+}
+
+static int __devinit nvme_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ int bars, result = -ENOMEM;
+ struct nvme_dev *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
+ GFP_KERNEL);
+ if (!dev->entry)
+ goto free;
+ dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
+ GFP_KERNEL);
+ if (!dev->queues)
+ goto free;
+
+ if (pci_enable_device_mem(pdev))
+ goto free;
+ pci_set_master(pdev);
+ bars = pci_select_bars(pdev, IORESOURCE_MEM);
+ if (pci_request_selected_regions(pdev, bars, "nvme"))
+ goto disable;
+
+ INIT_LIST_HEAD(&dev->namespaces);
+ dev->pci_dev = pdev;
+ pci_set_drvdata(pdev, dev);
+ dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
+ dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
+ nvme_set_instance(dev);
+ dev->entry[0].vector = pdev->irq;
+
+ result = nvme_setup_prp_pools(dev);
+ if (result)
+ goto disable_msix;
+
+ dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+ if (!dev->bar) {
+ result = -ENOMEM;
+ goto disable_msix;
+ }
+
+ result = nvme_configure_admin_queue(dev);
+ if (result)
+ goto unmap;
+ dev->queue_count++;
+
+ spin_lock(&dev_list_lock);
+ list_add(&dev->node, &dev_list);
+ spin_unlock(&dev_list_lock);
+
+ result = nvme_dev_add(dev);
+ if (result)
+ goto delete;
+
+ return 0;
+
+ delete:
+ spin_lock(&dev_list_lock);
+ list_del(&dev->node);
+ spin_unlock(&dev_list_lock);
+
+ nvme_free_queues(dev);
+ unmap:
+ iounmap(dev->bar);
+ disable_msix:
+ pci_disable_msix(pdev);
+ nvme_release_instance(dev);
+ nvme_release_prp_pools(dev);
+ disable:
+ pci_disable_device(pdev);
+ pci_release_regions(pdev);
+ free:
+ kfree(dev->queues);
+ kfree(dev->entry);
+ kfree(dev);
+ return result;
+}
+
+static void __devexit nvme_remove(struct pci_dev *pdev)
+{
+ struct nvme_dev *dev = pci_get_drvdata(pdev);
+ nvme_dev_remove(dev);
+ pci_disable_msix(pdev);
+ iounmap(dev->bar);
+ nvme_release_instance(dev);
+ nvme_release_prp_pools(dev);
+ pci_disable_device(pdev);
+ pci_release_regions(pdev);
+ kfree(dev->queues);
+ kfree(dev->entry);
+ kfree(dev);
+}
+
+/* These functions are yet to be implemented */
+#define nvme_error_detected NULL
+#define nvme_dump_registers NULL
+#define nvme_link_reset NULL
+#define nvme_slot_reset NULL
+#define nvme_error_resume NULL
+#define nvme_suspend NULL
+#define nvme_resume NULL
+
+static struct pci_error_handlers nvme_err_handler = {
+ .error_detected = nvme_error_detected,
+ .mmio_enabled = nvme_dump_registers,
+ .link_reset = nvme_link_reset,
+ .slot_reset = nvme_slot_reset,
+ .resume = nvme_error_resume,
+};
+
+/* Move to pci_ids.h later */
+#define PCI_CLASS_STORAGE_EXPRESS 0x010802
+
+static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
+ { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, nvme_id_table);
+
+static struct pci_driver nvme_driver = {
+ .name = "nvme",
+ .id_table = nvme_id_table,
+ .probe = nvme_probe,
+ .remove = __devexit_p(nvme_remove),
+ .suspend = nvme_suspend,
+ .resume = nvme_resume,
+ .err_handler = &nvme_err_handler,
+};
+
+static int __init nvme_init(void)
+{
+ int result = -EBUSY;
+
+ nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+ if (IS_ERR(nvme_thread))
+ return PTR_ERR(nvme_thread);
+
+ nvme_major = register_blkdev(nvme_major, "nvme");
+ if (nvme_major <= 0)
+ goto kill_kthread;
+
+ result = pci_register_driver(&nvme_driver);
+ if (result)
+ goto unregister_blkdev;
+ return 0;
+
+ unregister_blkdev:
+ unregister_blkdev(nvme_major, "nvme");
+ kill_kthread:
+ kthread_stop(nvme_thread);
+ return result;
+}
+
+static void __exit nvme_exit(void)
+{
+ pci_unregister_driver(&nvme_driver);
+ unregister_blkdev(nvme_major, "nvme");
+ kthread_stop(nvme_thread);
+}
+
+MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.8");
+module_init(nvme_init);
+module_exit(nvme_exit);
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index ad124525ac23..ec64e7f5d1ce 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -20,9 +20,6 @@
*/
-/* PARAMETERS */
-static int verbose; /* set this to 1 to see debugging messages and whatnot */
-
#define BACKPACK_VERSION "2.0.2"
#include <linux/module.h>
@@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */
#include "ppc6lnx.c"
#include "paride.h"
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
#define PPCSTRUCT(pi) ((Interface *)(pi->private))
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 46b8136c31bb..ba2b6b5e5910 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -144,7 +144,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
static DEFINE_MUTEX(pcd_mutex);
static DEFINE_SPINLOCK(pcd_lock);
-module_param(verbose, bool, 0644);
+module_param(verbose, int, 0644);
module_param(major, int, 0);
module_param(name, charp, 0);
module_param(nice, int, 0);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 869e7676d46f..831e3ac156e6 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -124,8 +124,9 @@
by default.
*/
+#include <linux/types.h>
-static int verbose = 0;
+static bool verbose = 0;
static int major = PD_MAJOR;
static char *name = PD_NAME;
static int cluster = 64;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f21b520ef419..ec8f9ed6326e 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -118,13 +118,15 @@
#define PF_NAME "pf"
#define PF_UNITS 4
+#include <linux/types.h>
+
/* Here are things one can override from the insmod command.
Most are autoprobed by paride unless set here. Verbose is off
by default.
*/
-static int verbose = 0;
+static bool verbose = 0;
static int major = PF_MAJOR;
static char *name = PF_NAME;
static int cluster = 64;
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index a79fb4f7ff62..4a27b1de5fcb 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -130,13 +130,14 @@
#define PI_PG 4
#endif
+#include <linux/types.h>
/* Here are things one can override from the insmod command.
Most are autoprobed by paride unless set here. Verbose is 0
by default.
*/
-static int verbose = 0;
+static bool verbose = 0;
static int major = PG_MAJOR;
static char *name = PG_NAME;
static int disable = 0;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 7179f79d7468..2596042eb987 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -109,13 +109,15 @@
#define PT_NAME "pt"
#define PT_UNITS 4
+#include <linux/types.h>
+
/* Here are things one can override from the insmod command.
Most are autoprobed by paride unless set here. Verbose is on
by default.
*/
-static int verbose = 0;
+static bool verbose = 0;
static int major = PT_MAJOR;
static char *name = PT_NAME;
static int disable = 0;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index d59edeabd93f..ba66e4445f41 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -987,14 +987,14 @@ static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct pag
while (copy_size > 0) {
struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
- void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) +
+ void *vfrom = kmap_atomic(src_bvl->bv_page) +
src_bvl->bv_offset + offs;
void *vto = page_address(dst_page) + dst_offs;
int len = min_t(int, copy_size, src_bvl->bv_len - offs);
BUG_ON(len < 0);
memcpy(vto, vfrom, len);
- kunmap_atomic(vfrom, KM_USER0);
+ kunmap_atomic(vfrom);
seg++;
offs = 0;
@@ -1019,10 +1019,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
offs = 0;
for (f = 0; f < pkt->frames; f++) {
if (bvec[f].bv_page != pkt->pages[p]) {
- void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset;
+ void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
void *vto = page_address(pkt->pages[p]) + offs;
memcpy(vto, vfrom, CD_FRAMESIZE);
- kunmap_atomic(vfrom, KM_USER0);
+ kunmap_atomic(vfrom);
bvec[f].bv_page = pkt->pages[p];
bvec[f].bv_offset = offs;
} else {
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 148ab944378d..013c7a549fb6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -41,19 +41,35 @@
#include "rbd_types.h"
-#define DRV_NAME "rbd"
-#define DRV_NAME_LONG "rbd (rados block device)"
+/*
+ * The basic unit of block I/O is a sector. It is interpreted in a
+ * number of contexts in Linux (blk, bio, genhd), but the default is
+ * universally 512 bytes. These symbols are just slightly more
+ * meaningful than the bare numbers they represent.
+ */
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
+
+#define RBD_DRV_NAME "rbd"
+#define RBD_DRV_NAME_LONG "rbd (rados block device)"
#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
-#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN 64
#define RBD_MAX_SNAP_NAME_LEN 32
#define RBD_MAX_OPT_LEN 1024
#define RBD_SNAP_HEAD_NAME "-"
+/*
+ * An RBD device name will be "rbd#", where the "rbd" comes from
+ * RBD_DRV_NAME above, and # is a unique integer identifier.
+ * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
+ * enough to hold all possible device names.
+ */
#define DEV_NAME_LEN 32
+#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
@@ -66,7 +82,6 @@ struct rbd_image_header {
__u8 obj_order;
__u8 crypt_type;
__u8 comp_type;
- struct rw_semaphore snap_rwsem;
struct ceph_snap_context *snapc;
size_t snap_names_len;
u64 snap_seq;
@@ -83,7 +98,7 @@ struct rbd_options {
};
/*
- * an instance of the client. multiple devices may share a client.
+ * an instance of the client. multiple devices may share an rbd client.
*/
struct rbd_client {
struct ceph_client *client;
@@ -92,20 +107,9 @@ struct rbd_client {
struct list_head node;
};
-struct rbd_req_coll;
-
/*
- * a single io request
+ * a request completion status
*/
-struct rbd_request {
- struct request *rq; /* blk layer request */
- struct bio *bio; /* cloned bio */
- struct page **pages; /* list of used pages */
- u64 len;
- int coll_index;
- struct rbd_req_coll *coll;
-};
-
struct rbd_req_status {
int done;
int rc;
@@ -122,6 +126,18 @@ struct rbd_req_coll {
struct rbd_req_status status[0];
};
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+ int coll_index;
+ struct rbd_req_coll *coll;
+};
+
struct rbd_snap {
struct device dev;
const char *name;
@@ -140,7 +156,6 @@ struct rbd_device {
struct gendisk *disk; /* blkdev's gendisk and rq */
struct request_queue *q;
- struct ceph_client *client;
struct rbd_client *rbd_client;
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -157,6 +172,8 @@ struct rbd_device {
struct ceph_osd_event *watch_event;
struct ceph_osd_request *watch_request;
+ /* protects updating the header */
+ struct rw_semaphore header_rwsem;
char snap_name[RBD_MAX_SNAP_NAME_LEN];
u32 cur_snap; /* index+1 of current snapshot within snap context
0 - for the head */
@@ -171,15 +188,13 @@ struct rbd_device {
struct device dev;
};
-static struct bus_type rbd_bus_type = {
- .name = "rbd",
-};
-
-static spinlock_t node_lock; /* protects client get/put */
-
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+
static LIST_HEAD(rbd_dev_list); /* devices */
-static LIST_HEAD(rbd_client_list); /* clients */
+static DEFINE_SPINLOCK(rbd_dev_list_lock);
+
+static LIST_HEAD(rbd_client_list); /* clients */
+static DEFINE_SPINLOCK(rbd_client_list_lock);
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
@@ -190,12 +205,32 @@ static ssize_t rbd_snap_add(struct device *dev,
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
struct rbd_snap *snap);
+static ssize_t rbd_add(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
+ size_t count);
+
+static struct bus_attribute rbd_bus_attrs[] = {
+ __ATTR(add, S_IWUSR, NULL, rbd_add),
+ __ATTR(remove, S_IWUSR, NULL, rbd_remove),
+ __ATTR_NULL
+};
-static struct rbd_device *dev_to_rbd(struct device *dev)
+static struct bus_type rbd_bus_type = {
+ .name = "rbd",
+ .bus_attrs = rbd_bus_attrs,
+};
+
+static void rbd_root_dev_release(struct device *dev)
{
- return container_of(dev, struct rbd_device, dev);
}
+static struct device rbd_root_dev = {
+ .init_name = "rbd",
+ .release = rbd_root_dev_release,
+};
+
+
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
return get_device(&rbd_dev->dev);
@@ -210,8 +245,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev);
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
- struct gendisk *disk = bdev->bd_disk;
- struct rbd_device *rbd_dev = disk->private_data;
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
rbd_get_dev(rbd_dev);
@@ -256,9 +290,11 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
kref_init(&rbdc->kref);
INIT_LIST_HEAD(&rbdc->node);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
if (IS_ERR(rbdc->client))
- goto out_rbdc;
+ goto out_mutex;
opt = NULL; /* Now rbdc->client is responsible for opt */
ret = ceph_open_session(rbdc->client);
@@ -267,16 +303,19 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
rbdc->rbd_opts = rbd_opts;
- spin_lock(&node_lock);
+ spin_lock(&rbd_client_list_lock);
list_add_tail(&rbdc->node, &rbd_client_list);
- spin_unlock(&node_lock);
+ spin_unlock(&rbd_client_list_lock);
+
+ mutex_unlock(&ctl_mutex);
dout("rbd_client_create created %p\n", rbdc);
return rbdc;
out_err:
ceph_destroy_client(rbdc->client);
-out_rbdc:
+out_mutex:
+ mutex_unlock(&ctl_mutex);
kfree(rbdc);
out_opt:
if (opt)
@@ -324,7 +363,7 @@ static int parse_rbd_opts_token(char *c, void *private)
substring_t argstr[MAX_OPT_ARGS];
int token, intval, ret;
- token = match_token((char *)c, rbdopt_tokens, argstr);
+ token = match_token(c, rbdopt_tokens, argstr);
if (token < 0)
return -EINVAL;
@@ -357,64 +396,61 @@ static int parse_rbd_opts_token(char *c, void *private)
* Get a ceph client with specific addr and configuration, if one does
* not exist create it.
*/
-static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
- char *options)
+static struct rbd_client *rbd_get_client(const char *mon_addr,
+ size_t mon_addr_len,
+ char *options)
{
struct rbd_client *rbdc;
struct ceph_options *opt;
- int ret;
struct rbd_options *rbd_opts;
rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
if (!rbd_opts)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
- ret = ceph_parse_options(&opt, options, mon_addr,
- mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
- if (ret < 0)
- goto done_err;
+ opt = ceph_parse_options(options, mon_addr,
+ mon_addr + mon_addr_len,
+ parse_rbd_opts_token, rbd_opts);
+ if (IS_ERR(opt)) {
+ kfree(rbd_opts);
+ return ERR_CAST(opt);
+ }
- spin_lock(&node_lock);
+ spin_lock(&rbd_client_list_lock);
rbdc = __rbd_client_find(opt);
if (rbdc) {
- ceph_destroy_options(opt);
-
/* using an existing client */
kref_get(&rbdc->kref);
- rbd_dev->rbd_client = rbdc;
- rbd_dev->client = rbdc->client;
- spin_unlock(&node_lock);
- return 0;
+ spin_unlock(&rbd_client_list_lock);
+
+ ceph_destroy_options(opt);
+ kfree(rbd_opts);
+
+ return rbdc;
}
- spin_unlock(&node_lock);
+ spin_unlock(&rbd_client_list_lock);
rbdc = rbd_client_create(opt, rbd_opts);
- if (IS_ERR(rbdc)) {
- ret = PTR_ERR(rbdc);
- goto done_err;
- }
- rbd_dev->rbd_client = rbdc;
- rbd_dev->client = rbdc->client;
- return 0;
-done_err:
- kfree(rbd_opts);
- return ret;
+ if (IS_ERR(rbdc))
+ kfree(rbd_opts);
+
+ return rbdc;
}
/*
* Destroy ceph client
+ *
+ * Caller must hold rbd_client_list_lock.
*/
static void rbd_client_release(struct kref *kref)
{
struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
dout("rbd_release_client %p\n", rbdc);
- spin_lock(&node_lock);
list_del(&rbdc->node);
- spin_unlock(&node_lock);
ceph_destroy_client(rbdc->client);
kfree(rbdc->rbd_opts);
@@ -427,9 +463,10 @@ static void rbd_client_release(struct kref *kref)
*/
static void rbd_put_client(struct rbd_device *rbd_dev)
{
+ spin_lock(&rbd_client_list_lock);
kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+ spin_unlock(&rbd_client_list_lock);
rbd_dev->rbd_client = NULL;
- rbd_dev->client = NULL;
}
/*
@@ -454,21 +491,19 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
gfp_t gfp_flags)
{
int i;
- u32 snap_count = le32_to_cpu(ondisk->snap_count);
- int ret = -ENOMEM;
+ u32 snap_count;
- if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) {
+ if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
return -ENXIO;
- }
- init_rwsem(&header->snap_rwsem);
- header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ snap_count = le32_to_cpu(ondisk->snap_count);
header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
- snap_count *
- sizeof(struct rbd_image_snap_ondisk),
+ snap_count * sizeof (*ondisk),
gfp_flags);
if (!header->snapc)
return -ENOMEM;
+
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
if (snap_count) {
header->snap_names = kmalloc(header->snap_names_len,
GFP_KERNEL);
@@ -495,8 +530,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
header->snapc->num_snaps = snap_count;
header->total_snaps = snap_count;
- if (snap_count &&
- allocated_snaps == snap_count) {
+ if (snap_count && allocated_snaps == snap_count) {
for (i = 0; i < snap_count; i++) {
header->snapc->snaps[i] =
le64_to_cpu(ondisk->snaps[i].id);
@@ -515,7 +549,7 @@ err_names:
kfree(header->snap_names);
err_snapc:
kfree(header->snapc);
- return ret;
+ return -ENOMEM;
}
static int snap_index(struct rbd_image_header *header, int snap_num)
@@ -539,35 +573,34 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
int i;
char *p = header->snap_names;
- for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
- if (strcmp(snap_name, p) == 0)
- break;
- }
- if (i == header->total_snaps)
- return -ENOENT;
- if (seq)
- *seq = header->snapc->snaps[i];
+ for (i = 0; i < header->total_snaps; i++) {
+ if (!strcmp(snap_name, p)) {
- if (size)
- *size = header->snap_sizes[i];
+ /* Found it. Pass back its id and/or size */
- return i;
+ if (seq)
+ *seq = header->snapc->snaps[i];
+ if (size)
+ *size = header->snap_sizes[i];
+ return i;
+ }
+ p += strlen(p) + 1; /* Skip ahead to the next name */
+ }
+ return -ENOENT;
}
-static int rbd_header_set_snap(struct rbd_device *dev,
- const char *snap_name,
- u64 *size)
+static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
{
struct rbd_image_header *header = &dev->header;
struct ceph_snap_context *snapc = header->snapc;
int ret = -ENOENT;
- down_write(&header->snap_rwsem);
+ BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
- if (!snap_name ||
- !*snap_name ||
- strcmp(snap_name, "-") == 0 ||
- strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+ down_write(&dev->header_rwsem);
+
+ if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
+ sizeof (RBD_SNAP_HEAD_NAME))) {
if (header->total_snaps)
snapc->seq = header->snap_seq;
else
@@ -577,7 +610,7 @@ static int rbd_header_set_snap(struct rbd_device *dev,
if (size)
*size = header->image_size;
} else {
- ret = snap_by_name(header, snap_name, &snapc->seq, size);
+ ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
if (ret < 0)
goto done;
@@ -587,7 +620,7 @@ static int rbd_header_set_snap(struct rbd_device *dev,
ret = 0;
done:
- up_write(&header->snap_rwsem);
+ up_write(&dev->header_rwsem);
return ret;
}
@@ -714,7 +747,7 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
/* split the bio. We'll release it either in the next
call, or it will have to be released outside */
- bp = bio_split(old_chain, (len - total) / 512ULL);
+ bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
if (!bp)
goto err_out;
@@ -854,7 +887,7 @@ static int rbd_do_request(struct request *rq,
struct timespec mtime = CURRENT_TIME;
struct rbd_request *req_data;
struct ceph_osd_request_head *reqhead;
- struct rbd_image_header *header = &dev->header;
+ struct ceph_osd_client *osdc;
req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
if (!req_data) {
@@ -871,15 +904,13 @@ static int rbd_do_request(struct request *rq,
dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
- down_read(&header->snap_rwsem);
+ down_read(&dev->header_rwsem);
- req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
- snapc,
- ops,
- false,
- GFP_NOIO, pages, bio);
+ osdc = &dev->rbd_client->client->osdc;
+ req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+ false, GFP_NOIO, pages, bio);
if (!req) {
- up_read(&header->snap_rwsem);
+ up_read(&dev->header_rwsem);
ret = -ENOMEM;
goto done_pages;
}
@@ -906,27 +937,27 @@ static int rbd_do_request(struct request *rq,
layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
layout->fl_pg_preferred = cpu_to_le32(-1);
layout->fl_pg_pool = cpu_to_le32(dev->poolid);
- ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
- ofs, &len, &bno, req, ops);
+ ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
+ req, ops);
ceph_osdc_build_request(req, ofs, &len,
ops,
snapc,
&mtime,
req->r_oid, req->r_oid_len);
- up_read(&header->snap_rwsem);
+ up_read(&dev->header_rwsem);
if (linger_req) {
- ceph_osdc_set_request_linger(&dev->client->osdc, req);
+ ceph_osdc_set_request_linger(osdc, req);
*linger_req = req;
}
- ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+ ret = ceph_osdc_start_request(osdc, req, false);
if (ret < 0)
goto done_err;
if (!rbd_cb) {
- ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
if (ver)
*ver = le64_to_cpu(req->r_reassert_version.version);
dout("reassert_ver=%lld\n",
@@ -1210,8 +1241,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
rc = __rbd_update_snaps(dev);
mutex_unlock(&ctl_mutex);
if (rc)
- pr_warning(DRV_NAME "%d got notification but failed to update"
- " snaps: %d\n", dev->major, rc);
+ pr_warning(RBD_DRV_NAME "%d got notification but failed to "
+ " update snaps: %d\n", dev->major, rc);
rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}
@@ -1224,7 +1255,7 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
u64 ver)
{
struct ceph_osd_req_op *ops;
- struct ceph_osd_client *osdc = &dev->client->osdc;
+ struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
if (ret < 0)
@@ -1311,7 +1342,7 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
const char *obj)
{
struct ceph_osd_req_op *ops;
- struct ceph_osd_client *osdc = &dev->client->osdc;
+ struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
struct ceph_osd_event *event;
struct rbd_notify_info info;
int payload_len = sizeof(u32) + sizeof(u32);
@@ -1418,9 +1449,7 @@ static void rbd_rq_fn(struct request_queue *q)
struct request *rq;
struct bio_pair *bp = NULL;
- rq = blk_fetch_request(q);
-
- while (1) {
+ while ((rq = blk_fetch_request(q))) {
struct bio *bio;
struct bio *rq_bio, *next_bio = NULL;
bool do_write;
@@ -1438,32 +1467,32 @@ static void rbd_rq_fn(struct request_queue *q)
/* filter out block requests we don't understand */
if ((rq->cmd_type != REQ_TYPE_FS)) {
__blk_end_request_all(rq, 0);
- goto next;
+ continue;
}
/* deduce our operation (read, write) */
do_write = (rq_data_dir(rq) == WRITE);
size = blk_rq_bytes(rq);
- ofs = blk_rq_pos(rq) * 512ULL;
+ ofs = blk_rq_pos(rq) * SECTOR_SIZE;
rq_bio = rq->bio;
if (do_write && rbd_dev->read_only) {
__blk_end_request_all(rq, -EROFS);
- goto next;
+ continue;
}
spin_unlock_irq(q->queue_lock);
dout("%s 0x%x bytes at 0x%llx\n",
do_write ? "write" : "read",
- size, blk_rq_pos(rq) * 512ULL);
+ size, blk_rq_pos(rq) * SECTOR_SIZE);
num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
coll = rbd_alloc_coll(num_segs);
if (!coll) {
spin_lock_irq(q->queue_lock);
__blk_end_request_all(rq, -ENOMEM);
- goto next;
+ continue;
}
do {
@@ -1509,8 +1538,6 @@ next_seg:
if (bp)
bio_pair_release(bp);
spin_lock_irq(q->queue_lock);
-next:
- rq = blk_fetch_request(q);
}
}
@@ -1523,13 +1550,17 @@ static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
struct bio_vec *bvec)
{
struct rbd_device *rbd_dev = q->queuedata;
- unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
- sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
- unsigned int bio_sectors = bmd->bi_size >> 9;
+ unsigned int chunk_sectors;
+ sector_t sector;
+ unsigned int bio_sectors;
int max;
+ chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+ sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+ bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
+
max = (chunk_sectors - ((sector & (chunk_sectors - 1))
- + bio_sectors)) << 9;
+ + bio_sectors)) << SECTOR_SHIFT;
if (max < 0)
max = 0; /* bio_add cannot handle a negative return */
if (max <= bvec->bv_len && bio_sectors == 0)
@@ -1562,15 +1593,16 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
ssize_t rc;
struct rbd_image_header_ondisk *dh;
int snap_count = 0;
- u64 snap_names_len = 0;
u64 ver;
+ size_t len;
+ /*
+ * First reads the fixed-size header to determine the number
+ * of snapshots, then re-reads it, along with all snapshot
+ * records as well as their stored names.
+ */
+ len = sizeof (*dh);
while (1) {
- int len = sizeof(*dh) +
- snap_count * sizeof(struct rbd_image_snap_ondisk) +
- snap_names_len;
-
- rc = -ENOMEM;
dh = kmalloc(len, GFP_KERNEL);
if (!dh)
return -ENOMEM;
@@ -1585,21 +1617,22 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
if (rc < 0) {
- if (rc == -ENXIO) {
+ if (rc == -ENXIO)
pr_warning("unrecognized header format"
" for image %s", rbd_dev->obj);
- }
goto out_dh;
}
- if (snap_count != header->total_snaps) {
- snap_count = header->total_snaps;
- snap_names_len = header->snap_names_len;
- rbd_header_free(header);
- kfree(dh);
- continue;
- }
- break;
+ if (snap_count == header->total_snaps)
+ break;
+
+ snap_count = header->total_snaps;
+ len = sizeof (*dh) +
+ snap_count * sizeof(struct rbd_image_snap_ondisk) +
+ header->snap_names_len;
+
+ rbd_header_free(header);
+ kfree(dh);
}
header->obj_version = ver;
@@ -1620,13 +1653,14 @@ static int rbd_header_add_snap(struct rbd_device *dev,
int ret;
void *data, *p, *e;
u64 ver;
+ struct ceph_mon_client *monc;
/* we should create a snapshot only if we're pointing at the head */
if (dev->cur_snap)
return -EINVAL;
- ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
- &new_snapid);
+ monc = &dev->rbd_client->client->monc;
+ ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
dout("created snapid=%lld\n", new_snapid);
if (ret < 0)
return ret;
@@ -1681,9 +1715,9 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev)
return ret;
/* resized? */
- set_capacity(rbd_dev->disk, h.image_size / 512ULL);
+ set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
- down_write(&rbd_dev->header.snap_rwsem);
+ down_write(&rbd_dev->header_rwsem);
snap_seq = rbd_dev->header.snapc->seq;
if (rbd_dev->header.total_snaps &&
@@ -1708,7 +1742,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev)
ret = __rbd_init_snaps_header(rbd_dev);
- up_write(&rbd_dev->header.snap_rwsem);
+ up_write(&rbd_dev->header_rwsem);
return ret;
}
@@ -1718,6 +1752,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
struct gendisk *disk;
struct request_queue *q;
int rc;
+ u64 segment_size;
u64 total_size = 0;
/* contact OSD, request size info about the object being mapped */
@@ -1730,7 +1765,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (rc)
return rc;
- rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+ rc = rbd_header_set_snap(rbd_dev, &total_size);
if (rc)
return rc;
@@ -1740,7 +1775,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (!disk)
goto out;
- snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
+ snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
rbd_dev->id);
disk->major = rbd_dev->major;
disk->first_minor = 0;
@@ -1753,11 +1788,15 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (!q)
goto out_disk;
+ /* We use the default size, but let's be explicit about it. */
+ blk_queue_physical_block_size(q, SECTOR_SIZE);
+
/* set io sizes to object size */
- blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
- blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
- blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
- blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));
+ segment_size = rbd_obj_bytes(&rbd_dev->header);
+ blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+ blk_queue_max_segment_size(q, segment_size);
+ blk_queue_io_min(q, segment_size);
+ blk_queue_io_opt(q, segment_size);
blk_queue_merge_bvec(q, rbd_merge_bvec);
disk->queue = q;
@@ -1768,7 +1807,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->q = q;
/* finally, announce the disk to the world */
- set_capacity(disk, total_size / 512ULL);
+ set_capacity(disk, total_size / SECTOR_SIZE);
add_disk(disk);
pr_info("%s: added with size 0x%llx\n",
@@ -1785,10 +1824,15 @@ out:
sysfs
*/
+static struct rbd_device *dev_to_rbd_dev(struct device *dev)
+{
+ return container_of(dev, struct rbd_device, dev);
+}
+
static ssize_t rbd_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
}
@@ -1796,7 +1840,7 @@ static ssize_t rbd_size_show(struct device *dev,
static ssize_t rbd_major_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%d\n", rbd_dev->major);
}
@@ -1804,15 +1848,16 @@ static ssize_t rbd_major_show(struct device *dev,
static ssize_t rbd_client_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
+ return sprintf(buf, "client%lld\n",
+ ceph_client_id(rbd_dev->rbd_client->client));
}
static ssize_t rbd_pool_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
@@ -1820,7 +1865,7 @@ static ssize_t rbd_pool_show(struct device *dev,
static ssize_t rbd_name_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%s\n", rbd_dev->obj);
}
@@ -1829,7 +1874,7 @@ static ssize_t rbd_snap_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%s\n", rbd_dev->snap_name);
}
@@ -1839,7 +1884,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
const char *buf,
size_t size)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
int rc;
int ret = size;
@@ -1904,7 +1949,7 @@ static ssize_t rbd_snap_size_show(struct device *dev,
{
struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
- return sprintf(buf, "%lld\n", (long long)snap->size);
+ return sprintf(buf, "%zd\n", snap->size);
}
static ssize_t rbd_snap_id_show(struct device *dev,
@@ -1913,7 +1958,7 @@ static ssize_t rbd_snap_id_show(struct device *dev,
{
struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
- return sprintf(buf, "%lld\n", (long long)snap->id);
+ return sprintf(buf, "%llu\n", (unsigned long long) snap->id);
}
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
@@ -2085,19 +2130,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
return 0;
}
-
-static void rbd_root_dev_release(struct device *dev)
-{
-}
-
-static struct device rbd_root_dev = {
- .init_name = "rbd",
- .release = rbd_root_dev_release,
-};
-
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
- int ret = -ENOMEM;
+ int ret;
struct device *dev;
struct rbd_snap *snap;
@@ -2111,7 +2146,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
dev_set_name(dev, "%d", rbd_dev->id);
ret = device_register(dev);
if (ret < 0)
- goto done_free;
+ goto out;
list_for_each_entry(snap, &rbd_dev->snaps, node) {
ret = rbd_register_snap_dev(rbd_dev, snap,
@@ -2119,10 +2154,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
if (ret < 0)
break;
}
-
- mutex_unlock(&ctl_mutex);
- return 0;
-done_free:
+out:
mutex_unlock(&ctl_mutex);
return ret;
}
@@ -2151,102 +2183,250 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
return ret;
}
+static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
+
+/*
+ * Get a unique rbd identifier for the given new rbd_dev, and add
+ * the rbd_dev to the global list. The minimum rbd id is 1.
+ */
+static void rbd_id_get(struct rbd_device *rbd_dev)
+{
+ rbd_dev->id = atomic64_inc_return(&rbd_id_max);
+
+ spin_lock(&rbd_dev_list_lock);
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+ spin_unlock(&rbd_dev_list_lock);
+}
+
+/*
+ * Remove an rbd_dev from the global list, and record that its
+ * identifier is no longer in use.
+ */
+static void rbd_id_put(struct rbd_device *rbd_dev)
+{
+ struct list_head *tmp;
+ int rbd_id = rbd_dev->id;
+ int max_id;
+
+ BUG_ON(rbd_id < 1);
+
+ spin_lock(&rbd_dev_list_lock);
+ list_del_init(&rbd_dev->node);
+
+ /*
+ * If the id being "put" is not the current maximum, there
+ * is nothing special we need to do.
+ */
+ if (rbd_id != atomic64_read(&rbd_id_max)) {
+ spin_unlock(&rbd_dev_list_lock);
+ return;
+ }
+
+ /*
+ * We need to update the current maximum id. Search the
+ * list to find out what it is. We're more likely to find
+ * the maximum at the end, so search the list backward.
+ */
+ max_id = 0;
+ list_for_each_prev(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_id > max_id)
+ max_id = rbd_id;
+ }
+ spin_unlock(&rbd_dev_list_lock);
+
+ /*
+ * The max id could have been updated by rbd_id_get(), in
+ * which case it now accurately reflects the new maximum.
+ * Be careful not to overwrite the maximum value in that
+ * case.
+ */
+ atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
+}
+
+/*
+ * Skips over white space at *buf, and updates *buf to point to the
+ * first found non-space character (if any). Returns the length of
+ * the token (string of non-white space characters) found. Note
+ * that *buf must be terminated with '\0'.
+ */
+static inline size_t next_token(const char **buf)
+{
+ /*
+ * These are the characters that produce nonzero for
+ * isspace() in the "C" and "POSIX" locales.
+ */
+ const char *spaces = " \f\n\r\t\v";
+
+ *buf += strspn(*buf, spaces); /* Find start of token */
+
+ return strcspn(*buf, spaces); /* Return token length */
+}
+
+/*
+ * Finds the next token in *buf, and if the provided token buffer is
+ * big enough, copies the found token into it. The result, if
+ * copied, is guaranteed to be terminated with '\0'. Note that *buf
+ * must be terminated with '\0' on entry.
+ *
+ * Returns the length of the token found (not including the '\0').
+ * Return value will be 0 if no token is found, and it will be >=
+ * token_size if the token would not fit.
+ *
+ * The *buf pointer will be updated to point beyond the end of the
+ * found token. Note that this occurs even if the token buffer is
+ * too small to hold it.
+ */
+static inline size_t copy_token(const char **buf,
+ char *token,
+ size_t token_size)
+{
+ size_t len;
+
+ len = next_token(buf);
+ if (len < token_size) {
+ memcpy(token, *buf, len);
+ *(token + len) = '\0';
+ }
+ *buf += len;
+
+ return len;
+}
+
+/*
+ * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
+ * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
+ * on the list of monitor addresses and other options provided via
+ * /sys/bus/rbd/add.
+ */
+static int rbd_add_parse_args(struct rbd_device *rbd_dev,
+ const char *buf,
+ const char **mon_addrs,
+ size_t *mon_addrs_size,
+ char *options,
+ size_t options_size)
+{
+ size_t len;
+
+ /* The first four tokens are required */
+
+ len = next_token(&buf);
+ if (!len)
+ return -EINVAL;
+ *mon_addrs_size = len + 1;
+ *mon_addrs = buf;
+
+ buf += len;
+
+ len = copy_token(&buf, options, options_size);
+ if (!len || len >= options_size)
+ return -EINVAL;
+
+ len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name));
+ if (!len || len >= sizeof (rbd_dev->pool_name))
+ return -EINVAL;
+
+ len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
+ if (!len || len >= sizeof (rbd_dev->obj))
+ return -EINVAL;
+
+ /* We have the object length in hand, save it. */
+
+ rbd_dev->obj_len = len;
+
+ BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
+ < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
+ sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
+
+ /*
+ * The snapshot name is optional, but it's an error if it's
+ * too long. If no snapshot is supplied, fill in the default.
+ */
+ len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
+ if (!len)
+ memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+ sizeof (RBD_SNAP_HEAD_NAME));
+ else if (len >= sizeof (rbd_dev->snap_name))
+ return -EINVAL;
+
+ return 0;
+}
+
static ssize_t rbd_add(struct bus_type *bus,
const char *buf,
size_t count)
{
- struct ceph_osd_client *osdc;
struct rbd_device *rbd_dev;
- ssize_t rc = -ENOMEM;
- int irc, new_id = 0;
- struct list_head *tmp;
- char *mon_dev_name;
- char *options;
+ const char *mon_addrs = NULL;
+ size_t mon_addrs_size = 0;
+ char *options = NULL;
+ struct ceph_osd_client *osdc;
+ int rc = -ENOMEM;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
- mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
- if (!mon_dev_name)
- goto err_out_mod;
-
- options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
- if (!options)
- goto err_mon_dev;
-
- /* new rbd_device object */
rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
if (!rbd_dev)
- goto err_out_opt;
+ goto err_nomem;
+ options = kmalloc(count, GFP_KERNEL);
+ if (!options)
+ goto err_nomem;
/* static rbd_device initialization */
spin_lock_init(&rbd_dev->lock);
INIT_LIST_HEAD(&rbd_dev->node);
INIT_LIST_HEAD(&rbd_dev->snaps);
+ init_rwsem(&rbd_dev->header_rwsem);
- /* generate unique id: find highest unique id, add one */
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &rbd_dev_list) {
- struct rbd_device *rbd_dev;
+ init_rwsem(&rbd_dev->header_rwsem);
- rbd_dev = list_entry(tmp, struct rbd_device, node);
- if (rbd_dev->id >= new_id)
- new_id = rbd_dev->id + 1;
- }
-
- rbd_dev->id = new_id;
+ /* generate unique id: find highest unique id, add one */
+ rbd_id_get(rbd_dev);
- /* add to global list */
- list_add_tail(&rbd_dev->node, &rbd_dev_list);
+ /* Fill in the device name, now that we have its id. */
+ BUILD_BUG_ON(DEV_NAME_LEN
+ < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
+ sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
/* parse add command */
- if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
- "%" __stringify(RBD_MAX_OPT_LEN) "s "
- "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
- "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
- "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
- mon_dev_name, options, rbd_dev->pool_name,
- rbd_dev->obj, rbd_dev->snap_name) < 4) {
- rc = -EINVAL;
- goto err_out_slot;
- }
-
- if (rbd_dev->snap_name[0] == 0)
- rbd_dev->snap_name[0] = '-';
-
- rbd_dev->obj_len = strlen(rbd_dev->obj);
- snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
- rbd_dev->obj, RBD_SUFFIX);
-
- /* initialize rest of new object */
- snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
- rc = rbd_get_client(rbd_dev, mon_dev_name, options);
- if (rc < 0)
- goto err_out_slot;
+ rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
+ options, count);
+ if (rc)
+ goto err_put_id;
- mutex_unlock(&ctl_mutex);
+ rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
+ options);
+ if (IS_ERR(rbd_dev->rbd_client)) {
+ rc = PTR_ERR(rbd_dev->rbd_client);
+ goto err_put_id;
+ }
/* pick the pool */
- osdc = &rbd_dev->client->osdc;
+ osdc = &rbd_dev->rbd_client->client->osdc;
rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
if (rc < 0)
goto err_out_client;
rbd_dev->poolid = rc;
/* register our block device */
- irc = register_blkdev(0, rbd_dev->name);
- if (irc < 0) {
- rc = irc;
+ rc = register_blkdev(0, rbd_dev->name);
+ if (rc < 0)
goto err_out_client;
- }
- rbd_dev->major = irc;
+ rbd_dev->major = rc;
rc = rbd_bus_add_dev(rbd_dev);
if (rc)
goto err_out_blkdev;
- /* set up and announce blkdev mapping */
+ /*
+ * At this point cleanup in the event of an error is the job
+ * of the sysfs code (initiated by rbd_bus_del_dev()).
+ *
+ * Set up and announce blkdev mapping.
+ */
rc = rbd_init_disk(rbd_dev);
if (rc)
goto err_out_bus;
@@ -2258,35 +2438,26 @@ static ssize_t rbd_add(struct bus_type *bus,
return count;
err_out_bus:
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- list_del_init(&rbd_dev->node);
- mutex_unlock(&ctl_mutex);
-
/* this will also clean up rest of rbd_dev stuff */
rbd_bus_del_dev(rbd_dev);
kfree(options);
- kfree(mon_dev_name);
return rc;
err_out_blkdev:
unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
rbd_put_client(rbd_dev);
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-err_out_slot:
- list_del_init(&rbd_dev->node);
- mutex_unlock(&ctl_mutex);
-
- kfree(rbd_dev);
-err_out_opt:
+err_put_id:
+ rbd_id_put(rbd_dev);
+err_nomem:
kfree(options);
-err_mon_dev:
- kfree(mon_dev_name);
-err_out_mod:
+ kfree(rbd_dev);
+
dout("Error adding device %s\n", buf);
module_put(THIS_MODULE);
- return rc;
+
+ return (ssize_t) rc;
}
static struct rbd_device *__rbd_get_dev(unsigned long id)
@@ -2294,22 +2465,28 @@ static struct rbd_device *__rbd_get_dev(unsigned long id)
struct list_head *tmp;
struct rbd_device *rbd_dev;
+ spin_lock(&rbd_dev_list_lock);
list_for_each(tmp, &rbd_dev_list) {
rbd_dev = list_entry(tmp, struct rbd_device, node);
- if (rbd_dev->id == id)
+ if (rbd_dev->id == id) {
+ spin_unlock(&rbd_dev_list_lock);
return rbd_dev;
+ }
}
+ spin_unlock(&rbd_dev_list_lock);
return NULL;
}
static void rbd_dev_release(struct device *dev)
{
- struct rbd_device *rbd_dev =
- container_of(dev, struct rbd_device, dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- if (rbd_dev->watch_request)
- ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
+ if (rbd_dev->watch_request) {
+ struct ceph_client *client = rbd_dev->rbd_client->client;
+
+ ceph_osdc_unregister_linger_request(&client->osdc,
rbd_dev->watch_request);
+ }
if (rbd_dev->watch_event)
rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
@@ -2318,6 +2495,9 @@ static void rbd_dev_release(struct device *dev)
/* clean up and free blkdev */
rbd_free_disk(rbd_dev);
unregister_blkdev(rbd_dev->major, rbd_dev->name);
+
+ /* done with the id, and with the rbd_dev */
+ rbd_id_put(rbd_dev);
kfree(rbd_dev);
/* release module ref */
@@ -2350,8 +2530,6 @@ static ssize_t rbd_remove(struct bus_type *bus,
goto done;
}
- list_del_init(&rbd_dev->node);
-
__rbd_remove_all_snaps(rbd_dev);
rbd_bus_del_dev(rbd_dev);
@@ -2365,7 +2543,7 @@ static ssize_t rbd_snap_add(struct device *dev,
const char *buf,
size_t count)
{
- struct rbd_device *rbd_dev = dev_to_rbd(dev);
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
int ret;
char *name = kmalloc(count + 1, GFP_KERNEL);
if (!name)
@@ -2401,12 +2579,6 @@ err_unlock:
return ret;
}
-static struct bus_attribute rbd_bus_attrs[] = {
- __ATTR(add, S_IWUSR, NULL, rbd_add),
- __ATTR(remove, S_IWUSR, NULL, rbd_remove),
- __ATTR_NULL
-};
-
/*
* create control files in sysfs
* /sys/bus/rbd/...
@@ -2415,21 +2587,21 @@ static int rbd_sysfs_init(void)
{
int ret;
- rbd_bus_type.bus_attrs = rbd_bus_attrs;
-
- ret = bus_register(&rbd_bus_type);
- if (ret < 0)
+ ret = device_register(&rbd_root_dev);
+ if (ret < 0)
return ret;
- ret = device_register(&rbd_root_dev);
+ ret = bus_register(&rbd_bus_type);
+ if (ret < 0)
+ device_unregister(&rbd_root_dev);
return ret;
}
static void rbd_sysfs_cleanup(void)
{
- device_unregister(&rbd_root_dev);
bus_unregister(&rbd_bus_type);
+ device_unregister(&rbd_root_dev);
}
int __init rbd_init(void)
@@ -2439,8 +2611,7 @@ int __init rbd_init(void)
rc = rbd_sysfs_init();
if (rc)
return rc;
- spin_lock_init(&node_lock);
- pr_info("loaded " DRV_NAME_LONG "\n");
+ pr_info("loaded " RBD_DRV_NAME_LONG "\n");
return 0;
}
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index fc6c678aa2cb..950708688f17 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -41,10 +41,6 @@
#define RBD_HEADER_SIGNATURE "RBD"
#define RBD_HEADER_VERSION "001.005"
-struct rbd_info {
- __le64 max_id;
-} __attribute__ ((packed));
-
struct rbd_image_snap_ondisk {
__le64 id;
__le64 image_size;
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 48e8fee9f2d4..9dcf76a10bb6 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -839,10 +839,7 @@ static struct vio_driver vdc_port_driver = {
.id_table = vdc_port_match,
.probe = vdc_port_probe,
.remove = vdc_port_remove,
- .driver = {
- .name = "vdc_port",
- .owner = THIS_MODULE,
- }
+ .name = "vdc_port",
};
static int __init vdc_init(void)
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index b70f0fca9a42..3fb6ab4c8b4e 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
host->state == HST_DEV_SCAN);
spin_unlock_irq(&host->lock);
- DPRINTK("blk_insert_request, tag == %u\n", idx);
- blk_insert_request(host->oob_q, crq->rq, 1, crq);
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+ crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+ crq->rq->special = crq;
+ blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
return 0;
@@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
BUG_ON(rc < 0);
crq->msg_bucket = (u32) rc;
- DPRINTK("blk_insert_request, tag == %u\n", idx);
- blk_insert_request(host->oob_q, crq->rq, 1, crq);
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+ crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+ crq->rq->special = crq;
+ blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
return 0;
}
@@ -1116,7 +1120,7 @@ static inline void carm_handle_resp(struct carm_host *host,
break;
case MISC_GET_FW_VER: {
struct carm_fw_ver *ver = (struct carm_fw_ver *)
- mem + sizeof(struct carm_msg_get_fw_ver);
+ (mem + sizeof(struct carm_msg_get_fw_ver));
if (!error) {
host->fw_ver = le32_to_cpu(ver->version);
host->flags |= (ver->features & FL_FW_VER_MASK);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 0e376d46bdd1..fcec0225ac76 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -119,43 +119,6 @@
/*
*/
-
-/* command block wrapper */
-struct bulk_cb_wrap {
- __le32 Signature; /* contains 'USBC' */
- u32 Tag; /* unique per command id */
- __le32 DataTransferLength; /* size of data */
- u8 Flags; /* direction in bit 0 */
- u8 Lun; /* LUN */
- u8 Length; /* of of the CDB */
- u8 CDB[UB_MAX_CDB_SIZE]; /* max command */
-};
-
-#define US_BULK_CB_WRAP_LEN 31
-#define US_BULK_CB_SIGN 0x43425355 /*spells out USBC */
-#define US_BULK_FLAG_IN 1
-#define US_BULK_FLAG_OUT 0
-
-/* command status wrapper */
-struct bulk_cs_wrap {
- __le32 Signature; /* should = 'USBS' */
- u32 Tag; /* same as original command */
- __le32 Residue; /* amount not transferred */
- u8 Status; /* see below */
-};
-
-#define US_BULK_CS_WRAP_LEN 13
-#define US_BULK_CS_SIGN 0x53425355 /* spells out 'USBS' */
-#define US_BULK_STAT_OK 0
-#define US_BULK_STAT_FAIL 1
-#define US_BULK_STAT_PHASE 2
-
-/* bulk-only class specific requests */
-#define US_BULK_RESET_REQUEST 0xff
-#define US_BULK_GET_MAX_LUN 0xfe
-
-/*
- */
struct ub_dev;
#define UB_MAX_REQ_SG 9 /* cdrecord requires 32KB and maybe a header */
@@ -1744,12 +1707,11 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- struct gendisk *disk = bdev->bd_disk;
void __user *usermem = (void __user *) arg;
int ret;
mutex_lock(&ub_mutex);
- ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
+ ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, usermem);
mutex_unlock(&ub_mutex);
return ret;
@@ -2478,6 +2440,8 @@ static int __init ub_init(void)
int rc;
int i;
+ pr_info("'Low Performance USB Block' driver is deprecated. "
+ "Please switch to usb-storage\n");
for (i = 0; i < UB_QLOCK_NUM; i++)
spin_lock_init(&ub_qlockv[i]);
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
deleted file mode 100644
index 9a5b2a2d616d..000000000000
--- a/drivers/block/viodasd.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/* -*- linux-c -*-
- * viodasd.c
- * Authors: Dave Boutcher <boutcher@us.ibm.com>
- * Ryan Arnold <ryanarn@us.ibm.com>
- * Colin Devilbiss <devilbis@us.ibm.com>
- * Stephen Rothwell
- *
- * (C) Copyright 2000-2004 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * This routine provides access to disk space (termed "DASD" in historical
- * IBM terms) owned and managed by an OS/400 partition running on the
- * same box as this Linux partition.
- *
- * All disk operations are performed by sending messages back and forth to
- * the OS/400 partition.
- */
-
-#define pr_fmt(fmt) "viod: " fmt
-
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/device.h>
-#include <linux/scatterlist.h>
-
-#include <asm/uaccess.h>
-#include <asm/vio.h>
-#include <asm/iseries/hv_types.h>
-#include <asm/iseries/hv_lp_event.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/iseries/vio.h>
-#include <asm/firmware.h>
-
-MODULE_DESCRIPTION("iSeries Virtual DASD");
-MODULE_AUTHOR("Dave Boutcher");
-MODULE_LICENSE("GPL");
-
-/*
- * We only support 7 partitions per physical disk....so with minor
- * numbers 0-255 we get a maximum of 32 disks.
- */
-#define VIOD_GENHD_NAME "iseries/vd"
-
-#define VIOD_VERS "1.64"
-
-enum {
- PARTITION_SHIFT = 3,
- MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS,
- MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
-};
-
-static DEFINE_MUTEX(viodasd_mutex);
-static DEFINE_SPINLOCK(viodasd_spinlock);
-
-#define VIOMAXREQ 16
-
-#define DEVICE_NO(cell) ((struct viodasd_device *)(cell) - &viodasd_devices[0])
-
-struct viodasd_waitevent {
- struct completion com;
- int rc;
- u16 sub_result;
- int max_disk; /* open */
-};
-
-static const struct vio_error_entry viodasd_err_table[] = {
- { 0x0201, EINVAL, "Invalid Range" },
- { 0x0202, EINVAL, "Invalid Token" },
- { 0x0203, EIO, "DMA Error" },
- { 0x0204, EIO, "Use Error" },
- { 0x0205, EIO, "Release Error" },
- { 0x0206, EINVAL, "Invalid Disk" },
- { 0x0207, EBUSY, "Can't Lock" },
- { 0x0208, EIO, "Already Locked" },
- { 0x0209, EIO, "Already Unlocked" },
- { 0x020A, EIO, "Invalid Arg" },
- { 0x020B, EIO, "Bad IFS File" },
- { 0x020C, EROFS, "Read Only Device" },
- { 0x02FF, EIO, "Internal Error" },
- { 0x0000, 0, NULL },
-};
-
-/*
- * Figure out the biggest I/O request (in sectors) we can accept
- */
-#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA)
-
-/*
- * Number of disk I/O requests we've sent to OS/400
- */
-static int num_req_outstanding;
-
-/*
- * This is our internal structure for keeping track of disk devices
- */
-struct viodasd_device {
- u16 cylinders;
- u16 tracks;
- u16 sectors;
- u16 bytes_per_sector;
- u64 size;
- int read_only;
- spinlock_t q_lock;
- struct gendisk *disk;
- struct device *dev;
-} viodasd_devices[MAX_DISKNO];
-
-/*
- * External open entry point.
- */
-static int viodasd_open(struct block_device *bdev, fmode_t mode)
-{
- struct viodasd_device *d = bdev->bd_disk->private_data;
- HvLpEvent_Rc hvrc;
- struct viodasd_waitevent we;
- u16 flags = 0;
-
- if (d->read_only) {
- if (mode & FMODE_WRITE)
- return -EROFS;
- flags = vioblockflags_ro;
- }
-
- init_completion(&we.com);
-
- /* Send the open event to OS/400 */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockopen,
- HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- (u64)(unsigned long)&we, VIOVERSION << 16,
- ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32),
- 0, 0, 0);
- if (hvrc != 0) {
- pr_warning("HV open failed %d\n", (int)hvrc);
- return -EIO;
- }
-
- wait_for_completion(&we.com);
-
- /* Check the return code */
- if (we.rc != 0) {
- const struct vio_error_entry *err =
- vio_lookup_rc(viodasd_err_table, we.sub_result);
-
- pr_warning("bad rc opening disk: %d:0x%04x (%s)\n",
- (int)we.rc, we.sub_result, err->msg);
- return -EIO;
- }
-
- return 0;
-}
-
-static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
-{
- int ret;
-
- mutex_lock(&viodasd_mutex);
- ret = viodasd_open(bdev, mode);
- mutex_unlock(&viodasd_mutex);
-
- return ret;
-}
-
-
-/*
- * External release entry point.
- */
-static int viodasd_release(struct gendisk *disk, fmode_t mode)
-{
- struct viodasd_device *d = disk->private_data;
- HvLpEvent_Rc hvrc;
-
- mutex_lock(&viodasd_mutex);
- /* Send the event to OS/400. We DON'T expect a response */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockclose,
- HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- 0, VIOVERSION << 16,
- ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */,
- 0, 0, 0);
- if (hvrc != 0)
- pr_warning("HV close call failed %d\n", (int)hvrc);
-
- mutex_unlock(&viodasd_mutex);
-
- return 0;
-}
-
-
-/* External ioctl entry point.
- */
-static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct gendisk *disk = bdev->bd_disk;
- struct viodasd_device *d = disk->private_data;
-
- geo->sectors = d->sectors ? d->sectors : 32;
- geo->heads = d->tracks ? d->tracks : 64;
- geo->cylinders = d->cylinders ? d->cylinders :
- get_capacity(disk) / (geo->sectors * geo->heads);
-
- return 0;
-}
-
-/*
- * Our file operations table
- */
-static const struct block_device_operations viodasd_fops = {
- .owner = THIS_MODULE,
- .open = viodasd_unlocked_open,
- .release = viodasd_release,
- .getgeo = viodasd_getgeo,
-};
-
-/*
- * End a request
- */
-static void viodasd_end_request(struct request *req, int error,
- int num_sectors)
-{
- __blk_end_request(req, error, num_sectors << 9);
-}
-
-/*
- * Send an actual I/O request to OS/400
- */
-static int send_request(struct request *req)
-{
- u64 start;
- int direction;
- int nsg;
- u16 viocmd;
- HvLpEvent_Rc hvrc;
- struct vioblocklpevent *bevent;
- struct HvLpEvent *hev;
- struct scatterlist sg[VIOMAXBLOCKDMA];
- int sgindex;
- struct viodasd_device *d;
- unsigned long flags;
-
- start = (u64)blk_rq_pos(req) << 9;
-
- if (rq_data_dir(req) == READ) {
- direction = DMA_FROM_DEVICE;
- viocmd = viomajorsubtype_blockio | vioblockread;
- } else {
- direction = DMA_TO_DEVICE;
- viocmd = viomajorsubtype_blockio | vioblockwrite;
- }
-
- d = req->rq_disk->private_data;
-
- /* Now build the scatter-gather list */
- sg_init_table(sg, VIOMAXBLOCKDMA);
- nsg = blk_rq_map_sg(req->q, req, sg);
- nsg = dma_map_sg(d->dev, sg, nsg, direction);
-
- spin_lock_irqsave(&viodasd_spinlock, flags);
- num_req_outstanding++;
-
- /* This optimization handles a single DMA block */
- if (nsg == 1)
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo, viocmd,
- HvLpEvent_AckInd_DoAck,
- HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- (u64)(unsigned long)req, VIOVERSION << 16,
- ((u64)DEVICE_NO(d) << 48), start,
- ((u64)sg_dma_address(&sg[0])) << 32,
- sg_dma_len(&sg[0]));
- else {
- bevent = (struct vioblocklpevent *)
- vio_get_event_buffer(viomajorsubtype_blockio);
- if (bevent == NULL) {
- pr_warning("error allocating disk event buffer\n");
- goto error_ret;
- }
-
- /*
- * Now build up the actual request. Note that we store
- * the pointer to the request in the correlation
- * token so we can match the response up later
- */
- memset(bevent, 0, sizeof(struct vioblocklpevent));
- hev = &bevent->event;
- hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK |
- HV_LP_EVENT_INT;
- hev->xType = HvLpEvent_Type_VirtualIo;
- hev->xSubtype = viocmd;
- hev->xSourceLp = HvLpConfig_getLpIndex();
- hev->xTargetLp = viopath_hostLp;
- hev->xSizeMinus1 =
- offsetof(struct vioblocklpevent, u.rw_data.dma_info) +
- (sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1;
- hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp);
- hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp);
- hev->xCorrelationToken = (u64)req;
- bevent->version = VIOVERSION;
- bevent->disk = DEVICE_NO(d);
- bevent->u.rw_data.offset = start;
-
- /*
- * Copy just the dma information from the sg list
- * into the request
- */
- for (sgindex = 0; sgindex < nsg; sgindex++) {
- bevent->u.rw_data.dma_info[sgindex].token =
- sg_dma_address(&sg[sgindex]);
- bevent->u.rw_data.dma_info[sgindex].len =
- sg_dma_len(&sg[sgindex]);
- }
-
- /* Send the request */
- hvrc = HvCallEvent_signalLpEvent(&bevent->event);
- vio_free_event_buffer(viomajorsubtype_blockio, bevent);
- }
-
- if (hvrc != HvLpEvent_Rc_Good) {
- pr_warning("error sending disk event to OS/400 (rc %d)\n",
- (int)hvrc);
- goto error_ret;
- }
- spin_unlock_irqrestore(&viodasd_spinlock, flags);
- return 0;
-
-error_ret:
- num_req_outstanding--;
- spin_unlock_irqrestore(&viodasd_spinlock, flags);
- dma_unmap_sg(d->dev, sg, nsg, direction);
- return -1;
-}
-
-/*
- * This is the external request processing routine
- */
-static void do_viodasd_request(struct request_queue *q)
-{
- struct request *req;
-
- /*
- * If we already have the maximum number of requests
- * outstanding to OS/400 just bail out. We'll come
- * back later.
- */
- while (num_req_outstanding < VIOMAXREQ) {
- req = blk_fetch_request(q);
- if (req == NULL)
- return;
- /* check that request contains a valid command */
- if (req->cmd_type != REQ_TYPE_FS) {
- viodasd_end_request(req, -EIO, blk_rq_sectors(req));
- continue;
- }
- /* Try sending the request */
- if (send_request(req) != 0)
- viodasd_end_request(req, -EIO, blk_rq_sectors(req));
- }
-}
-
-/*
- * Probe a single disk and fill in the viodasd_device structure
- * for it.
- */
-static int probe_disk(struct viodasd_device *d)
-{
- HvLpEvent_Rc hvrc;
- struct viodasd_waitevent we;
- int dev_no = DEVICE_NO(d);
- struct gendisk *g;
- struct request_queue *q;
- u16 flags = 0;
-
-retry:
- init_completion(&we.com);
-
- /* Send the open event to OS/400 */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockopen,
- HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- (u64)(unsigned long)&we, VIOVERSION << 16,
- ((u64)dev_no << 48) | ((u64)flags<< 32),
- 0, 0, 0);
- if (hvrc != 0) {
- pr_warning("bad rc on HV open %d\n", (int)hvrc);
- return 0;
- }
-
- wait_for_completion(&we.com);
-
- if (we.rc != 0) {
- if (flags != 0)
- return 0;
- /* try again with read only flag set */
- flags = vioblockflags_ro;
- goto retry;
- }
- if (we.max_disk > (MAX_DISKNO - 1)) {
- printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"),
- MAX_DISKNO, we.max_disk + 1);
- }
-
- /* Send the close event to OS/400. We DON'T expect a response */
- hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
- HvLpEvent_Type_VirtualIo,
- viomajorsubtype_blockio | vioblockclose,
- HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
- viopath_sourceinst(viopath_hostLp),
- viopath_targetinst(viopath_hostLp),
- 0, VIOVERSION << 16,
- ((u64)dev_no << 48) | ((u64)flags << 32),
- 0, 0, 0);
- if (hvrc != 0) {
- pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc);
- return 0;
- }
-
- if (d->dev == NULL) {
- /* this is when we reprobe for new disks */
- if (vio_create_viodasd(dev_no) == NULL) {
- pr_warning("cannot allocate virtual device for disk %d\n",
- dev_no);
- return 0;
- }
- /*
- * The vio_create_viodasd will have recursed into this
- * routine with d->dev set to the new vio device and
- * will finish the setup of the disk below.
- */
- return 1;
- }
-
- /* create the request queue for the disk */
- spin_lock_init(&d->q_lock);
- q = blk_init_queue(do_viodasd_request, &d->q_lock);
- if (q == NULL) {
- pr_warning("cannot allocate queue for disk %d\n", dev_no);
- return 0;
- }
- g = alloc_disk(1 << PARTITION_SHIFT);
- if (g == NULL) {
- pr_warning("cannot allocate disk structure for disk %d\n",
- dev_no);
- blk_cleanup_queue(q);
- return 0;
- }
-
- d->disk = g;
- blk_queue_max_segments(q, VIOMAXBLOCKDMA);
- blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS);
- g->major = VIODASD_MAJOR;
- g->first_minor = dev_no << PARTITION_SHIFT;
- if (dev_no >= 26)
- snprintf(g->disk_name, sizeof(g->disk_name),
- VIOD_GENHD_NAME "%c%c",
- 'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26));
- else
- snprintf(g->disk_name, sizeof(g->disk_name),
- VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26));
- g->fops = &viodasd_fops;
- g->queue = q;
- g->private_data = d;
- g->driverfs_dev = d->dev;
- set_capacity(g, d->size >> 9);
-
- pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n",
- dev_no, (unsigned long)(d->size >> 9),
- (unsigned long)(d->size >> 20),
- (int)d->cylinders, (int)d->tracks,
- (int)d->sectors, (int)d->bytes_per_sector,
- d->read_only ? " (RO)" : "");
-
- /* register us in the global list */
- add_disk(g);
- return 1;
-}
-
-/* returns the total number of scatterlist elements converted */
-static int block_event_to_scatterlist(const struct vioblocklpevent *bevent,
- struct scatterlist *sg, int *total_len)
-{
- int i, numsg;
- const struct rw_data *rw_data = &bevent->u.rw_data;
- static const int offset =
- offsetof(struct vioblocklpevent, u.rw_data.dma_info);
- static const int element_size = sizeof(rw_data->dma_info[0]);
-
- numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size;
- if (numsg > VIOMAXBLOCKDMA)
- numsg = VIOMAXBLOCKDMA;
-
- *total_len = 0;
- sg_init_table(sg, VIOMAXBLOCKDMA);
- for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) {
- sg_dma_address(&sg[i]) = rw_data->dma_info[i].token;
- sg_dma_len(&sg[i]) = rw_data->dma_info[i].len;
- *total_len += rw_data->dma_info[i].len;
- }
- return i;
-}
-
-/*
- * Restart all queues, starting with the one _after_ the disk given,
- * thus reducing the chance of starvation of higher numbered disks.
- */
-static void viodasd_restart_all_queues_starting_from(int first_index)
-{
- int i;
-
- for (i = first_index + 1; i < MAX_DISKNO; ++i)
- if (viodasd_devices[i].disk)
- blk_run_queue(viodasd_devices[i].disk->queue);
- for (i = 0; i <= first_index; ++i)
- if (viodasd_devices[i].disk)
- blk_run_queue(viodasd_devices[i].disk->queue);
-}
-
-/*
- * For read and write requests, decrement the number of outstanding requests,
- * Free the DMA buffers we allocated.
- */
-static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
-{
- int num_sg, num_sect, pci_direction, total_len;
- struct request *req;
- struct scatterlist sg[VIOMAXBLOCKDMA];
- struct HvLpEvent *event = &bevent->event;
- unsigned long irq_flags;
- struct viodasd_device *d;
- int error;
- spinlock_t *qlock;
-
- num_sg = block_event_to_scatterlist(bevent, sg, &total_len);
- num_sect = total_len >> 9;
- if (event->xSubtype == (viomajorsubtype_blockio | vioblockread))
- pci_direction = DMA_FROM_DEVICE;
- else
- pci_direction = DMA_TO_DEVICE;
- req = (struct request *)bevent->event.xCorrelationToken;
- d = req->rq_disk->private_data;
-
- dma_unmap_sg(d->dev, sg, num_sg, pci_direction);
-
- /*
- * Since this is running in interrupt mode, we need to make sure
- * we're not stepping on any global I/O operations
- */
- spin_lock_irqsave(&viodasd_spinlock, irq_flags);
- num_req_outstanding--;
- spin_unlock_irqrestore(&viodasd_spinlock, irq_flags);
-
- error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO;
- if (error) {
- const struct vio_error_entry *err;
- err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
- pr_warning("read/write error %d:0x%04x (%s)\n",
- event->xRc, bevent->sub_result, err->msg);
- num_sect = blk_rq_sectors(req);
- }
- qlock = req->q->queue_lock;
- spin_lock_irqsave(qlock, irq_flags);
- viodasd_end_request(req, error, num_sect);
- spin_unlock_irqrestore(qlock, irq_flags);
-
- /* Finally, try to get more requests off of this device's queue */
- viodasd_restart_all_queues_starting_from(DEVICE_NO(d));
-
- return 0;
-}
-
-/* This routine handles incoming block LP events */
-static void handle_block_event(struct HvLpEvent *event)
-{
- struct vioblocklpevent *bevent = (struct vioblocklpevent *)event;
- struct viodasd_waitevent *pwe;
-
- if (event == NULL)
- /* Notification that a partition went away! */
- return;
- /* First, we should NEVER get an int here...only acks */
- if (hvlpevent_is_int(event)) {
- pr_warning("Yikes! got an int in viodasd event handler!\n");
- if (hvlpevent_need_ack(event)) {
- event->xRc = HvLpEvent_Rc_InvalidSubtype;
- HvCallEvent_ackLpEvent(event);
- }
- }
-
- switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) {
- case vioblockopen:
- /*
- * Handle a response to an open request. We get all the
- * disk information in the response, so update it. The
- * correlation token contains a pointer to a waitevent
- * structure that has a completion in it. update the
- * return code in the waitevent structure and post the
- * completion to wake up the guy who sent the request
- */
- pwe = (struct viodasd_waitevent *)event->xCorrelationToken;
- pwe->rc = event->xRc;
- pwe->sub_result = bevent->sub_result;
- if (event->xRc == HvLpEvent_Rc_Good) {
- const struct open_data *data = &bevent->u.open_data;
- struct viodasd_device *device =
- &viodasd_devices[bevent->disk];
- device->read_only =
- bevent->flags & vioblockflags_ro;
- device->size = data->disk_size;
- device->cylinders = data->cylinders;
- device->tracks = data->tracks;
- device->sectors = data->sectors;
- device->bytes_per_sector = data->bytes_per_sector;
- pwe->max_disk = data->max_disk;
- }
- complete(&pwe->com);
- break;
- case vioblockclose:
- break;
- case vioblockread:
- case vioblockwrite:
- viodasd_handle_read_write(bevent);
- break;
-
- default:
- pr_warning("invalid subtype!");
- if (hvlpevent_need_ack(event)) {
- event->xRc = HvLpEvent_Rc_InvalidSubtype;
- HvCallEvent_ackLpEvent(event);
- }
- }
-}
-
-/*
- * Get the driver to reprobe for more disks.
- */
-static ssize_t probe_disks(struct device_driver *drv, const char *buf,
- size_t count)
-{
- struct viodasd_device *d;
-
- for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) {
- if (d->disk == NULL)
- probe_disk(d);
- }
- return count;
-}
-static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks);
-
-static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
-{
- struct viodasd_device *d = &viodasd_devices[vdev->unit_address];
-
- d->dev = &vdev->dev;
- if (!probe_disk(d))
- return -ENODEV;
- return 0;
-}
-
-static int viodasd_remove(struct vio_dev *vdev)
-{
- struct viodasd_device *d;
-
- d = &viodasd_devices[vdev->unit_address];
- if (d->disk) {
- del_gendisk(d->disk);
- blk_cleanup_queue(d->disk->queue);
- put_disk(d->disk);
- d->disk = NULL;
- }
- d->dev = NULL;
- return 0;
-}
-
-/**
- * viodasd_device_table: Used by vio.c to match devices that we
- * support.
- */
-static struct vio_device_id viodasd_device_table[] __devinitdata = {
- { "block", "IBM,iSeries-viodasd" },
- { "", "" }
-};
-MODULE_DEVICE_TABLE(vio, viodasd_device_table);
-
-static struct vio_driver viodasd_driver = {
- .id_table = viodasd_device_table,
- .probe = viodasd_probe,
- .remove = viodasd_remove,
- .driver = {
- .name = "viodasd",
- .owner = THIS_MODULE,
- }
-};
-
-static int need_delete_probe;
-
-/*
- * Initialize the whole device driver. Handle module and non-module
- * versions
- */
-static int __init viodasd_init(void)
-{
- int rc;
-
- if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
- rc = -ENODEV;
- goto early_fail;
- }
-
- /* Try to open to our host lp */
- if (viopath_hostLp == HvLpIndexInvalid)
- vio_set_hostlp();
-
- if (viopath_hostLp == HvLpIndexInvalid) {
- pr_warning("invalid hosting partition\n");
- rc = -EIO;
- goto early_fail;
- }
-
- pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp);
-
- /* register the block device */
- rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
- if (rc) {
- pr_warning("Unable to get major number %d for %s\n",
- VIODASD_MAJOR, VIOD_GENHD_NAME);
- goto early_fail;
- }
- /* Actually open the path to the hosting partition */
- rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio,
- VIOMAXREQ + 2);
- if (rc) {
- pr_warning("error opening path to host partition %d\n",
- viopath_hostLp);
- goto unregister_blk;
- }
-
- /* Initialize our request handler */
- vio_setHandler(viomajorsubtype_blockio, handle_block_event);
-
- rc = vio_register_driver(&viodasd_driver);
- if (rc) {
- pr_warning("vio_register_driver failed\n");
- goto unset_handler;
- }
-
- /*
- * If this call fails, it just means that we cannot dynamically
- * add virtual disks, but the driver will still work fine for
- * all existing disk, so ignore the failure.
- */
- if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe))
- need_delete_probe = 1;
-
- return 0;
-
-unset_handler:
- vio_clearHandler(viomajorsubtype_blockio);
- viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-unregister_blk:
- unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-early_fail:
- return rc;
-}
-module_init(viodasd_init);
-
-void __exit viodasd_exit(void)
-{
- if (need_delete_probe)
- driver_remove_file(&viodasd_driver.driver, &driver_attr_probe);
- vio_unregister_driver(&viodasd_driver);
- vio_clearHandler(viomajorsubtype_blockio);
- viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
- unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-}
-module_exit(viodasd_exit);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4d0b70adf5f7..0e4ef3de9d5d 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -4,6 +4,7 @@
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
@@ -36,6 +37,12 @@ struct virtio_blk
/* Process context for config space updates */
struct work_struct config_work;
+ /* Lock for config space updates */
+ struct mutex config_lock;
+
+ /* enable config space updates */
+ bool config_enable;
+
/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
@@ -172,7 +179,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
+ if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
mempool_free(vbr, vblk->pool);
return false;
}
@@ -243,8 +250,8 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
return -ENOTTY;
- return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
- (void __user *)data);
+ return scsi_cmd_blk_ioctl(bdev, mode, cmd,
+ (void __user *)data);
}
/* We provide getgeo only to please some old bootloader/partitioning tools */
@@ -318,6 +325,10 @@ static void virtblk_config_changed_work(struct work_struct *work)
char cap_str_2[10], cap_str_10[10];
u64 capacity, size;
+ mutex_lock(&vblk->config_lock);
+ if (!vblk->config_enable)
+ goto done;
+
/* Host must always specify the capacity. */
vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
&capacity, sizeof(capacity));
@@ -340,6 +351,9 @@ static void virtblk_config_changed_work(struct work_struct *work)
cap_str_10, cap_str_2);
set_capacity(vblk->disk, capacity);
+ revalidate_disk(vblk->disk);
+done:
+ mutex_unlock(&vblk->config_lock);
}
static void virtblk_config_changed(struct virtio_device *vdev)
@@ -349,6 +363,18 @@ static void virtblk_config_changed(struct virtio_device *vdev)
queue_work(virtblk_wq, &vblk->config_work);
}
+static int init_vq(struct virtio_blk *vblk)
+{
+ int err = 0;
+
+ /* We expect one virtqueue, for output. */
+ vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
+ if (IS_ERR(vblk->vq))
+ err = PTR_ERR(vblk->vq);
+
+ return err;
+}
+
static int __devinit virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
@@ -388,14 +414,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
vblk->vdev = vdev;
vblk->sg_elems = sg_elems;
sg_init_table(vblk->sg, vblk->sg_elems);
+ mutex_init(&vblk->config_lock);
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+ vblk->config_enable = true;
- /* We expect one virtqueue, for output. */
- vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
- if (IS_ERR(vblk->vq)) {
- err = PTR_ERR(vblk->vq);
+ err = init_vq(vblk);
+ if (err)
goto out_free_vblk;
- }
vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
if (!vblk->pool) {
@@ -542,7 +567,10 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
struct virtio_blk *vblk = vdev->priv;
int index = vblk->index;
- flush_work(&vblk->config_work);
+ /* Prevent config work handler from accessing the device. */
+ mutex_lock(&vblk->config_lock);
+ vblk->config_enable = false;
+ mutex_unlock(&vblk->config_lock);
/* Nothing should be pending. */
BUG_ON(!list_empty(&vblk->reqs));
@@ -550,6 +578,8 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
+ flush_work(&vblk->config_work);
+
del_gendisk(vblk->disk);
blk_cleanup_queue(vblk->disk->queue);
put_disk(vblk->disk);
@@ -559,6 +589,46 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
ida_simple_remove(&vd_index_ida, index);
}
+#ifdef CONFIG_PM
+static int virtblk_freeze(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+
+ /* Ensure we don't receive any more interrupts */
+ vdev->config->reset(vdev);
+
+ /* Prevent config work handler from accessing the device. */
+ mutex_lock(&vblk->config_lock);
+ vblk->config_enable = false;
+ mutex_unlock(&vblk->config_lock);
+
+ flush_work(&vblk->config_work);
+
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+ blk_stop_queue(vblk->disk->queue);
+ spin_unlock_irq(vblk->disk->queue->queue_lock);
+ blk_sync_queue(vblk->disk->queue);
+
+ vdev->config->del_vqs(vdev);
+ return 0;
+}
+
+static int virtblk_restore(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+ int ret;
+
+ vblk->config_enable = true;
+ ret = init_vq(vdev->priv);
+ if (!ret) {
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+ blk_start_queue(vblk->disk->queue);
+ spin_unlock_irq(vblk->disk->queue->queue_lock);
+ }
+ return ret;
+}
+#endif
+
static const struct virtio_device_id id_table[] = {
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
{ 0 },
@@ -584,6 +654,10 @@ static struct virtio_driver __refdata virtio_blk = {
.probe = virtblk_probe,
.remove = __devexit_p(virtblk_remove),
.config_changed = virtblk_config_changed,
+#ifdef CONFIG_PM
+ .freeze = virtblk_freeze,
+ .restore = virtblk_restore,
+#endif
};
static int __init init(void)
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 4abd2bcd20fb..ff540520bada 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -52,7 +52,6 @@
#include <linux/io.h>
#include <linux/gfp.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/dma.h>
@@ -148,7 +147,7 @@ static volatile int xdc_busy;
static struct timer_list xd_watchdog_int;
static volatile u_char xd_error;
-static int nodma = XD_DONT_USE_DMA;
+static bool nodma = XD_DONT_USE_DMA;
static struct request_queue *xd_queue;
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 15ec4db194d1..73f196ca713f 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -39,9 +39,6 @@
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
-#include <linux/loop.h>
-#include <linux/falloc.h>
-#include <linux/fs.h>
#include <xen/events.h>
#include <xen/page.h>
@@ -324,6 +321,7 @@ struct seg_buf {
static void xen_blkbk_unmap(struct pending_req *req)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
grant_handle_t handle;
int ret;
@@ -335,25 +333,12 @@ static void xen_blkbk_unmap(struct pending_req *req)
gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
GNTMAP_host_map, handle);
pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+ pages[invcount] = virt_to_page(vaddr(req, i));
invcount++;
}
- ret = HYPERVISOR_grant_table_op(
- GNTTABOP_unmap_grant_ref, unmap, invcount);
+ ret = gnttab_unmap_refs(unmap, pages, invcount, false);
BUG_ON(ret);
- /*
- * Note, we use invcount, so nr->pages, so we can't index
- * using vaddr(req, i).
- */
- for (i = 0; i < invcount; i++) {
- ret = m2p_remove_override(
- virt_to_page(unmap[i].host_addr), false);
- if (ret) {
- pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
- (unsigned long)unmap[i].host_addr);
- continue;
- }
- }
}
static int xen_blkbk_map(struct blkif_request *req,
@@ -362,7 +347,7 @@ static int xen_blkbk_map(struct blkif_request *req,
{
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
int i;
- int nseg = req->nr_segments;
+ int nseg = req->u.rw.nr_segments;
int ret = 0;
/*
@@ -381,7 +366,7 @@ static int xen_blkbk_map(struct blkif_request *req,
pending_req->blkif->domid);
}
- ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+ ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg);
BUG_ON(ret);
/*
@@ -401,47 +386,30 @@ static int xen_blkbk_map(struct blkif_request *req,
if (ret)
continue;
- ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
- blkbk->pending_page(pending_req, i), NULL);
- if (ret) {
- pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
- (unsigned long)map[i].dev_bus_addr, ret);
- /* We could switch over to GNTTABOP_copy */
- continue;
- }
-
seg[i].buf = map[i].dev_bus_addr |
(req->u.rw.seg[i].first_sect << 9);
}
return ret;
}
-static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req)
+static int dispatch_discard_io(struct xen_blkif *blkif,
+ struct blkif_request *req)
{
int err = 0;
int status = BLKIF_RSP_OKAY;
struct block_device *bdev = blkif->vbd.bdev;
+ unsigned long secure;
+
+ blkif->st_ds_req++;
- if (blkif->blk_backend_type == BLKIF_BACKEND_PHY)
- /* just forward the discard request */
- err = blkdev_issue_discard(bdev,
- req->u.discard.sector_number,
- req->u.discard.nr_sectors,
- GFP_KERNEL, 0);
- else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) {
- /* punch a hole in the backing file */
- struct loop_device *lo = bdev->bd_disk->private_data;
- struct file *file = lo->lo_backing_file;
-
- if (file->f_op->fallocate)
- err = file->f_op->fallocate(file,
- FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
- req->u.discard.sector_number << 9,
- req->u.discard.nr_sectors << 9);
- else
- err = -EOPNOTSUPP;
- } else
- err = -EOPNOTSUPP;
+ xen_blkif_get(blkif);
+ secure = (blkif->vbd.discard_secure &&
+ (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+ BLKDEV_DISCARD_SECURE : 0;
+
+ err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+ req->u.discard.nr_sectors,
+ GFP_KERNEL, secure);
if (err == -EOPNOTSUPP) {
pr_debug(DRV_PFX "discard op failed, not supported\n");
@@ -449,7 +417,9 @@ static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req)
} else if (err)
status = BLKIF_RSP_ERROR;
- make_response(blkif, req->id, req->operation, status);
+ make_response(blkif, req->u.discard.id, req->operation, status);
+ xen_blkif_put(blkif);
+ return err;
}
static void xen_blk_drain_io(struct xen_blkif *blkif)
@@ -573,8 +543,11 @@ __do_block_io_op(struct xen_blkif *blkif)
/* Apply all sanity checks to /private copy/ of request. */
barrier();
-
- if (dispatch_rw_block_io(blkif, &req, pending_req))
+ if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
+ free_req(pending_req);
+ if (dispatch_discard_io(blkif, &req))
+ break;
+ } else if (dispatch_rw_block_io(blkif, &req, pending_req))
break;
/* Yield point for this unbounded loop. */
@@ -633,10 +606,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
blkif->st_f_req++;
operation = WRITE_FLUSH;
break;
- case BLKIF_OP_DISCARD:
- blkif->st_ds_req++;
- operation = REQ_DISCARD;
- break;
default:
operation = 0; /* make gcc happy */
goto fail_response;
@@ -644,9 +613,9 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
}
/* Check that the number of segments is sane. */
- nseg = req->nr_segments;
- if (unlikely(nseg == 0 && operation != WRITE_FLUSH &&
- operation != REQ_DISCARD) ||
+ nseg = req->u.rw.nr_segments;
+
+ if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
nseg);
@@ -654,12 +623,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
goto fail_response;
}
- preq.dev = req->handle;
+ preq.dev = req->u.rw.handle;
preq.sector_number = req->u.rw.sector_number;
preq.nr_sects = 0;
pending_req->blkif = blkif;
- pending_req->id = req->id;
+ pending_req->id = req->u.rw.id;
pending_req->operation = req->operation;
pending_req->status = BLKIF_RSP_OKAY;
pending_req->nr_pages = nseg;
@@ -707,7 +676,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* the hypercall to unmap the grants - that is all done in
* xen_blkbk_unmap.
*/
- if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg))
+ if (xen_blkbk_map(req, pending_req, seg))
goto fail_flush;
/*
@@ -739,23 +708,16 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
/* This will be hit if the operation was a flush or discard. */
if (!bio) {
- BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD);
+ BUG_ON(operation != WRITE_FLUSH);
- if (operation == WRITE_FLUSH) {
- bio = bio_alloc(GFP_KERNEL, 0);
- if (unlikely(bio == NULL))
- goto fail_put_bio;
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
- biolist[nbio++] = bio;
- bio->bi_bdev = preq.bdev;
- bio->bi_private = pending_req;
- bio->bi_end_io = end_block_io_op;
- } else if (operation == REQ_DISCARD) {
- xen_blk_discard(blkif, req);
- xen_blkif_put(blkif);
- free_req(pending_req);
- return 0;
- }
+ biolist[nbio++] = bio;
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
}
/*
@@ -784,7 +746,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
xen_blkbk_unmap(pending_req);
fail_response:
/* Haven't submitted any bio's yet. */
- make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
free_req(pending_req);
msleep(1); /* back off a bit */
return -EIO;
@@ -844,7 +806,7 @@ static int __init xen_blkif_init(void)
int i, mmap_pages;
int rc = 0;
- if (!xen_pv_domain())
+ if (!xen_domain())
return -ENODEV;
blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index dfb1b3a43a5d..773cf27dc23f 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -60,58 +60,66 @@ struct blkif_common_response {
char dummy;
};
-/* i386 protocol version */
-#pragma pack(push, 4)
-
struct blkif_x86_32_request_rw {
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
struct blkif_x86_32_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+ blkif_vdev_t _pad1; /* was "handle" for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
- uint64_t nr_sectors;
-};
+ uint64_t nr_sectors;
+} __attribute__((__packed__));
struct blkif_x86_32_request {
uint8_t operation; /* BLKIF_OP_??? */
- uint8_t nr_segments; /* number of segments */
- blkif_vdev_t handle; /* only for read/write requests */
- uint64_t id; /* private guest value, echoed in resp */
union {
struct blkif_x86_32_request_rw rw;
struct blkif_x86_32_request_discard discard;
} u;
-};
+} __attribute__((__packed__));
+
+/* i386 protocol version */
+#pragma pack(push, 4)
struct blkif_x86_32_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
int16_t status; /* BLKIF_RSP_??? */
};
#pragma pack(pop)
-
/* x86_64 protocol version */
struct blkif_x86_64_request_rw {
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */
+ uint64_t id;
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-};
+} __attribute__((__packed__));
struct blkif_x86_64_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+ blkif_vdev_t _pad1; /* was "handle" for read/write requests */
+ uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */
+ uint64_t id;
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
- uint64_t nr_sectors;
-};
+ uint64_t nr_sectors;
+} __attribute__((__packed__));
struct blkif_x86_64_request {
uint8_t operation; /* BLKIF_OP_??? */
- uint8_t nr_segments; /* number of segments */
- blkif_vdev_t handle; /* only for read/write requests */
- uint64_t __attribute__((__aligned__(8))) id;
union {
struct blkif_x86_64_request_rw rw;
struct blkif_x86_64_request_discard discard;
} u;
-};
+} __attribute__((__packed__));
+
struct blkif_x86_64_response {
uint64_t __attribute__((__aligned__(8))) id;
uint8_t operation; /* copied from request */
@@ -138,11 +146,6 @@ enum blkif_protocol {
BLKIF_PROTOCOL_X86_64 = 3,
};
-enum blkif_backend_type {
- BLKIF_BACKEND_PHY = 1,
- BLKIF_BACKEND_FILE = 2,
-};
-
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
@@ -156,6 +159,7 @@ struct xen_vbd {
/* Cached size parameter. */
sector_t size;
bool flush_support;
+ bool discard_secure;
};
struct backend_info;
@@ -168,7 +172,6 @@ struct xen_blkif {
unsigned int irq;
/* Comms information. */
enum blkif_protocol blk_protocol;
- enum blkif_backend_type blk_backend_type;
union blkif_back_rings blk_rings;
void *blk_ring;
/* The VBD attached to this interface. */
@@ -237,22 +240,23 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
dst->operation = src->operation;
- dst->nr_segments = src->nr_segments;
- dst->handle = src->handle;
- dst->id = src->id;
switch (src->operation) {
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE:
+ dst->u.rw.nr_segments = src->u.rw.nr_segments;
+ dst->u.rw.handle = src->u.rw.handle;
+ dst->u.rw.id = src->u.rw.id;
dst->u.rw.sector_number = src->u.rw.sector_number;
barrier();
- if (n > dst->nr_segments)
- n = dst->nr_segments;
+ if (n > dst->u.rw.nr_segments)
+ n = dst->u.rw.nr_segments;
for (i = 0; i < n; i++)
dst->u.rw.seg[i] = src->u.rw.seg[i];
break;
case BLKIF_OP_DISCARD:
+ dst->u.discard.flag = src->u.discard.flag;
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
@@ -266,22 +270,23 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
dst->operation = src->operation;
- dst->nr_segments = src->nr_segments;
- dst->handle = src->handle;
- dst->id = src->id;
switch (src->operation) {
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE:
+ dst->u.rw.nr_segments = src->u.rw.nr_segments;
+ dst->u.rw.handle = src->u.rw.handle;
+ dst->u.rw.id = src->u.rw.id;
dst->u.rw.sector_number = src->u.rw.sector_number;
barrier();
- if (n > dst->nr_segments)
- n = dst->nr_segments;
+ if (n > dst->u.rw.nr_segments)
+ n = dst->u.rw.nr_segments;
for (i = 0; i < n; i++)
dst->u.rw.seg[i] = src->u.rw.seg[i];
break;
case BLKIF_OP_DISCARD:
+ dst->u.discard.flag = src->u.discard.flag;
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 37c794d31264..89860f34a7ec 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -338,6 +338,9 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
if (q && q->flush_flags)
vbd->flush_support = true;
+ if (q && blk_queue_secdiscard(q))
+ vbd->discard_secure = true;
+
DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
handle, blkif->domid);
return 0;
@@ -378,63 +381,49 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
"%d", state);
if (err)
- xenbus_dev_fatal(dev, err, "writing feature-flush-cache");
+ dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err);
return err;
}
-int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
{
struct xenbus_device *dev = be->dev;
struct xen_blkif *blkif = be->blkif;
- char *type;
int err;
int state = 0;
+ struct block_device *bdev = be->blkif->vbd.bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
- type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL);
- if (!IS_ERR(type)) {
- if (strncmp(type, "file", 4) == 0) {
- state = 1;
- blkif->blk_backend_type = BLKIF_BACKEND_FILE;
+ if (blk_queue_discard(q)) {
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-granularity", "%u",
+ q->limits.discard_granularity);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+ return;
}
- if (strncmp(type, "phy", 3) == 0) {
- struct block_device *bdev = be->blkif->vbd.bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- if (blk_queue_discard(q)) {
- err = xenbus_printf(xbt, dev->nodename,
- "discard-granularity", "%u",
- q->limits.discard_granularity);
- if (err) {
- xenbus_dev_fatal(dev, err,
- "writing discard-granularity");
- goto kfree;
- }
- err = xenbus_printf(xbt, dev->nodename,
- "discard-alignment", "%u",
- q->limits.discard_alignment);
- if (err) {
- xenbus_dev_fatal(dev, err,
- "writing discard-alignment");
- goto kfree;
- }
- state = 1;
- blkif->blk_backend_type = BLKIF_BACKEND_PHY;
- }
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-alignment", "%u",
+ q->limits.discard_alignment);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+ return;
+ }
+ state = 1;
+ /* Optional. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-secure", "%d",
+ blkif->vbd.discard_secure);
+ if (err) {
+ dev_warn(dev-dev, "writing discard-secure (%d)", err);
+ return;
}
- } else {
- err = PTR_ERR(type);
- xenbus_dev_fatal(dev, err, "reading type");
- goto out;
}
-
err = xenbus_printf(xbt, dev->nodename, "feature-discard",
"%d", state);
if (err)
- xenbus_dev_fatal(dev, err, "writing feature-discard");
-kfree:
- kfree(type);
-out:
- return err;
+ dev_warn(&dev->dev, "writing feature-discard (%d)", err);
}
int xen_blkbk_barrier(struct xenbus_transaction xbt,
struct backend_info *be, int state)
@@ -445,7 +434,7 @@ int xen_blkbk_barrier(struct xenbus_transaction xbt,
err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
"%d", state);
if (err)
- xenbus_dev_fatal(dev, err, "writing feature-barrier");
+ dev_warn(&dev->dev, "writing feature-barrier (%d)", err);
return err;
}
@@ -677,14 +666,12 @@ again:
return;
}
- err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
- if (err)
- goto abort;
+ /* If we can't advertise it is OK. */
+ xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
- err = xen_blkbk_discard(xbt, be);
+ xen_blkbk_discard(xbt, be);
- /* If we can't advertise it is OK. */
- err = xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
+ xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(&be->blkif->vbd));
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 19b6005a323e..4e86393a09cf 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -43,6 +43,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
+#include <linux/bitmap.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
@@ -81,6 +82,7 @@ static const struct block_device_operations xlvbd_block_fops;
*/
struct blkfront_info
{
+ spinlock_t io_lock;
struct mutex mutex;
struct xenbus_device *xbdev;
struct gendisk *gd;
@@ -98,14 +100,13 @@ struct blkfront_info
unsigned long shadow_free;
unsigned int feature_flush;
unsigned int flush_op;
- unsigned int feature_discard;
+ unsigned int feature_discard:1;
+ unsigned int feature_secdiscard:1;
unsigned int discard_granularity;
unsigned int discard_alignment;
int is_ready;
};
-static DEFINE_SPINLOCK(blkif_io_lock);
-
static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);
@@ -135,15 +136,15 @@ static int get_id_from_freelist(struct blkfront_info *info)
{
unsigned long free = info->shadow_free;
BUG_ON(free >= BLK_RING_SIZE);
- info->shadow_free = info->shadow[free].req.id;
- info->shadow[free].req.id = 0x0fffffee; /* debug */
+ info->shadow_free = info->shadow[free].req.u.rw.id;
+ info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
return free;
}
static void add_id_to_freelist(struct blkfront_info *info,
unsigned long id)
{
- info->shadow[id].req.id = info->shadow_free;
+ info->shadow[id].req.u.rw.id = info->shadow_free;
info->shadow[id].request = NULL;
info->shadow_free = id;
}
@@ -156,7 +157,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
if (end > nr_minors) {
unsigned long *bitmap, *old;
- bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
+ bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
GFP_KERNEL);
if (bitmap == NULL)
return -ENOMEM;
@@ -176,8 +177,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
spin_lock(&minor_lock);
if (find_next_bit(minors, end, minor) >= end) {
- for (; minor < end; ++minor)
- __set_bit(minor, minors);
+ bitmap_set(minors, minor, nr);
rc = 0;
} else
rc = -EBUSY;
@@ -192,8 +192,7 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
BUG_ON(end > nr_minors);
spin_lock(&minor_lock);
- for (; minor < end; ++minor)
- __clear_bit(minor, minors);
+ bitmap_clear(minors, minor, nr);
spin_unlock(&minor_lock);
}
@@ -287,9 +286,9 @@ static int blkif_queue_request(struct request *req)
id = get_id_from_freelist(info);
info->shadow[id].request = req;
- ring_req->id = id;
+ ring_req->u.rw.id = id;
ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
- ring_req->handle = info->handle;
+ ring_req->u.rw.handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
@@ -305,16 +304,21 @@ static int blkif_queue_request(struct request *req)
ring_req->operation = info->flush_op;
}
- if (unlikely(req->cmd_flags & REQ_DISCARD)) {
+ if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
/* id, sector_number and handle are set above. */
ring_req->operation = BLKIF_OP_DISCARD;
- ring_req->nr_segments = 0;
ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+ if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+ ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+ else
+ ring_req->u.discard.flag = 0;
} else {
- ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
- BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
+ info->sg);
+ BUG_ON(ring_req->u.rw.nr_segments >
+ BLKIF_MAX_SEGMENTS_PER_REQUEST);
- for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
+ for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
fsect = sg->offset >> 9;
lsect = fsect + (sg->length >> 9) - 1;
@@ -413,7 +417,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
struct request_queue *rq;
struct blkfront_info *info = gd->private_data;
- rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+ rq = blk_init_queue(do_blkif_request, &info->io_lock);
if (rq == NULL)
return -1;
@@ -424,6 +428,8 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
blk_queue_max_discard_sectors(rq, get_capacity(gd));
rq->limits.discard_granularity = info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment;
+ if (info->feature_secdiscard)
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
}
/* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -628,14 +634,14 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
if (info->rq == NULL)
return;
- spin_lock_irqsave(&blkif_io_lock, flags);
+ spin_lock_irqsave(&info->io_lock, flags);
/* No more blkif_request(). */
blk_stop_queue(info->rq);
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+ spin_unlock_irqrestore(&info->io_lock, flags);
/* Flush gnttab callback work. Must be done with no locks held. */
flush_work_sync(&info->work);
@@ -667,16 +673,16 @@ static void blkif_restart_queue(struct work_struct *work)
{
struct blkfront_info *info = container_of(work, struct blkfront_info, work);
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
if (info->connected == BLKIF_STATE_CONNECTED)
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
}
static void blkif_free(struct blkfront_info *info, int suspend)
{
/* Prevent new requests being issued until we fix things up. */
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
@@ -684,7 +690,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
blk_stop_queue(info->rq);
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
/* Flush gnttab callback work. Must be done with no locks held. */
flush_work_sync(&info->work);
@@ -705,7 +711,9 @@ static void blkif_free(struct blkfront_info *info, int suspend)
static void blkif_completion(struct blk_shadow *s)
{
int i;
- for (i = 0; i < s->req.nr_segments; i++)
+ /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
+ * flag. */
+ for (i = 0; i < s->req.u.rw.nr_segments; i++)
gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
}
@@ -718,10 +726,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
struct blkfront_info *info = (struct blkfront_info *)dev_id;
int error;
- spin_lock_irqsave(&blkif_io_lock, flags);
+ spin_lock_irqsave(&info->io_lock, flags);
if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+ spin_unlock_irqrestore(&info->io_lock, flags);
return IRQ_HANDLED;
}
@@ -736,7 +744,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
id = bret->id;
req = info->shadow[id].request;
- blkif_completion(&info->shadow[id]);
+ if (bret->operation != BLKIF_OP_DISCARD)
+ blkif_completion(&info->shadow[id]);
add_id_to_freelist(info, id);
@@ -749,7 +758,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
info->gd->disk_name);
error = -EOPNOTSUPP;
info->feature_discard = 0;
+ info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+ queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
}
__blk_end_request_all(req, error);
break;
@@ -763,7 +774,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
error = -EOPNOTSUPP;
}
if (unlikely(bret->status == BLKIF_RSP_ERROR &&
- info->shadow[id].req.nr_segments == 0)) {
+ info->shadow[id].req.u.rw.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
"barrier" : "flush disk cache",
@@ -803,7 +814,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
kick_pending_request_queues(info);
- spin_unlock_irqrestore(&blkif_io_lock, flags);
+ spin_unlock_irqrestore(&info->io_lock, flags);
return IRQ_HANDLED;
}
@@ -978,14 +989,15 @@ static int blkfront_probe(struct xenbus_device *dev,
}
mutex_init(&info->mutex);
+ spin_lock_init(&info->io_lock);
info->xbdev = dev;
info->vdevice = vdevice;
info->connected = BLKIF_STATE_DISCONNECTED;
INIT_WORK(&info->work, blkif_restart_queue);
for (i = 0; i < BLK_RING_SIZE; i++)
- info->shadow[i].req.id = i+1;
- info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+ info->shadow[i].req.u.rw.id = i+1;
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
/* Front end dir is a number, which is used as the id. */
info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1019,9 +1031,9 @@ static int blkif_recover(struct blkfront_info *info)
/* Stage 2: Set up free list. */
memset(&info->shadow, 0, sizeof(info->shadow));
for (i = 0; i < BLK_RING_SIZE; i++)
- info->shadow[i].req.id = i+1;
+ info->shadow[i].req.u.rw.id = i+1;
info->shadow_free = info->ring.req_prod_pvt;
- info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
/* Stage 3: Find pending requests and requeue them. */
for (i = 0; i < BLK_RING_SIZE; i++) {
@@ -1034,17 +1046,19 @@ static int blkif_recover(struct blkfront_info *info)
*req = copy[i].req;
/* We get a new request id, and must reset the shadow state. */
- req->id = get_id_from_freelist(info);
- memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+ req->u.rw.id = get_id_from_freelist(info);
+ memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
+ if (req->operation != BLKIF_OP_DISCARD) {
/* Rewrite any grant references invalidated by susp/resume. */
- for (j = 0; j < req->nr_segments; j++)
- gnttab_grant_foreign_access_ref(
- req->u.rw.seg[j].gref,
- info->xbdev->otherend_id,
- pfn_to_mfn(info->shadow[req->id].frame[j]),
- rq_data_dir(info->shadow[req->id].request));
- info->shadow[req->id].req = *req;
+ for (j = 0; j < req->u.rw.nr_segments; j++)
+ gnttab_grant_foreign_access_ref(
+ req->u.rw.seg[j].gref,
+ info->xbdev->otherend_id,
+ pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
+ rq_data_dir(info->shadow[req->u.rw.id].request));
+ }
+ info->shadow[req->u.rw.id].req = *req;
info->ring.req_prod_pvt++;
}
@@ -1053,7 +1067,7 @@ static int blkif_recover(struct blkfront_info *info)
xenbus_switch_state(info->xbdev, XenbusStateConnected);
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
@@ -1064,7 +1078,7 @@ static int blkif_recover(struct blkfront_info *info)
/* Kick any other new requests queued since we resumed */
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
return 0;
}
@@ -1135,11 +1149,13 @@ static void blkfront_setup_discard(struct blkfront_info *info)
char *type;
unsigned int discard_granularity;
unsigned int discard_alignment;
+ unsigned int discard_secure;
type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
if (IS_ERR(type))
return;
+ info->feature_secdiscard = 0;
if (strncmp(type, "phy", 3) == 0) {
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
"discard-granularity", "%u", &discard_granularity,
@@ -1150,6 +1166,12 @@ static void blkfront_setup_discard(struct blkfront_info *info)
info->discard_granularity = discard_granularity;
info->discard_alignment = discard_alignment;
}
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "discard-secure", "%d", &discard_secure,
+ NULL);
+ if (!err)
+ info->feature_secdiscard = discard_secure;
+
} else if (strncmp(type, "file", 4) == 0)
info->feature_discard = 1;
@@ -1254,10 +1276,10 @@ static void blkfront_connect(struct blkfront_info *info)
xenbus_switch_state(info->xbdev, XenbusStateConnected);
/* Kick pending requests. */
- spin_lock_irq(&blkif_io_lock);
+ spin_lock_irq(&info->io_lock);
info->connected = BLKIF_STATE_CONNECTED;
kick_pending_request_queues(info);
- spin_unlock_irq(&blkif_io_lock);
+ spin_unlock_irq(&info->io_lock);
add_disk(info->gd);
@@ -1387,7 +1409,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
mutex_lock(&blkfront_mutex);
bdev = bdget_disk(disk, 0);
- bdput(bdev);
if (bdev->bd_openers)
goto out;
@@ -1418,6 +1439,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
}
out:
+ bdput(bdev);
mutex_unlock(&blkfront_mutex);
return 0;
}