summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-28 12:17:17 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-28 12:17:17 -0700
commit1731a47444e7bf66d5cf9415bbd6ac5e3903d719 (patch)
tree7cf48a70b8042333e580712e19e8da987ce01afa
parent0fa8dc423c55ad3bef3cafe16ccc1caec8e4c7d7 (diff)
parent273752c9ff03eb83856601b2a3458218bb949e46 (diff)
Merge tag 'for-4.13/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper fixes from Mike Snitzer: - a few DM integrity fixes that improve performance. One that address inefficiencies in the on-disk journal device layout. Another that makes use of the block layer's on-stack plugging when writing the journal. - a dm-bufio fix for the blk_status_t conversion that went in during the merge window. - a few DM raid fixes that address correctness when suspending the device and a validation fix for validation that occurs during device activation. - a couple DM zoned target fixes. Important one being the fix to not use GFP_KERNEL in the IO path due to concerns about deadlock in low-memory conditions (e.g. swap over a DM zoned device, etc). - a DM DAX device fix to make sure dm_dax_flush() is called if the underlying DAX device is operating as a write cache. * tag 'for-4.13/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: dm, dax: Make sure dm_dax_flush() is called if device supports it dm verity fec: fix GFP flags used with mempool_alloc() dm zoned: use GFP_NOIO in I/O path dm zoned: remove test for impossible REQ_OP_FLUSH conditions dm raid: bump target version dm raid: avoid mddev->suspended access dm raid: fix activation check in validate_raid_redundancy() dm raid: remove WARN_ON() in raid10_md_layout_to_format() dm bufio: fix error code in dm_bufio_write_dirty_buffers() dm integrity: test for corrupted disk format during table load dm integrity: WARN_ON if variables representing journal usage get out of sync dm integrity: use plugging when writing the journal dm integrity: fix inefficient allocation of journal space
-rw-r--r--Documentation/device-mapper/dm-raid.txt1
-rw-r--r--drivers/dax/super.c6
-rw-r--r--drivers/md/dm-bufio.c3
-rw-r--r--drivers/md/dm-integrity.c22
-rw-r--r--drivers/md/dm-raid.c29
-rw-r--r--drivers/md/dm-table.c35
-rw-r--r--drivers/md/dm-verity-fec.c21
-rw-r--r--drivers/md/dm-zoned-metadata.c12
-rw-r--r--drivers/md/dm-zoned-reclaim.c2
-rw-r--r--drivers/md/dm-zoned-target.c8
-rw-r--r--include/linux/dax.h1
11 files changed, 94 insertions, 46 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 7e06e65586d4..4a0a7469fdd7 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -343,3 +343,4 @@ Version History
1.11.0 Fix table line argument order
(wrong raid10_copies/raid10_format sequence)
1.11.1 Add raid4/5/6 journal write-back support via journal_mode option
+1.12.1 fix for MD deadlock between mddev_suspend() and md_write_start() available
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index ce9e563e6e1d..938eb4868f7f 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -278,6 +278,12 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc)
}
EXPORT_SYMBOL_GPL(dax_write_cache);
+bool dax_write_cache_enabled(struct dax_device *dax_dev)
+{
+ return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
+
bool dax_alive(struct dax_device *dax_dev)
{
lockdep_assert_held(&dax_srcu);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 850ff6c67994..44f4a8ac95bd 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1258,8 +1258,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
*/
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
- blk_status_t a;
- int f;
+ int a, f;
unsigned long buffers_processed = 0;
struct dm_buffer *b, *tmp;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 1b224aa9cf15..3acce09bba35 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1587,16 +1587,18 @@ retry:
if (likely(ic->mode == 'J')) {
if (dio->write) {
unsigned next_entry, i, pos;
- unsigned ws, we;
+ unsigned ws, we, range_sectors;
- dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors);
+ dio->range.n_sectors = min(dio->range.n_sectors,
+ ic->free_sectors << ic->sb->log2_sectors_per_block);
if (unlikely(!dio->range.n_sectors))
goto sleep;
- ic->free_sectors -= dio->range.n_sectors;
+ range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
+ ic->free_sectors -= range_sectors;
journal_section = ic->free_section;
journal_entry = ic->free_section_entry;
- next_entry = ic->free_section_entry + dio->range.n_sectors;
+ next_entry = ic->free_section_entry + range_sectors;
ic->free_section_entry = next_entry % ic->journal_section_entries;
ic->free_section += next_entry / ic->journal_section_entries;
ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
@@ -1727,6 +1729,8 @@ static void pad_uncommitted(struct dm_integrity_c *ic)
wraparound_section(ic, &ic->free_section);
ic->n_uncommitted_sections++;
}
+ WARN_ON(ic->journal_sections * ic->journal_section_entries !=
+ (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
}
static void integrity_commit(struct work_struct *w)
@@ -1821,6 +1825,9 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
{
unsigned i, j, n;
struct journal_completion comp;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
comp.ic = ic;
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
@@ -1945,6 +1952,8 @@ skip_io:
dm_bufio_write_dirty_buffers_async(ic->bufio);
+ blk_finish_plug(&plug);
+
complete_journal_op(&comp);
wait_for_completion_io(&comp.comp);
@@ -3019,6 +3028,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->error = "Block size doesn't match the information in superblock";
goto bad;
}
+ if (!le32_to_cpu(ic->sb->journal_sections)) {
+ r = -EINVAL;
+ ti->error = "Corrupted superblock, journal_sections is 0";
+ goto bad;
+ }
/* make sure that ti->max_io_len doesn't overflow */
if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2e10c2f13a34..5bfe285ea9d1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -208,6 +208,7 @@ struct raid_dev {
#define RT_FLAG_RS_BITMAP_LOADED 2
#define RT_FLAG_UPDATE_SBS 3
#define RT_FLAG_RESHAPE_RS 4
+#define RT_FLAG_RS_SUSPENDED 5
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -564,9 +565,10 @@ static const char *raid10_md_layout_to_format(int layout)
if (__raid10_near_copies(layout) > 1)
return "near";
- WARN_ON(__raid10_far_copies(layout) < 2);
+ if (__raid10_far_copies(layout) > 1)
+ return "far";
- return "far";
+ return "unknown";
}
/* Return md raid10 algorithm for @name */
@@ -2540,11 +2542,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (!freshest)
return 0;
- if (validate_raid_redundancy(rs)) {
- rs->ti->error = "Insufficient redundancy to activate array";
- return -EINVAL;
- }
-
/*
* Validation of the freshest device provides the source of
* validation for the remaining devices.
@@ -2553,6 +2550,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (super_validate(rs, freshest))
return -EINVAL;
+ if (validate_raid_redundancy(rs)) {
+ rs->ti->error = "Insufficient redundancy to activate array";
+ return -EINVAL;
+ }
+
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags) &&
rdev != freshest &&
@@ -3168,6 +3170,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
mddev_suspend(&rs->md);
+ set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
if (rs_is_raid456(rs)) {
@@ -3625,7 +3628,7 @@ static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- if (!rs->md.suspended)
+ if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_suspend(&rs->md);
rs->md.ro = 1;
@@ -3759,7 +3762,7 @@ static int rs_start_reshape(struct raid_set *rs)
return r;
/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
- if (mddev->suspended)
+ if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_resume(mddev);
/*
@@ -3786,8 +3789,8 @@ static int rs_start_reshape(struct raid_set *rs)
}
/* Suspend because a resume will happen in raid_resume() */
- if (!mddev->suspended)
- mddev_suspend(mddev);
+ set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
+ mddev_suspend(mddev);
/*
* Now reshape got set up, update superblocks to
@@ -3883,13 +3886,13 @@ static void raid_resume(struct dm_target *ti)
if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (mddev->suspended)
+ if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_resume(mddev);
}
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 11, 1},
+ .version = {1, 12, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a39bcd9b982a..28a4071cdf85 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/mount.h>
+#include <linux/dax.h>
#define DM_MSG_PREFIX "table"
@@ -1630,6 +1631,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
return false;
}
+static int device_dax_write_cache_enabled(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct dax_device *dax_dev = dev->dax_dev;
+
+ if (!dax_dev)
+ return false;
+
+ if (dax_write_cache_enabled(dax_dev))
+ return true;
+ return false;
+}
+
+static int dm_table_supports_dax_write_cache(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti,
+ device_dax_write_cache_enabled, NULL))
+ return true;
+ }
+
+ return false;
+}
+
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1785,6 +1817,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
+ if (dm_table_supports_dax_write_cache(t))
+ dax_write_cache(t->md->dax_dev, true);
+
/* Ensure that all underlying devices are non-rotational. */
if (dm_table_all_devices_attribute(t, device_is_nonrot))
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 504ba3fa328b..e13f90832b6b 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -308,19 +308,14 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
{
unsigned n;
- if (!fio->rs) {
- fio->rs = mempool_alloc(v->fec->rs_pool, 0);
- if (unlikely(!fio->rs)) {
- DMERR("failed to allocate RS");
- return -ENOMEM;
- }
- }
+ if (!fio->rs)
+ fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
fec_for_each_prealloc_buffer(n) {
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO);
+ fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT);
if (unlikely(!fio->bufs[n])) {
DMERR("failed to allocate FEC buffer");
return -ENOMEM;
@@ -332,22 +327,16 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO);
+ fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
/* we can manage with even one buffer if necessary */
if (unlikely(!fio->bufs[n]))
break;
}
fio->nbufs = n;
- if (!fio->output) {
+ if (!fio->output)
fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
- if (!fio->output) {
- DMERR("failed to allocate FEC page");
- return -ENOMEM;
- }
- }
-
return 0;
}
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 884ff7c170a0..a4fa2ada6883 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -624,7 +624,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
if (ret == 0)
- ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
+ ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
return ret;
}
@@ -658,7 +658,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
/* Flush drive cache (this will also sync data) */
if (ret == 0)
- ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
+ ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
return ret;
}
@@ -722,7 +722,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
- ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
+ ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
goto out;
}
@@ -927,7 +927,7 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
}
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_NOIO);
if (!page)
return -ENOMEM;
@@ -1183,7 +1183,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Get zone information from disk */
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
- &blkz, &nr_blkz, GFP_KERNEL);
+ &blkz, &nr_blkz, GFP_NOIO);
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
@@ -1257,7 +1257,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
ret = blkdev_reset_zones(dev->bdev,
dmz_start_sect(zmd, zone),
- dev->zone_nr_sectors, GFP_KERNEL);
+ dev->zone_nr_sectors, GFP_NOIO);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
dmz_id(zmd, zone), ret);
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index 05c0a126f5c8..44a119e12f1a 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -75,7 +75,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
nr_blocks = block - wp_block;
ret = blkdev_issue_zeroout(zrc->dev->bdev,
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
- dmz_blk2sect(nr_blocks), GFP_NOFS, false);
+ dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
if (ret) {
dmz_dev_err(zrc->dev,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 2b538fa817f4..b08bbbd4d902 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -541,7 +541,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
int ret;
/* Create a new chunk work */
- cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOFS);
+ cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
if (!cw)
goto out;
@@ -588,7 +588,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
bio->bi_bdev = dev->bdev;
- if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE))
+ if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
return DM_MAPIO_REMAPPED;
/* The BIO should be block aligned */
@@ -603,7 +603,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
bioctx->status = BLK_STS_OK;
/* Set the BIO pending in the flush list */
- if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) {
+ if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
spin_lock(&dmz->flush_lock);
bio_list_add(&dmz->flush_list, bio);
spin_unlock(&dmz->flush_lock);
@@ -785,7 +785,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Chunk BIO work */
mutex_init(&dmz->chunk_lock);
- INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOFS);
+ INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
0, dev->name);
if (!dmz->chunk_wq) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 794811875732..df97b7af7e2c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -87,6 +87,7 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t size);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
+bool dax_write_cache_enabled(struct dax_device *dax_dev);
/*
* We use lowest available bit in exceptional entry for locking, one bit for