diff options
author | Linus Torvalds | 2019-05-16 15:55:48 -0700 |
---|---|---|
committer | Linus Torvalds | 2019-05-16 15:55:48 -0700 |
commit | 311f71281ff4b24f86a39c60c959f485c68a6d36 (patch) | |
tree | 05983f559c3e7eb7fc2e0cdab5d14e2ecaf1bf5a /drivers | |
parent | 7878c231dae05bae9dcf2ad4d309f02e51625033 (diff) | |
parent | 8454fca4f53bbe5e0a71613192674c8ce5c52318 (diff) |
Merge tag 'for-5.2/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Improve DM snapshot target's scalability by using finer grained
locking. Requires some list_bl interface improvements.
- Add ability for DM integrity to use a bitmap mode, that tracks
regions where data and metadata are out of sync, instead of using a
journal.
- Improve DM thin provisioning target to not write metadata changes to
disk if the thin-pool and associated thin devices are merely
activated but not used. This avoids metadata corruption due to
concurrent activation of thin devices across different OS instances
(e.g. split brain scenarios, which ultimately would be avoided if
proper device filters were used -- but not having proper filtering
has proven a very common configuration mistake)
- Fix missing call to path selector type->end_io in DM multipath. This
fixes reported performance problems due to inaccurate path selector
IO accounting causing an imbalance of IO (e.g. avoiding issuing IO to
particular path due to it seemingly being heavily used).
- Fix bug in DM cache metadata's loading of its discard bitset that
could lead to all cache blocks being discarded if the very first
cache block was discarded (thankfully in practice the first cache
block is generally in use; be it FS superblock, partition table, disk
label, etc).
- Add testing-only DM dust target which simulates a device that has
failing sectors and/or read failures.
- Fix a DM init error path reference count hang that caused boot hangs
if user supplied malformed input on kernel commandline.
- Fix a couple issues with DM crypt target's logging being overly
verbose or lacking context.
- Various other small fixes to DM init, DM multipath, DM zoned, and DM
crypt.
* tag 'for-5.2/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (42 commits)
dm: fix a couple brace coding style issues
dm crypt: print device name in integrity error message
dm crypt: move detailed message into debug level
dm ioctl: fix hang in early create error condition
dm integrity: whitespace, coding style and dead code cleanup
dm integrity: implement synchronous mode for reboot handling
dm integrity: handle machine reboot in bitmap mode
dm integrity: add a bitmap mode
dm integrity: introduce a function add_new_range_and_wait()
dm integrity: allow large ranges to be described
dm ingerity: pass size to dm_integrity_alloc_page_list()
dm integrity: introduce rw_journal_sectors()
dm integrity: update documentation
dm integrity: don't report unused options
dm integrity: don't check null pointer before kvfree and vfree
dm integrity: correctly calculate the size of metadata area
dm dust: Make dm_dust_init and dm_dust_exit static
dm dust: remove redundant unsigned comparison to less than zero
dm mpath: always free attached_handler_name in parse_path()
dm init: fix max devices/targets checks
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/Kconfig | 9 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 9 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 26 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-dust.c | 515 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.h | 3 | ||||
-rw-r--r-- | drivers/md/dm-init.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 717 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 19 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 359 | ||||
-rw-r--r-- | drivers/md/dm-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 139 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm.c | 12 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-common.c | 2 |
20 files changed, 1583 insertions, 293 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2557f198e175..db269a348b20 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -436,6 +436,15 @@ config DM_DELAY If unsure, say N. +config DM_DUST + tristate "Bad sector simulation target" + depends on BLK_DEV_DM + ---help--- + A target that simulates bad sector behavior. + Useful for testing. + + If unsure, say N. + config DM_INIT bool "DM \"dm-mod.create=\" parameter support" depends on BLK_DEV_DM=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a52b703e588e..be7a6eb92abc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_DUST) += dm-dust.o obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 6fc93834da44..151aa95775be 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd, if (r) return r; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + for (b = 0; ; b++) { r = fn(context, cmd->discard_block_size, to_dblock(b), dm_bitset_cursor_get_value(&c)); if (r) break; + + if (b >= (from_dblock(cmd->discard_nr_blocks) - 1)) + break; + + r = dm_bitset_cursor_next(&c); + if (r) + break; } dm_bitset_cursor_end(&c); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7f6462f74ac8..1b16d34bb785 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -946,6 +946,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) { #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + struct mapped_device *md = dm_table_get_md(ti->table); /* From now we require underlying device with our integrity profile */ if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { @@ -965,7 +966,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) if (crypt_integrity_aead(cc)) { cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; - DMINFO("Integrity AEAD, tag size %u, IV size %u.", + DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size); if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { @@ -973,7 +974,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } } else if (cc->integrity_iv_size) - DMINFO("Additional per-sector space %u bytes for IV.", + DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), cc->integrity_iv_size); if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { @@ -1031,11 +1032,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc, return iv_of_dmreq(cc, dmreq) + cc->iv_size; } -static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, +static __le64 *org_sector_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; - return (uint64_t*) ptr; + return (__le64 *) ptr; } static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, @@ -1071,7 +1072,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv, *tag; - uint64_t *sector; + __le64 *sector; int r = 0; BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); @@ -1143,9 +1144,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc, r = crypto_aead_decrypt(req); } - if (r == -EBADMSG) - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + if (r == -EBADMSG) { + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*sector)); + } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, org_iv, dmreq); @@ -1166,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, struct scatterlist *sg_in, *sg_out; struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv; - uint64_t *sector; + __le64 *sector; int r = 0; /* Reject unexpected unaligned bio. */ @@ -1788,7 +1791,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); io->error = BLK_STS_PROTECTION; } else if (error < 0) @@ -1887,7 +1891,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) * algorithm implementation is used. Help people debug performance * problems by logging the ->cra_driver_name. */ - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); return 0; } @@ -1907,7 +1911,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) return err; } - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); return 0; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index fddffe251bf6..f496213f8b67 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - destroy_workqueue(dc->kdelayd_wq); + if (dc->kdelayd_wq) + destroy_workqueue(dc->kdelayd_wq); if (dc->read.dev) dm_put_device(ti, dc->read.dev); diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c new file mode 100644 index 000000000000..845f376a72d9 --- /dev/null +++ b/drivers/md/dm-dust.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + * + * This is a test "dust" device, which fails reads on specified + * sectors, emulating the behavior of a hard disk drive sending + * a "Read Medium Error" sense. + * + */ + +#include <linux/device-mapper.h> +#include <linux/module.h> +#include <linux/rbtree.h> + +#define DM_MSG_PREFIX "dust" + +struct badblock { + struct rb_node node; + sector_t bb; +}; + +struct dust_device { + struct dm_dev *dev; + struct rb_root badblocklist; + unsigned long long badblock_count; + spinlock_t dust_lock; + unsigned int blksz; + unsigned int sect_per_block; + sector_t start; + bool fail_read_on_bb:1; + bool quiet_mode:1; +}; + +static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct badblock *bblk = rb_entry(node, struct badblock, node); + + if (bblk->bb > blk) + node = node->rb_left; + else if (bblk->bb < blk) + node = node->rb_right; + else + return bblk; + } + + return NULL; +} + +static bool dust_rb_insert(struct rb_root *root, struct badblock *new) +{ + struct badblock *bblk; + struct rb_node **link = &root->rb_node, *parent = NULL; + sector_t value = new->bb; + + while (*link) { + parent = *link; + bblk = rb_entry(parent, struct badblock, node); + + if (bblk->bb > value) + link = &(*link)->rb_left; + else if (bblk->bb < value) + link = &(*link)->rb_right; + else + return false; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); + + return true; +} + +static int dust_remove_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + + if (bblock == NULL) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu not found in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + return -EINVAL; + } + + rb_erase(&bblock->node, &dd->badblocklist); + dd->badblock_count--; + if (!dd->quiet_mode) + DMINFO("%s: badblock removed at block %llu", __func__, block); + kfree(bblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_add_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + bblock = kmalloc(sizeof(*bblock), GFP_KERNEL); + if (bblock == NULL) { + if (!dd->quiet_mode) + DMERR("%s: badblock allocation failed", __func__); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock->bb = block * dd->sect_per_block; + if (!dust_rb_insert(&dd->badblocklist, bblock)) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu already in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + kfree(bblock); + return -EINVAL; + } + + dd->badblock_count++; + if (!dd->quiet_mode) + DMINFO("%s: badblock added at block %llu", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_query_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + if (bblock != NULL) + DMINFO("%s: block %llu found in badblocklist", __func__, block); + else + DMINFO("%s: block %llu not found in badblocklist", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int __dust_map_read(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) + return DM_MAPIO_KILL; + + return DM_MAPIO_REMAPPED; +} + +static int dust_map_read(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + int ret = DM_MAPIO_REMAPPED; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + ret = __dust_map_read(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return ret; +} + +static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) { + rb_erase(&bblk->node, &dd->badblocklist); + dd->badblock_count--; + kfree(bblk); + if (!dd->quiet_mode) { + sector_div(thisblock, dd->sect_per_block); + DMINFO("block %llu removed from badblocklist by write", + (unsigned long long)thisblock); + } + } +} + +static int dust_map_write(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + __dust_map_write(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return DM_MAPIO_REMAPPED; +} + +static int dust_map(struct dm_target *ti, struct bio *bio) +{ + struct dust_device *dd = ti->private; + int ret; + + bio_set_dev(bio, dd->dev->bdev); + bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (bio_data_dir(bio) == READ) + ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + else + ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + + return ret; +} + +static bool __dust_clear_badblocks(struct rb_root *tree, + unsigned long long count) +{ + struct rb_node *node = NULL, *nnode = NULL; + + nnode = rb_first(tree); + if (nnode == NULL) { + BUG_ON(count != 0); + return false; + } + + while (nnode) { + node = nnode; + nnode = rb_next(node); + rb_erase(node, tree); + count--; + kfree(node); + } + BUG_ON(count != 0); + BUG_ON(tree->rb_node != NULL); + + return true; +} + +static int dust_clear_badblocks(struct dust_device *dd) +{ + unsigned long flags; + struct rb_root badblocklist; + unsigned long long badblock_count; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + badblock_count = dd->badblock_count; + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_unlock_irqrestore(&dd->dust_lock, flags); + + if (!__dust_clear_badblocks(&badblocklist, badblock_count)) + DMINFO("%s: no badblocks found", __func__); + else + DMINFO("%s: badblocks cleared", __func__); + + return 0; +} + +/* + * Target parameters: + * + * <device_path> <offset> <blksz> + * + * device_path: path to the block device + * offset: offset to data area from start of device_path + * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2) + */ +static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dust_device *dd; + unsigned long long tmp; + char dummy; + unsigned int blksz; + unsigned int sect_per_block; + sector_t DUST_MAX_BLKSZ_SECTORS = 2097152; + sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS); + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + if (kstrtouint(argv[2], 10, &blksz) || !blksz) { + ti->error = "Invalid block size parameter"; + return -EINVAL; + } + + if (blksz < 512) { + ti->error = "Block size must be at least 512"; + return -EINVAL; + } + + if (!is_power_of_2(blksz)) { + ti->error = "Block size must be a power of 2"; + return -EINVAL; + } + + if (to_sector(blksz) > max_block_sectors) { + ti->error = "Block size is too large"; + return -EINVAL; + } + + sect_per_block = (blksz >> SECTOR_SHIFT); + + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { + ti->error = "Invalid device offset sector"; + return -EINVAL; + } + + dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL); + if (dd == NULL) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) { + ti->error = "Device lookup failed"; + kfree(dd); + return -EINVAL; + } + + dd->sect_per_block = sect_per_block; + dd->blksz = blksz; + dd->start = tmp; + + /* + * Whether to fail a read on a "bad" block. + * Defaults to false; enabled later by message. + */ + dd->fail_read_on_bb = false; + + /* + * Initialize bad block list rbtree. + */ + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_lock_init(&dd->dust_lock); + + dd->quiet_mode = false; + + BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0); + + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + ti->private = dd; + + return 0; +} + +static void dust_dtr(struct dm_target *ti) +{ + struct dust_device *dd = ti->private; + + __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count); + dm_put_device(ti, dd->dev); + kfree(dd); +} + +static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result_buf, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; + bool invalid_msg = false; + int result = -EINVAL; + unsigned long long tmp, block; + unsigned long flags; + char dummy; + + if (argc == 1) { + if (!strcasecmp(argv[0], "addbadblock") || + !strcasecmp(argv[0], "removebadblock") || + !strcasecmp(argv[0], "queryblock")) { + DMERR("%s requires an additional argument", argv[0]); + } else if (!strcasecmp(argv[0], "disable")) { + DMINFO("disabling read failures on bad sectors"); + dd->fail_read_on_bb = false; + result = 0; + } else if (!strcasecmp(argv[0], "enable")) { + DMINFO("enabling read failures on bad sectors"); + dd->fail_read_on_bb = true; + result = 0; + } else if (!strcasecmp(argv[0], "countbadblocks")) { + spin_lock_irqsave(&dd->dust_lock, flags); + DMINFO("countbadblocks: %llu badblock(s) found", + dd->badblock_count); + spin_unlock_irqrestore(&dd->dust_lock, flags); + result = 0; + } else if (!strcasecmp(argv[0], "clearbadblocks")) { + result = dust_clear_badblocks(dd); + } else if (!strcasecmp(argv[0], "quiet")) { + if (!dd->quiet_mode) + dd->quiet_mode = true; + else + dd->quiet_mode = false; + result = 0; + } else { + invalid_msg = true; + } + } else if (argc == 2) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return result; + + block = tmp; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return result; + } + + if (!strcasecmp(argv[0], "addbadblock")) + result = dust_add_block(dd, block); + else if (!strcasecmp(argv[0], "removebadblock")) + result = dust_remove_block(dd, block); + else if (!strcasecmp(argv[0], "queryblock")) + result = dust_query_block(dd, block); + else + invalid_msg = true; + + } else + DMERR("invalid number of arguments '%d'", argc); + + if (invalid_msg) + DMERR("unrecognized message '%s' received", argv[0]); + + return result; +} + +static void dust_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %s %s", dd->dev->name, + dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass", + dd->quiet_mode ? "quiet" : "verbose"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dd->dev->name, + (unsigned long long)dd->start, dd->blksz); + break; + } +} + +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dust_device *dd = ti->private; + struct dm_dev *dev = dd->dev; + + *bdev = dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (dd->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + return 1; + + return 0; +} + +static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, + void *data) +{ + struct dust_device *dd = ti->private; + + return fn(ti, dd->dev, dd->start, ti->len, data); +} + +static struct target_type dust_target = { + .name = "dust", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = dust_ctr, + .dtr = dust_dtr, + .iterate_devices = dust_iterate_devices, + .map = dust_map, + .message = dust_message, + .status = dust_status, + .prepare_ioctl = dust_prepare_ioctl, +}; + +static int __init dm_dust_init(void) +{ + int result = dm_register_target(&dust_target); + + if (result < 0) + DMERR("dm_register_target failed %d", result); + + return result; +} + +static void __exit dm_dust_exit(void) +{ + dm_unregister_target(&dust_target); +} + +module_init(dm_dust_init); +module_exit(dm_dust_exit); + +MODULE_DESCRIPTION(DM_NAME " dust test target"); +MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 721efc493942..3f4139ac1f60 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -11,6 +11,7 @@ #define _LINUX_DM_EXCEPTION_STORE #include <linux/blkdev.h> +#include <linux/list_bl.h> #include <linux/device-mapper.h> /* @@ -27,7 +28,7 @@ typedef sector_t chunk_t; * chunk within the device. */ struct dm_exception { - struct list_head hash_list; + struct hlist_bl_node hash_list; chunk_t old_chunk; chunk_t new_chunk; diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 4b76f84424c3..352e803f566e 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str) while (table_entry) { DMDEBUG("parsing table \"%s\"", str); - if (++dev->dmi.target_count >= DM_MAX_TARGETS) { + if (++dev->dmi.target_count > DM_MAX_TARGETS) { DMERR("too many targets %u > %d", dev->dmi.target_count, DM_MAX_TARGETS); return -EINVAL; @@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str) return -ENOMEM; list_add_tail(&dev->list, devices); - if (++ndev >= DM_MAX_DEVICES) { - DMERR("too many targets %u > %d", - dev->dmi.target_count, DM_MAX_TARGETS); + if (++ndev > DM_MAX_DEVICES) { + DMERR("too many devices %lu > %d", + ndev, DM_MAX_DEVICES); return -EINVAL; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c27c32cf4a30..44e76cda087a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -15,6 +15,7 @@ #include <linux/rbtree.h> #include <linux/delay.h> #include <linux/random.h> +#include <linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/async_tx.h> @@ -24,6 +25,7 @@ #define DEFAULT_INTERLEAVE_SECTORS 32768 #define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768 #define DEFAULT_BUFFER_SECTORS 128 #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 @@ -33,6 +35,8 @@ #define METADATA_WORKQUEUE_MAX_ACTIVE 16 #define RECALC_SECTORS 8192 #define RECALC_WRITE_SUPER 16 +#define BITMAP_BLOCK_SIZE 4096 /* don't change it */ +#define BITMAP_FLUSH_INTERVAL (10 * HZ) /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -48,6 +52,7 @@ #define SB_MAGIC "integrt" #define SB_VERSION_1 1 #define SB_VERSION_2 2 +#define SB_VERSION_3 3 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -60,12 +65,14 @@ struct superblock { __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; __u8 log2_sectors_per_block; - __u8 pad[3]; + __u8 log2_blocks_per_bitmap_bit; + __u8 pad[2]; __u64 recalc_sector; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 +#define SB_FLAG_DIRTY_BITMAP 0x4 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -151,9 +158,18 @@ struct dm_integrity_c { struct workqueue_struct *metadata_wq; struct superblock *sb; unsigned journal_pages; + unsigned n_bitmap_blocks; + struct page_list *journal; struct page_list *journal_io; struct page_list *journal_xor; + struct page_list *recalc_bitmap; + struct page_list *may_write_bitmap; + struct bitmap_block_status *bbs; + unsigned bitmap_flush_interval; + int synchronous_mode; + struct bio_list synchronous_bios; + struct delayed_work bitmap_flush_work; struct crypto_skcipher *journal_crypt; struct scatterlist **journal_scatterlist; @@ -180,6 +196,7 @@ struct dm_integrity_c { __s8 log2_metadata_run; __u8 log2_buffer_sectors; __u8 sectors_per_block; + __u8 log2_blocks_per_bitmap_bit; unsigned char mode; int suspending; @@ -232,17 +249,20 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; + bool recalculate_flag; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; atomic64_t number_of_mismatches; + + struct notifier_block reboot_notifier; }; struct dm_integrity_range { sector_t logical_sector; - unsigned n_sectors; + sector_t n_sectors; bool waiting; union { struct rb_node node; @@ -288,6 +308,16 @@ struct journal_io { struct journal_completion *comp; }; +struct bitmap_block_status { + struct work_struct work; + struct dm_integrity_c *ic; + unsigned idx; + unsigned long *bitmap; + struct bio_list bio_queue; + spinlock_t bio_queue_lock; + +}; + static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 @@ -423,7 +453,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + ic->sb->version = SB_VERSION_3; + else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; else ic->sb->version = SB_VERSION_1; @@ -447,6 +479,137 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) return dm_io(&io_req, 1, &io_loc, NULL); } +#define BITMAP_OP_TEST_ALL_SET 0 +#define BITMAP_OP_TEST_ALL_CLEAR 1 +#define BITMAP_OP_SET 2 +#define BITMAP_OP_CLEAR 3 + +static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, + sector_t sector, sector_t n_sectors, int mode) +{ + unsigned long bit, end_bit, this_end_bit, page, end_page; + unsigned long *data; + + if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { + DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", + (unsigned long long)sector, + (unsigned long long)n_sectors, + ic->sb->log2_sectors_per_block, + ic->log2_blocks_per_bitmap_bit, + mode); + BUG(); + } + + if (unlikely(!n_sectors)) + return true; + + bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end_bit = (sector + n_sectors - 1) >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + + page = bit / (PAGE_SIZE * 8); + bit %= PAGE_SIZE * 8; + + end_page = end_bit / (PAGE_SIZE * 8); + end_bit %= PAGE_SIZE * 8; + +repeat: + if (page < end_page) { + this_end_bit = PAGE_SIZE * 8 - 1; + } else { + this_end_bit = end_bit; + } + + data = lowmem_page_address(bitmap[page].page); + + if (mode == BITMAP_OP_TEST_ALL_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != -1) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (!test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != 0) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = -1; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __set_bit(bit, data); + bit++; + } + } else if (mode == BITMAP_OP_CLEAR) { + if (!bit && this_end_bit == PAGE_SIZE * 8 - 1) + clear_page(data); + else while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = 0; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __clear_bit(bit, data); + bit++; + } + } else { + BUG(); + } + + if (unlikely(page < end_page)) { + bit = 0; + page++; + goto repeat; + } + + return true; +} + +static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src) +{ + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + unsigned i; + + for (i = 0; i < n_bitmap_pages; i++) { + unsigned long *dst_data = lowmem_page_address(dst[i].page); + unsigned long *src_data = lowmem_page_address(src[i].page); + copy_page(dst_data, src_data); + } +} + +static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector) +{ + unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8); + + BUG_ON(bitmap_block >= ic->n_bitmap_blocks); + return &ic->bbs[bitmap_block]; +} + static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, bool e, const char *function) { @@ -455,8 +618,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un if (unlikely(section >= ic->journal_sections) || unlikely(offset >= limit)) { - printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", - function, section, offset, ic->journal_sections, limit); + DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)", + function, section, offset, ic->journal_sections, limit); BUG(); } #endif @@ -756,12 +919,12 @@ static void complete_journal_io(unsigned long error, void *context) complete_journal_op(comp); } -static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, - unsigned n_sections, struct journal_completion *comp) +static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, + unsigned sector, unsigned n_sectors, struct journal_completion *comp) { struct dm_io_request io_req; struct dm_io_region io_loc; - unsigned sector, n_sectors, pl_index, pl_offset; + unsigned pl_index, pl_offset; int r; if (unlikely(dm_integrity_failed(ic))) { @@ -770,9 +933,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned return; } - sector = section * ic->journal_section_sectors; - n_sectors = n_sections * ic->journal_section_sectors; - pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); @@ -805,6 +965,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned } } +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + unsigned sector, n_sectors; + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp); +} + static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) { struct journal_completion io_comp; @@ -988,6 +1159,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit } while (unlikely(new_range->waiting)); } +static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + if (unlikely(!add_new_range(ic, new_range, true))) + wait_and_add_new_range(ic, new_range); +} + static void init_journal_node(struct journal_node *node) { RB_CLEAR_NODE(&node->node); @@ -1204,6 +1381,14 @@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio) int r = dm_integrity_failed(ic); if (unlikely(r) && !bio->bi_status) bio->bi_status = errno_to_blk_status(r); + if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) { + unsigned long flags; + spin_lock_irqsave(&ic->endio_wait.lock, flags); + bio_list_add(&ic->synchronous_bios, bio); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + return; + } bio_endio(bio); } @@ -1477,7 +1662,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) else wanted_tag_size *= ic->tag_size; if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { - DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); + DMERR("Invalid integrity data size %u, expected %u", + bip->bip_iter.bi_size, wanted_tag_size); return DM_MAPIO_KILL; } } @@ -1681,7 +1867,7 @@ retry: unsigned ws, we, range_sectors; dio->range.n_sectors = min(dio->range.n_sectors, - ic->free_sectors << ic->sb->log2_sectors_per_block); + (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) { if (from_map) goto offload_to_thread; @@ -1764,6 +1950,20 @@ offload_to_thread: goto journal_read_write; } + if (ic->mode == 'B' && dio->write) { + if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + struct bitmap_block_status *bbs; + + bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); + spin_lock(&bbs->bio_queue_lock); + bio_list_add(&bbs->bio_queue, bio); + spin_unlock(&bbs->bio_queue_lock); + queue_work(ic->writer_wq, &bbs->work); + return; + } + } + dio->in_flight = (atomic_t)ATOMIC_INIT(2); if (need_sync_io) { @@ -1790,10 +1990,15 @@ offload_to_thread: if (need_sync_io) { wait_for_completion_io(&read_comp); - if (unlikely(ic->recalc_wq != NULL) && - ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) goto skip_check; + if (ic->mode == 'B') { + if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) + goto skip_check; + } + if (likely(!bio->bi_status)) integrity_metadata(&dio->work); else @@ -1831,8 +2036,16 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } - WARN_ON(ic->journal_sections * ic->journal_section_entries != - (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); + if (WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * + ic->journal_section_entries + ic->free_sectors)) { + DMCRIT("journal_sections %u, journal_section_entries %u, " + "n_uncommitted_sections %u, n_committed_sections %u, " + "journal_section_entries %u, free_sectors %u", + ic->journal_sections, ic->journal_section_entries, + ic->n_uncommitted_sections, ic->n_committed_sections, + ic->journal_section_entries, ic->free_sectors); + } } static void integrity_commit(struct work_struct *w) @@ -1981,8 +2194,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); - if (unlikely(!add_new_range(ic, &io->range, true))) - wait_and_add_new_range(ic, &io->range); + add_new_range_and_wait(ic, &io->range); if (likely(!from_replay)) { struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; @@ -2120,11 +2332,14 @@ static void integrity_recalc(struct work_struct *w) sector_t area, offset; sector_t metadata_block; unsigned metadata_offset; + sector_t logical_sector, n_sectors; __u8 *t; unsigned i; int r; unsigned super_counter = 0; + DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); + spin_lock_irq(&ic->endio_wait.lock); next_chunk: @@ -2133,21 +2348,49 @@ next_chunk: goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); - if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) { + if (ic->mode == 'B') { + DEBUG_print("queue_delayed_work: bitmap_flush_work\n"); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + } goto unlock_ret; + } get_area_and_offset(ic, range.logical_sector, &area, &offset); range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); if (!ic->meta_dev) - range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); - - if (unlikely(!add_new_range(ic, &range, true))) - wait_and_add_new_range(ic, &range); + range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); + add_new_range_and_wait(ic, &range); spin_unlock_irq(&ic->endio_wait.lock); + logical_sector = range.logical_sector; + n_sectors = range.n_sectors; + + if (ic->mode == 'B') { + if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + goto advance_and_next; + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + logical_sector += ic->sectors_per_block; + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + get_area_and_offset(ic, logical_sector, &area, &offset); + } + + DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors); if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { recalc_write_super(ic); + if (ic->mode == 'B') { + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); + } super_counter = 0; } @@ -2162,7 +2405,7 @@ next_chunk: io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; io_loc.sector = get_data_sector(ic, area, offset); - io_loc.count = range.n_sectors; + io_loc.count = n_sectors; r = dm_io(&io_req, 1, &io_loc, NULL); if (unlikely(r)) { @@ -2171,8 +2414,8 @@ next_chunk: } t = ic->recalc_tags; - for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); t += ic->tag_size; } @@ -2184,6 +2427,9 @@ next_chunk: goto err; } +advance_and_next: + cond_resched(); + spin_lock_irq(&ic->endio_wait.lock); remove_range_unlocked(ic, &range); ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); @@ -2199,6 +2445,103 @@ unlock_ret: recalc_write_super(ic); } +static void bitmap_block_work(struct work_struct *w) +{ + struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work); + struct dm_integrity_c *ic = bbs->ic; + struct bio *bio; + struct bio_list bio_queue; + struct bio_list waiting; + + bio_list_init(&waiting); + + spin_lock(&bbs->bio_queue_lock); + bio_queue = bbs->bio_queue; + bio_list_init(&bbs->bio_queue); + spin_unlock(&bbs->bio_queue_lock); + + while ((bio = bio_list_pop(&bio_queue))) { + struct dm_integrity_io *dio; + + dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } else { + block_bitmap_op(ic, ic->journal, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + bio_list_add(&waiting, bio); + } + } + + if (bio_list_empty(&waiting)) + return; + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, + bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), + BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); + + while ((bio = bio_list_pop(&waiting))) { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } + + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); +} + +static void bitmap_flush_work(struct work_struct *work) +{ + struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work); + struct dm_integrity_range range; + unsigned long limit; + struct bio *bio; + + dm_integrity_flush_buffers(ic); + + range.logical_sector = 0; + range.n_sectors = ic->provided_data_sectors; + + spin_lock_irq(&ic->endio_wait.lock); + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + + dm_integrity_flush_buffers(ic); + if (ic->meta_dev) + blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); + + limit = ic->provided_data_sectors; + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + limit = le64_to_cpu(ic->sb->recalc_sector) + >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit) + << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + } + /*DEBUG_print("zeroing journal\n");*/ + block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) { + bio_endio(bio); + spin_unlock_irq(&ic->endio_wait.lock); + spin_lock_irq(&ic->endio_wait.lock); + } + spin_unlock_irq(&ic->endio_wait.lock); +} + + static void init_journal(struct dm_integrity_c *ic, unsigned start_section, unsigned n_sections, unsigned char commit_seq) { @@ -2395,9 +2738,37 @@ clear_journal: init_journal_node(&ic->journal_tree[i]); } +static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic) +{ + DEBUG_print("dm_integrity_enter_synchronous_mode\n"); + + if (ic->mode == 'B') { + ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1; + ic->synchronous_mode = 1; + + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + flush_workqueue(ic->commit_wq); + } +} + +static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + + DEBUG_print("dm_integrity_reboot\n"); + + dm_integrity_enter_synchronous_mode(ic); + + return NOTIFY_DONE; +} + static void dm_integrity_postsuspend(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + + WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); del_timer_sync(&ic->autocommit_timer); @@ -2406,6 +2777,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); + if (ic->mode == 'B') + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); @@ -2416,6 +2790,18 @@ static void dm_integrity_postsuspend(struct dm_target *ti) dm_integrity_flush_buffers(ic); } + if (ic->mode == 'B') { + dm_integrity_flush_buffers(ic); +#if 1 + /* set to 0 to test bitmap replay code */ + init_journal(ic, 0, ic->journal_sections, 0); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +#endif + } + WRITE_ONCE(ic->suspending, 0); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); @@ -2426,11 +2812,70 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + DEBUG_print("resume\n"); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { + DEBUG_print("resume dirty_bitmap\n"); + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + if (ic->mode == 'B') { + if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); + block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); + if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, + BITMAP_OP_TEST_ALL_CLEAR)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", + ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + init_journal(ic, 0, ic->journal_sections, 0); + replay_journal(ic); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } else { + replay_journal(ic); + if (ic->mode == 'B') { + int mode; + ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + + mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR; + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + } - replay_journal(ic); - - if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + DEBUG_print("testing recalc: %x\n", ic->sb->flags); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors); if (recalc_pos < ic->provided_data_sectors) { queue_work(ic->recalc_wq, &ic->recalc_work); } else if (recalc_pos > ic->provided_data_sectors) { @@ -2438,6 +2883,16 @@ static void dm_integrity_resume(struct dm_target *ti) recalc_write_super(ic); } } + + ic->reboot_notifier.notifier_call = dm_integrity_reboot; + ic->reboot_notifier.next = NULL; + ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */ + WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); + +#if 0 + /* set to 1 to stress test synchronous mode */ + dm_integrity_enter_synchronous_mode(ic); +#endif } static void dm_integrity_status(struct dm_target *ti, status_type_t type, @@ -2462,10 +2917,14 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); - arg_count = 5; + arg_count = 3; arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'B'; + arg_count += ic->mode == 'B'; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2475,13 +2934,19 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" meta_device:%s", ic->meta_dev->name); if (ic->sectors_per_block != 1) DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->recalculate_flag) DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); - DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); - DMEMIT(" commit_time:%u", ic->autocommit_msec); + if (ic->mode == 'J') { + DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); + } + if (ic->mode == 'B') { + DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); + } #define EMIT_ALG(a, n) \ do { \ @@ -2562,7 +3027,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) return -EINVAL; } else { - __u64 meta_size = ic->provided_data_sectors * ic->tag_size; + __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) >> (ic->log2_buffer_sectors + SECTOR_SHIFT); meta_size <<= ic->log2_buffer_sectors; @@ -2659,37 +3124,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) blk_queue_max_integrity_segments(disk->queue, UINT_MAX); } -static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +static void dm_integrity_free_page_list(struct page_list *pl) { unsigned i; if (!pl) return; - for (i = 0; i < ic->journal_pages; i++) - if (pl[i].page) - __free_page(pl[i].page); + for (i = 0; pl[i].page; i++) + __free_page(pl[i].page); kvfree(pl); } -static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages) { - size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); struct page_list *pl; unsigned i; - pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO); + pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO); if (!pl) return NULL; - for (i = 0; i < ic->journal_pages; i++) { + for (i = 0; i < n_pages; i++) { pl[i].page = alloc_page(GFP_KERNEL); if (!pl[i].page) { - dm_integrity_free_page_list(ic, pl); + dm_integrity_free_page_list(pl); return NULL; } if (i) pl[i - 1].next = &pl[i]; } + pl[i].page = NULL; + pl[i].next = NULL; return pl; } @@ -2702,7 +3167,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str kvfree(sl); } -static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, + struct page_list *pl) { struct scatterlist **sl; unsigned i; @@ -2721,7 +3187,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int unsigned idx; page_list_location(ic, i, 0, &start_index, &start_offset); - page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, + &end_index, &end_offset); n_pages = (end_index - start_index + 1); @@ -2842,7 +3309,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) } ic->journal_pages = journal_pages; - ic->journal = dm_integrity_alloc_page_list(ic); + ic->journal = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal) { *error = "Could not allocate memory for journal"; r = -ENOMEM; @@ -2874,7 +3341,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) DEBUG_print("cipher %s, block size %u iv size %u\n", ic->journal_crypt_alg.alg_string, blocksize, ivsize); - ic->journal_io = dm_integrity_alloc_page_list(ic); + ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_io) { *error = "Could not allocate memory for journal io"; r = -ENOMEM; @@ -2898,7 +3365,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) goto bad; } - ic->journal_xor = dm_integrity_alloc_page_list(ic); + ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_xor) { *error = "Could not allocate memory for journal xor"; r = -ENOMEM; @@ -2922,7 +3389,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); memset(crypt_iv, 0x00, ivsize); - skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); + skcipher_request_set_crypt(req, sg, sg, + PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) @@ -3063,7 +3531,7 @@ bad: * device * offset from the start of the device * tag size - * D - direct writes, J - journal writes, R - recovery mode + * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode * number of optional arguments * optional arguments: * journal_sectors @@ -3071,10 +3539,14 @@ bad: * buffer_sectors * journal_watermark * commit_time + * meta_device + * block_size + * sectors_per_bit + * bitmap_flush_interval * internal_hash * journal_crypt * journal_mac - * block_size + * recalculate */ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@ -3087,10 +3559,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; - bool recalculate; bool should_write_sb; __u64 threshold; unsigned long long start; + __s8 log2_sectors_per_bitmap_bit = -1; + __s8 log2_blocks_per_bitmap_bit; + __u64 bits_in_journal; + __u64 n_bitmap_bits; #define DIRECT_ARGUMENTS 4 @@ -3114,6 +3589,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) init_waitqueue_head(&ic->copy_to_journal_wait); init_completion(&ic->crypto_backoff); atomic64_set(&ic->number_of_mismatches, 0); + ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL; r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); if (r) { @@ -3136,10 +3612,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || + !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { ic->mode = argv[3][0]; - else { - ti->error = "Invalid mode (expecting J, D, R)"; + } else { + ti->error = "Invalid mode (expecting J, B, D, R)"; r = -EINVAL; goto bad; } @@ -3149,7 +3626,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; - recalculate = false; ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; @@ -3161,6 +3637,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) while (extra_args--) { const char *opt_string; unsigned val; + unsigned long long llval; opt_string = dm_shift_arg(&as); if (!opt_string) { r = -EINVAL; @@ -3182,7 +3659,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_put_device(ti, ic->meta_dev); ic->meta_dev = NULL; } - r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); + r = dm_get_device(ti, strchr(opt_string, ':') + 1, + dm_table_get_mode(ti->table), &ic->meta_dev); if (r) { ti->error = "Device lookup failed"; goto bad; @@ -3196,6 +3674,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) { + log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval); + } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) { + if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { + r = -EINVAL; + ti->error = "Invalid bitmap_flush_interval argument"; + } + ic->bitmap_flush_interval = msecs_to_jiffies(val); } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, "Invalid internal_hash argument"); @@ -3212,7 +3698,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad; } else if (!strcmp(opt_string, "recalculate")) { - recalculate = true; + ic->recalculate_flag = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3228,7 +3714,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (!journal_sectors) { journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, - ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); } if (!buffer_sectors) @@ -3263,6 +3749,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) else ic->log2_tag_size = -1; + if (ic->mode == 'B' && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Bitmap mode can be only used with internal hash"; + goto bad; + } + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; timer_setup(&ic->autocommit_timer, autocommit_fn, 0); @@ -3308,7 +3800,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } INIT_WORK(&ic->commit_work, integrity_commit); - if (ic->mode == 'J') { + if (ic->mode == 'J' || ic->mode == 'B') { ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); if (!ic->writer_wq) { ti->error = "Cannot allocate workqueue"; @@ -3349,7 +3841,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -3409,6 +3901,27 @@ try_smaller_buffer: ti->error = "The device is too small"; goto bad; } + + if (log2_sectors_per_bitmap_bit < 0) + log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT); + if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block) + log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block; + + bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); + if (bits_in_journal > UINT_MAX) + bits_in_journal = UINT_MAX; + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; + + log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; + ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + if (should_write_sb) { + ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + } + n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) + + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit; + ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8); + if (!ic->meta_dev) ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); @@ -3433,25 +3946,21 @@ try_smaller_buffer: DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); DEBUG_print(" journal_entries %u\n", ic->journal_entries); DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); - DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); + DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT); DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, (unsigned long long)ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal); - if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { + if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); } - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { - if (!ic->internal_hash) { - r = -EINVAL; - ti->error = "Recalculate is only valid with internal hash"; - goto bad; - } + if (ic->internal_hash) { ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); if (!ic->recalc_wq ) { ti->error = "Cannot allocate workqueue"; @@ -3488,6 +3997,45 @@ try_smaller_buffer: r = create_journal(ic, &ti->error); if (r) goto bad; + + } + + if (ic->mode == 'B') { + unsigned i; + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + + ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->recalc_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->may_write_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); + if (!ic->bbs) { + r = -ENOMEM; + goto bad; + } + INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work); + for (i = 0; i < ic->n_bitmap_blocks; i++) { + struct bitmap_block_status *bbs = &ic->bbs[i]; + unsigned sector, pl_index, pl_offset; + + INIT_WORK(&bbs->work, bitmap_block_work); + bbs->ic = ic; + bbs->idx = i; + bio_list_init(&bbs->bio_queue); + spin_lock_init(&bbs->bio_queue_lock); + + sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT); + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset; + } } if (should_write_sb) { @@ -3512,6 +4060,17 @@ try_smaller_buffer: if (r) goto bad; } + if (ic->mode == 'B') { + unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8); + if (!max_io_len) + max_io_len = 1U << 31; + DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len); + if (!ti->max_io_len || ti->max_io_len > max_io_len) { + r = dm_set_target_max_io_len(ti, max_io_len); + if (r) + goto bad; + } + } if (!ic->internal_hash) dm_integrity_set(ti, ic); @@ -3520,6 +4079,7 @@ try_smaller_buffer: ti->flush_supported = true; return 0; + bad: dm_integrity_dtr(ti); return r; @@ -3542,10 +4102,9 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->writer_wq); if (ic->recalc_wq) destroy_workqueue(ic->recalc_wq); - if (ic->recalc_buffer) - vfree(ic->recalc_buffer); - if (ic->recalc_tags) - kvfree(ic->recalc_tags); + vfree(ic->recalc_buffer); + kvfree(ic->recalc_tags); + kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); mempool_exit(&ic->journal_io_mempool); @@ -3555,9 +4114,11 @@ static void dm_integrity_dtr(struct dm_target *ti) dm_put_device(ti, ic->dev); if (ic->meta_dev) dm_put_device(ti, ic->meta_dev); - dm_integrity_free_page_list(ic, ic->journal); - dm_integrity_free_page_list(ic, ic->journal_io); - dm_integrity_free_page_list(ic, ic->journal_xor); + dm_integrity_free_page_list(ic->journal); + dm_integrity_free_page_list(ic->journal_io); + dm_integrity_free_page_list(ic->journal_xor); + dm_integrity_free_page_list(ic->recalc_bitmap); + dm_integrity_free_page_list(ic->may_write_bitmap); if (ic->journal_scatterlist) dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); if (ic->journal_io_scatterlist) @@ -3595,7 +4156,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c740153b4e52..1e03bc89e20f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi, /* alloc table */ r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md); if (r) - goto err_destroy_dm; + goto err_hash_remove; /* add targets */ for (i = 0; i < dmi->target_count; i++) { @@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi, err_destroy_table: dm_table_destroy(t); +err_hash_remove: + (void) __hash_remove(__get_name_cell(dmi->name)); + /* release reference from __get_name_cell */ + dm_put(md); err_destroy_dm: dm_put(md); dm_destroy(md); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2ee5e357a0a7..dbcc1e41cd57 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REMAPPED; } -static void multipath_release_clone(struct request *clone) +static void multipath_release_clone(struct request *clone, + union map_info *map_context) { + if (unlikely(map_context)) { + /* + * non-NULL map_context means caller is still map + * method; must undo multipath_clone_and_map() + */ + struct dm_mpath_io *mpio = get_mpio(map_context); + struct pgpath *pgpath = mpio->pgpath; + + if (pgpath && pgpath->pg->ps.type->end_io) + pgpath->pg->ps.type->end_io(&pgpath->pg->ps, + &pgpath->path, + mpio->nr_bytes); + } + blk_put_request(clone); } @@ -882,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps if (attached_handler_name || m->hw_handler_name) { INIT_DELAYED_WORK(&p->activate_path, activate_path_work); r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); + kfree(attached_handler_name); if (r) { dm_put_device(ti, p->path.dev); goto bad; @@ -896,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps return p; bad: - kfree(attached_handler_name); free_pgpath(p); return ERR_PTR(r); } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b66745bd08bb..5f7063f05ae0 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error) struct request *rq = tio->orig; blk_rq_unprep_clone(clone); - tio->ti->type->release_clone_rq(clone); + tio->ti->type->release_clone_rq(clone, NULL); rq_end_stats(md, rq); blk_mq_end_request(rq, error); @@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ rq_end_stats(md, rq); if (tio->clone) { blk_rq_unprep_clone(tio->clone); - tio->ti->type->release_clone_rq(tio->clone); + tio->ti->type->release_clone_rq(tio->clone, NULL); } dm_mq_delay_requeue_request(rq, delay_ms); @@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio) case DM_MAPIO_REMAPPED: if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { /* -ENOMEM */ - ti->type->release_clone_rq(clone); + ti->type->release_clone_rq(clone, &tio->info); return DM_MAPIO_REQUEUE; } @@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio) ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_rq_unprep_clone(clone); - tio->ti->type->release_clone_rq(clone); + tio->ti->type->release_clone_rq(clone, &tio->info); tio->clone = NULL; return DM_MAPIO_REQUEUE; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a168963b757d..3107f2b1988b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/list.h> +#include <linux/list_bl.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/slab.h> @@ -44,11 +45,11 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; struct dm_exception_table { uint32_t hash_mask; unsigned hash_shift; - struct list_head *table; + struct hlist_bl_head *table; }; struct dm_snapshot { - struct mutex lock; + struct rw_semaphore lock; struct dm_dev *origin; struct dm_dev *cow; @@ -76,7 +77,9 @@ struct dm_snapshot { atomic_t pending_exceptions_count; - /* Protected by "lock" */ + spinlock_t pe_allocation_lock; + + /* Protected by "pe_allocation_lock" */ sector_t exception_start_sequence; /* Protected by kcopyd single-threaded callback */ @@ -457,9 +460,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) continue; - mutex_lock(&s->lock); + down_read(&s->lock); active = s->active; - mutex_unlock(&s->lock); + up_read(&s->lock); if (active) { if (snap_src) @@ -618,6 +621,36 @@ static void unregister_snapshot(struct dm_snapshot *s) * The lowest hash_shift bits of the chunk number are ignored, allowing * some consecutive chunks to be grouped together. */ +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); + +/* Lock to protect access to the completed and pending exception hash tables. */ +struct dm_exception_table_lock { + struct hlist_bl_head *complete_slot; + struct hlist_bl_head *pending_slot; +}; + +static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, + struct dm_exception_table_lock *lock) +{ + struct dm_exception_table *complete = &s->complete; + struct dm_exception_table *pending = &s->pending; + + lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; + lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; +} + +static void dm_exception_table_lock(struct dm_exception_table_lock *lock) +{ + hlist_bl_lock(lock->complete_slot); + hlist_bl_lock(lock->pending_slot); +} + +static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) +{ + hlist_bl_unlock(lock->pending_slot); + hlist_bl_unlock(lock->complete_slot); +} + static int dm_exception_table_init(struct dm_exception_table *et, uint32_t size, unsigned hash_shift) { @@ -625,12 +658,12 @@ static int dm_exception_table_init(struct dm_exception_table *et, et->hash_shift = hash_shift; et->hash_mask = size - 1; - et->table = dm_vcalloc(size, sizeof(struct list_head)); + et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head)); if (!et->table) return -ENOMEM; for (i = 0; i < size; i++) - INIT_LIST_HEAD(et->table + i); + INIT_HLIST_BL_HEAD(et->table + i); return 0; } @@ -638,15 +671,16 @@ static int dm_exception_table_init(struct dm_exception_table *et, static void dm_exception_table_exit(struct dm_exception_table *et, struct kmem_cache *mem) { - struct list_head *slot; - struct dm_exception *ex, *next; + struct hlist_bl_head *slot; + struct dm_exception *ex; + struct hlist_bl_node *pos, *n; int i, size; size = et->hash_mask + 1; for (i = 0; i < size; i++) { slot = et->table + i; - list_for_each_entry_safe (ex, next, slot, hash_list) + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) kmem_cache_free(mem, ex); } @@ -660,7 +694,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) static void dm_remove_exception(struct dm_exception *e) { - list_del(&e->hash_list); + hlist_bl_del(&e->hash_list); } /* @@ -670,11 +704,12 @@ static void dm_remove_exception(struct dm_exception *e) static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, chunk_t chunk) { - struct list_head *slot; + struct hlist_bl_head *slot; + struct hlist_bl_node *pos; struct dm_exception *e; slot = &et->table[exception_hash(et, chunk)]; - list_for_each_entry (e, slot, hash_list) + hlist_bl_for_each_entry(e, pos, slot, hash_list) if (chunk >= e->old_chunk && chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; @@ -721,7 +756,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) static void dm_insert_exception(struct dm_exception_table *eh, struct dm_exception *new_e) { - struct list_head *l; + struct hlist_bl_head *l; + struct hlist_bl_node *pos; struct dm_exception *e = NULL; l = &eh->table[exception_hash(eh, new_e->old_chunk)]; @@ -731,7 +767,7 @@ static void dm_insert_exception(struct dm_exception_table *eh, goto out; /* List is ordered by old_chunk */ - list_for_each_entry_reverse(e, l, hash_list) { + hlist_bl_for_each_entry(e, pos, l, hash_list) { /* Insert after an existing chunk? */ if (new_e->old_chunk == (e->old_chunk + dm_consecutive_chunk_count(e) + 1) && @@ -752,12 +788,24 @@ static void dm_insert_exception(struct dm_exception_table *eh, return; } - if (new_e->old_chunk > e->old_chunk) + if (new_e->old_chunk < e->old_chunk) break; } out: - list_add(&new_e->hash_list, e ? &e->hash_list : l); + if (!e) { + /* + * Either the table doesn't support consecutive chunks or slot + * l is empty. + */ + hlist_bl_add_head(&new_e->hash_list, l); + } else if (new_e->old_chunk < e->old_chunk) { + /* Add before an existing exception */ + hlist_bl_add_before(&new_e->hash_list, &e->hash_list); + } else { + /* Add to l's tail: e is the last exception in this slot */ + hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); + } } /* @@ -766,6 +814,7 @@ out: */ static int dm_add_exception(void *context, chunk_t old, chunk_t new) { + struct dm_exception_table_lock lock; struct dm_snapshot *s = context; struct dm_exception *e; @@ -778,7 +827,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; + /* + * Although there is no need to lock access to the exception tables + * here, if we don't then hlist_bl_add_head(), called by + * dm_insert_exception(), will complain about accessing the + * corresponding list without locking it first. + */ + dm_exception_table_lock_init(s, old, &lock); + + dm_exception_table_lock(&lock); dm_insert_exception(&s->complete, e); + dm_exception_table_unlock(&lock); return 0; } @@ -807,7 +866,7 @@ static int calc_max_buckets(void) { /* use a fixed size of 2MB */ unsigned long mem = 2 * 1024 * 1024; - mem /= sizeof(struct list_head); + mem /= sizeof(struct hlist_bl_head); return mem; } @@ -927,7 +986,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) int r; chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; - mutex_lock(&s->lock); + down_write(&s->lock); /* * Process chunks (and associated exceptions) in reverse order @@ -942,7 +1001,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) b = __release_queued_bios_after_merge(s); out: - mutex_unlock(&s->lock); + up_write(&s->lock); if (b) flush_bios(b); @@ -1001,9 +1060,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) if (linear_chunks < 0) { DMERR("Read error in exception store: " "shutting down merge"); - mutex_lock(&s->lock); + down_write(&s->lock); s->merge_failed = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); } goto shut; } @@ -1044,10 +1103,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) previous_count = read_pending_exceptions_done_count(); } - mutex_lock(&s->lock); + down_write(&s->lock); s->first_merging_chunk = old_chunk; s->num_merging_chunks = linear_chunks; - mutex_unlock(&s->lock); + up_write(&s->lock); /* Wait until writes to all 'linear_chunks' drain */ for (i = 0; i < linear_chunks; i++) @@ -1089,10 +1148,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) return; shut: - mutex_lock(&s->lock); + down_write(&s->lock); s->merge_failed = 1; b = __release_queued_bios_after_merge(s); - mutex_unlock(&s->lock); + up_write(&s->lock); error_bios(b); merge_shutdown(s); @@ -1188,10 +1247,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->snapshot_overflowed = 0; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + spin_lock_init(&s->pe_allocation_lock); s->exception_start_sequence = 0; s->exception_complete_sequence = 0; s->out_of_order_tree = RB_ROOT; - mutex_init(&s->lock); + init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; @@ -1357,9 +1417,9 @@ static void snapshot_dtr(struct dm_target *ti) /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest && (s == snap_src)) { - mutex_lock(&snap_dest->lock); + down_write(&snap_dest->lock); snap_dest->valid = 0; - mutex_unlock(&snap_dest->lock); + up_write(&snap_dest->lock); DMERR("Cancelling snapshot handover."); } up_read(&_origins_lock); @@ -1390,8 +1450,6 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); - mutex_destroy(&s->lock); - dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); @@ -1467,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) dm_table_event(s->ti->table); } +static void invalidate_snapshot(struct dm_snapshot *s, int err) +{ + down_write(&s->lock); + __invalidate_snapshot(s, err); + up_write(&s->lock); +} + static void pending_complete(void *context, int success) { struct dm_snap_pending_exception *pe = context; @@ -1475,43 +1540,63 @@ static void pending_complete(void *context, int success) struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; struct bio *full_bio = NULL; + struct dm_exception_table_lock lock; int error = 0; + dm_exception_table_lock_init(s, pe->e.old_chunk, &lock); + if (!success) { /* Read/write error - snapshot is unusable */ - mutex_lock(&s->lock); - __invalidate_snapshot(s, -EIO); + invalidate_snapshot(s, -EIO); error = 1; + + dm_exception_table_lock(&lock); goto out; } e = alloc_completed_exception(GFP_NOIO); if (!e) { - mutex_lock(&s->lock); - __invalidate_snapshot(s, -ENOMEM); + invalidate_snapshot(s, -ENOMEM); error = 1; + + dm_exception_table_lock(&lock); goto out; } *e = pe->e; - mutex_lock(&s->lock); + down_read(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid) { + up_read(&s->lock); free_completed_exception(e); error = 1; + goto out; } - /* Check for conflicting reads */ - __check_for_conflicting_io(s, pe->e.old_chunk); - /* - * Add a proper exception, and remove the - * in-flight exception from the list. + * Add a proper exception. After inserting the completed exception all + * subsequent snapshot reads to this chunk will be redirected to the + * COW device. This ensures that we do not starve. Moreover, as long + * as the pending exception exists, neither origin writes nor snapshot + * merging can overwrite the chunk in origin. */ dm_insert_exception(&s->complete, e); + up_read(&s->lock); + + /* Wait for conflicting reads to drain */ + if (__chunk_is_tracked(s, pe->e.old_chunk)) { + dm_exception_table_unlock(&lock); + __check_for_conflicting_io(s, pe->e.old_chunk); + dm_exception_table_lock(&lock); + } out: + /* Remove the in-flight exception from the list */ dm_remove_exception(&pe->e); + + dm_exception_table_unlock(&lock); + snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; @@ -1519,8 +1604,6 @@ out: full_bio->bi_end_io = pe->full_bio_end_io; increment_pending_exceptions_done_count(); - mutex_unlock(&s->lock); - /* Submit any pending write bios */ if (error) { if (full_bio) @@ -1660,43 +1743,59 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) } /* - * Looks to see if this snapshot already has a pending exception - * for this chunk, otherwise it allocates a new one and inserts - * it into the pending table. + * Inserts a pending exception into the pending table. * - * NOTE: a write lock must be held on snap->lock before calling - * this. + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. */ static struct dm_snap_pending_exception * -__find_pending_exception(struct dm_snapshot *s, - struct dm_snap_pending_exception *pe, chunk_t chunk) +__insert_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) { - struct dm_snap_pending_exception *pe2; - - pe2 = __lookup_pending_exception(s, chunk); - if (pe2) { - free_pending_exception(pe); - return pe2; - } - pe->e.old_chunk = chunk; bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->started = 0; pe->full_bio = NULL; + spin_lock(&s->pe_allocation_lock); if (s->store->type->prepare_exception(s->store, &pe->e)) { + spin_unlock(&s->pe_allocation_lock); free_pending_exception(pe); return NULL; } pe->exception_sequence = s->exception_start_sequence++; + spin_unlock(&s->pe_allocation_lock); dm_insert_exception(&s->pending, &pe->e); return pe; } +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + * + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. + */ +static struct dm_snap_pending_exception * +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) +{ + struct dm_snap_pending_exception *pe2; + + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { + free_pending_exception(pe); + return pe2; + } + + return __insert_pending_exception(s, pe, chunk); +} + static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { @@ -1714,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) int r = DM_MAPIO_REMAPPED; chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; + struct dm_exception_table_lock lock; init_tracked_chunk(bio); @@ -1723,13 +1823,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); + dm_exception_table_lock_init(s, chunk, &lock); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ if (!s->valid) return DM_MAPIO_KILL; - mutex_lock(&s->lock); + down_read(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { @@ -1752,15 +1854,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { - mutex_unlock(&s->lock); + dm_exception_table_unlock(&lock); pe = alloc_pending_exception(s); - mutex_lock(&s->lock); - - if (!s->valid || s->snapshot_overflowed) { - free_pending_exception(pe); - r = DM_MAPIO_KILL; - goto out_unlock; - } + dm_exception_table_lock(&lock); e = dm_lookup_exception(&s->complete, chunk); if (e) { @@ -1771,13 +1867,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = __find_pending_exception(s, pe, chunk); if (!pe) { + dm_exception_table_unlock(&lock); + up_read(&s->lock); + + down_write(&s->lock); + if (s->store->userspace_supports_overflow) { - s->snapshot_overflowed = 1; - DMERR("Snapshot overflowed: Unable to allocate exception."); + if (s->valid && !s->snapshot_overflowed) { + s->snapshot_overflowed = 1; + DMERR("Snapshot overflowed: Unable to allocate exception."); + } } else __invalidate_snapshot(s, -ENOMEM); + up_write(&s->lock); + r = DM_MAPIO_KILL; - goto out_unlock; + goto out; } } @@ -1789,7 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; - mutex_unlock(&s->lock); + + dm_exception_table_unlock(&lock); + up_read(&s->lock); + start_full_bio(pe, bio); goto out; } @@ -1797,9 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio_list_add(&pe->snapshot_bios, bio); if (!pe->started) { - /* this is protected by snap->lock */ + /* this is protected by the exception table lock */ pe->started = 1; - mutex_unlock(&s->lock); + + dm_exception_table_unlock(&lock); + up_read(&s->lock); + start_copy(pe); goto out; } @@ -1809,7 +1920,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } out_unlock: - mutex_unlock(&s->lock); + dm_exception_table_unlock(&lock); + up_read(&s->lock); out: return r; } @@ -1845,7 +1957,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); - mutex_lock(&s->lock); + down_write(&s->lock); /* Full merging snapshots are redirected to the origin */ if (!s->valid) @@ -1876,12 +1988,12 @@ redirect_to_origin: bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { - mutex_unlock(&s->lock); + up_write(&s->lock); return do_origin(s->origin, bio); } out_unlock: - mutex_unlock(&s->lock); + up_write(&s->lock); return r; } @@ -1913,7 +2025,7 @@ static int snapshot_preresume(struct dm_target *ti) down_read(&_origins_lock); (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - mutex_lock(&snap_src->lock); + down_read(&snap_src->lock); if (s == snap_src) { DMERR("Unable to resume snapshot source until " "handover completes."); @@ -1923,7 +2035,7 @@ static int snapshot_preresume(struct dm_target *ti) "source is suspended."); r = -EINVAL; } - mutex_unlock(&snap_src->lock); + up_read(&snap_src->lock); } up_read(&_origins_lock); @@ -1969,11 +2081,11 @@ static void snapshot_resume(struct dm_target *ti) (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - mutex_lock(&snap_src->lock); - mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); __handover_exceptions(snap_src, snap_dest); - mutex_unlock(&snap_dest->lock); - mutex_unlock(&snap_src->lock); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); } up_read(&_origins_lock); @@ -1988,9 +2100,9 @@ static void snapshot_resume(struct dm_target *ti) /* Now we have correct chunk size, reregister */ reregister_snapshot(s); - mutex_lock(&s->lock); + down_write(&s->lock); s->active = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); } static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) @@ -2030,7 +2142,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - mutex_lock(&snap->lock); + down_write(&snap->lock); if (!snap->valid) DMEMIT("Invalid"); @@ -2055,7 +2167,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, DMEMIT("Unknown"); } - mutex_unlock(&snap->lock); + up_write(&snap->lock); break; @@ -2107,9 +2219,10 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, int r = DM_MAPIO_REMAPPED; struct dm_snapshot *snap; struct dm_exception *e; - struct dm_snap_pending_exception *pe; + struct dm_snap_pending_exception *pe, *pe2; struct dm_snap_pending_exception *pe_to_start_now = NULL; struct dm_snap_pending_exception *pe_to_start_last = NULL; + struct dm_exception_table_lock lock; chunk_t chunk; /* Do all the snapshots on this origin */ @@ -2121,52 +2234,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (dm_target_is_snapshot_merge(snap->ti)) continue; - mutex_lock(&snap->lock); - - /* Only deal with valid and active snapshots */ - if (!snap->valid || !snap->active) - goto next_snapshot; - /* Nothing to do if writing beyond end of snapshot */ if (sector >= dm_table_get_size(snap->ti->table)) - goto next_snapshot; + continue; /* * Remember, different snapshots can have * different chunk sizes. */ chunk = sector_to_chunk(snap->store, sector); + dm_exception_table_lock_init(snap, chunk, &lock); - /* - * Check exception table to see if block - * is already remapped in this snapshot - * and trigger an exception if not. - */ - e = dm_lookup_exception(&snap->complete, chunk); - if (e) + down_read(&snap->lock); + dm_exception_table_lock(&lock); + + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) goto next_snapshot; pe = __lookup_pending_exception(snap, chunk); if (!pe) { - mutex_unlock(&snap->lock); - pe = alloc_pending_exception(snap); - mutex_lock(&snap->lock); - - if (!snap->valid) { - free_pending_exception(pe); - goto next_snapshot; - } - + /* + * Check exception table to see if block is already + * remapped in this snapshot and trigger an exception + * if not. + */ e = dm_lookup_exception(&snap->complete, chunk); - if (e) { - free_pending_exception(pe); + if (e) goto next_snapshot; - } - pe = __find_pending_exception(snap, pe, chunk); - if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; + dm_exception_table_unlock(&lock); + pe = alloc_pending_exception(snap); + dm_exception_table_lock(&lock); + + pe2 = __lookup_pending_exception(snap, chunk); + + if (!pe2) { + e = dm_lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __insert_pending_exception(snap, pe, chunk); + if (!pe) { + dm_exception_table_unlock(&lock); + up_read(&snap->lock); + + invalidate_snapshot(snap, -ENOMEM); + continue; + } + } else { + free_pending_exception(pe); + pe = pe2; } } @@ -2193,7 +2313,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } next_snapshot: - mutex_unlock(&snap->lock); + dm_exception_table_unlock(&lock); + up_read(&snap->lock); if (pe_to_start_now) { start_copy(pe_to_start_now); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 314d17ca6466..64dd0b34fcf4 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, return DM_MAPIO_KILL; } -static void io_err_release_clone_rq(struct request *clone) +static void io_err_release_clone_rq(struct request *clone, + union map_info *map_context) { } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index ed3caceaed07..7f0840601737 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -202,6 +202,13 @@ struct dm_pool_metadata { bool fail_io:1; /* + * Set once a thin-pool has been accessed through one of the interfaces + * that imply the pool is in-service (e.g. thin devices created/deleted, + * thin-pool message, metadata snapshots, etc). + */ + bool in_service:1; + + /* * Reading the space map roots can fail, so we read it into these * buffers before the superblock is locked and updated. */ @@ -367,6 +374,32 @@ static int subtree_equal(void *context, const void *value1_le, const void *value /*----------------------------------------------------------------*/ +/* + * Variant that is used for in-core only changes or code that + * shouldn't put the pool in service on its own (e.g. commit). + */ +static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) + __acquires(pmd->root_lock) +{ + down_write(&pmd->root_lock); +} +#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) + +static inline void pmd_write_lock(struct dm_pool_metadata *pmd) +{ + __pmd_write_lock(pmd); + if (unlikely(!pmd->in_service)) + pmd->in_service = true; +} + +static inline void pmd_write_unlock(struct dm_pool_metadata *pmd) + __releases(pmd->root_lock) +{ + up_write(&pmd->root_lock); +} + +/*----------------------------------------------------------------*/ + static int superblock_lock_zero(struct dm_pool_metadata *pmd, struct dm_block **sblock) { @@ -790,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) */ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + if (unlikely(!pmd->in_service)) + return 0; + r = __write_changed_details(pmd); if (r < 0) return r; @@ -853,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, pmd->time = 0; INIT_LIST_HEAD(&pmd->thin_devices); pmd->fail_io = false; + pmd->in_service = false; pmd->bdev = bdev; pmd->data_block_size = data_block_size; @@ -903,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) DMWARN("%s: __commit_transaction() failed, error = %d", __func__, r); } - if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); @@ -1032,10 +1068,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __create_thin(pmd, dev); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1123,10 +1159,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __create_snap(pmd, dev, origin); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1166,10 +1202,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __delete_device(pmd, dev); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1180,7 +1216,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1194,7 +1230,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, r = 0; out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1225,7 +1261,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) * We commit to ensure the btree roots which we increment in a * moment are up to date. */ - __commit_transaction(pmd); + r = __commit_transaction(pmd); + if (r < 0) { + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + return r; + } /* * Copy the superblock. @@ -1283,10 +1324,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __reserve_metadata_snap(pmd); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1331,10 +1372,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __release_metadata_snap(pmd); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1377,19 +1418,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); if (!pmd->fail_io) r = __open_device(pmd, dev, 0, td); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } int dm_pool_close_thin_device(struct dm_thin_device *td) { - down_write(&td->pmd->root_lock); + pmd_write_lock_in_core(td->pmd); __close_device(td); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return 0; } @@ -1570,10 +1611,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __insert(td, block, data_block); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1657,10 +1698,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __remove(td, block); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1670,10 +1711,10 @@ int dm_thin_remove_range(struct dm_thin_device *td, { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __remove_range(td, begin, end); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1696,13 +1737,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ { int r = 0; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); for (; b != e; b++) { r = dm_sm_inc_block(pmd->data_sm, b); if (r) break; } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1711,13 +1752,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ { int r = 0; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); for (; b != e; b++) { r = dm_sm_dec_block(pmd->data_sm, b); if (r) break; } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1765,10 +1806,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = dm_sm_new_block(pmd->data_sm, result); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1777,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + /* + * Care is taken to not have commit be what + * triggers putting the thin-pool in-service. + */ + __pmd_write_lock(pmd); if (pmd->fail_io) goto out; r = __commit_transaction(pmd); - if (r <= 0) + if (r < 0) goto out; /* @@ -1790,7 +1835,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) */ r = __begin_transaction(pmd); out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1806,7 +1851,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1817,7 +1862,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) pmd->fail_io = true; out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1948,10 +1993,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __resize_space_map(pmd->data_sm, new_count); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1960,29 +2005,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) { r = __resize_space_map(pmd->metadata_sm, new_count); if (!r) __set_metadata_reserve(pmd); } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) { - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); dm_bm_set_read_only(pmd->bm); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); } void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) { - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); dm_bm_set_read_write(pmd->bm); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); } int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, @@ -1992,9 +2037,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, { int r; - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -2005,7 +2050,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) struct dm_block *sblock; struct thin_disk_superblock *disk_super; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; r = superblock_lock(pmd, &sblock); @@ -2019,7 +2064,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) dm_bm_unlock(sblock); out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index f7822875589e..1cb137f0ef9d 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -190,7 +190,6 @@ struct writeback_struct { struct dm_writecache *wc; struct wc_entry **wc_list; unsigned wc_list_n; - unsigned page_offset; struct page *page; struct wc_entry *wc_list_inline[WB_LIST_INLINE]; struct bio bio; @@ -546,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, e = container_of(node, struct wc_entry, rb_node); if (read_original_sector(wc, e) == block) break; + node = (read_original_sector(wc, e) >= block ? e->rb_node.rb_left : e->rb_node.rb_right); if (unlikely(!node)) { - if (!(flags & WFE_RETURN_FOLLOWING)) { + if (!(flags & WFE_RETURN_FOLLOWING)) return NULL; - } if (read_original_sector(wc, e) >= block) { - break; + return e; } else { node = rb_next(&e->rb_node); - if (unlikely(!node)) { + if (unlikely(!node)) return NULL; - } e = container_of(node, struct wc_entry, rb_node); - break; + return e; } } } @@ -571,7 +569,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, node = rb_prev(&e->rb_node); else node = rb_next(&e->rb_node); - if (!node) + if (unlikely(!node)) return e; e2 = container_of(node, struct wc_entry, rb_node); if (read_original_sector(wc, e2) != block) @@ -804,7 +802,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ writecache_free_entry(wc, e); } - if (!node) + if (unlikely(!node)) break; e = container_of(node, struct wc_entry, rb_node); @@ -1478,10 +1476,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); wb = container_of(bio, struct writeback_struct, bio); wb->wc = wc; - wb->bio.bi_end_io = writecache_writeback_endio; - bio_set_dev(&wb->bio, wc->dev->bdev); - wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); - wb->page_offset = PAGE_SIZE; + bio->bi_end_io = writecache_writeback_endio; + bio_set_dev(bio, wc->dev->bdev); + bio->bi_iter.bi_sector = read_original_sector(wc, e); if (max_pages <= WB_LIST_INLINE || unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), GFP_NOIO | __GFP_NORETRY | @@ -1507,12 +1504,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba wb->wc_list[wb->wc_list_n++] = f; e = f; } - bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); + bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; - bio_endio(&wb->bio); + bio_endio(bio); } else { - submit_bio(&wb->bio); + submit_bio(bio); } __writeback_throttle(wc, wbl); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fa68336560c3..d8334cd45d7c 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd) goto out; } + if (!nr_blkz) + break; + /* Process report */ for (i = 0; i < nr_blkz; i++) { ret = dmz_init_zone(zmd, zone, &blkz[i]); @@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) /* Get zone information from disk */ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), &blkz, &nr_blkz, GFP_NOIO); + if (!nr_blkz) + ret = -EIO; if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 8865c1709e16..51d029bbb740 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) q = bdev_get_queue(dev->bdev); dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); + aligned_capacity = dev->capacity & + ~((sector_t)blk_queue_zone_sectors(q) - 1); if (ti->begin || ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { ti->error = "Partial mapping not supported"; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 043f0761e4a0..1fb1333fefec 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md } static struct table_device *find_table_device(struct list_head *l, dev_t dev, - fmode_t mode) { + fmode_t mode) +{ struct table_device *td; list_for_each_entry(td, l, list) @@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev, } int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, - struct dm_dev **result) { + struct dm_dev **result) +{ int r; struct table_device *td; @@ -1906,7 +1908,6 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); - struct dax_device *dax_dev = NULL; struct mapped_device *md; void *old_md; @@ -1969,11 +1970,10 @@ static struct mapped_device *alloc_dev(int minor) sprintf(md->disk->disk_name, "dm-%d", minor); if (IS_ENABLED(CONFIG_DAX_DRIVER)) { - dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); - if (!dax_dev) + md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!md->dax_dev) goto bad; } - md->dax_dev = dax_dev; add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 0a3b8ae4a29c..b8a62188f6be 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end, static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) { + memset(ll, 0, sizeof(struct ll_disk)); + ll->tm = tm; ll->bitmap_info.tm = tm; |