diff options
author | Linus Torvalds | 2021-02-22 10:22:54 -0800 |
---|---|---|
committer | Linus Torvalds | 2021-02-22 10:22:54 -0800 |
commit | 325b764089c9bef2be45354db4f15e5b12ae406d (patch) | |
tree | 3f06f8a8054874caf45242f57885a9a64e6ea57b | |
parent | a99163e9e708d5d773b7de6da952fcddc341f977 (diff) | |
parent | a666e5c05e7c4aaabb2c5d58117b0946803d03d2 (diff) |
Merge tag 'for-5.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Fix DM integrity's HMAC support to provide enhanced security of
internal_hash and journal_mac capabilities.
- Various DM writecache fixes to address performance, fix table output
to match what was provided at table creation, fix writing beyond end
of device when shrinking underlying data device, and a couple other
small cleanups.
- Add DM crypt support for using trusted keys.
- Fix deadlock when swapping to DM crypt device by throttling number of
in-flight REQ_SWAP bios. Implemented in DM core so that other
bio-based targets can opt-in by setting ti->limit_swap_bios.
- Fix various inverted logic bugs in the .iterate_devices callout
functions that are used to assess if specific feature or capability
is supported across all devices being combined/stacked by DM.
- Fix DM era target bugs that exposed users to lost writes or memory
leaks.
- Add DM core support for passing through inline crypto support of
underlying devices. Includes block/keyslot-manager changes that
enable extending this support to DM.
- Various small fixes and cleanups (spelling fixes, front padding
calculation cleanup, cleanup conditional zoned support in targets,
etc).
* tag 'for-5.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (31 commits)
dm: fix deadlock when swapping to encrypted device
dm: simplify target code conditional on CONFIG_BLK_DEV_ZONED
dm: set DM_TARGET_PASSES_CRYPTO feature for some targets
dm: support key eviction from keyslot managers of underlying devices
dm: add support for passing through inline crypto support
block/keyslot-manager: Introduce functions for device mapper support
block/keyslot-manager: Introduce passthrough keyslot manager
dm era: only resize metadata in preresume
dm era: Use correct value size in equality function of writeset tree
dm era: Fix bitset memory leaks
dm era: Verify the data block size hasn't changed
dm era: Reinitialize bitset cache before digesting a new writeset
dm era: Update in-core bitset after committing the metadata
dm era: Recover committed writeset after crash
dm writecache: use bdev_nr_sectors() instead of open-coded equivalent
dm writecache: fix writing beyond end of underlying device when shrinking
dm table: remove needless request_queue NULL pointer checks
dm table: fix zoned iterate_devices based device capability checks
dm table: fix DAX iterate_devices based device capability checks
dm table: fix iterate_devices based device capability checks
...
-rw-r--r-- | Documentation/admin-guide/device-mapper/dm-crypt.rst | 2 | ||||
-rw-r--r-- | Documentation/admin-guide/device-mapper/dm-integrity.rst | 11 | ||||
-rw-r--r-- | block/blk-crypto.c | 1 | ||||
-rw-r--r-- | block/keyslot-manager.c | 146 | ||||
-rw-r--r-- | drivers/md/Kconfig | 1 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 9 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 39 | ||||
-rw-r--r-- | drivers/md/dm-dust.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-era-target.c | 93 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 140 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 399 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 80 | ||||
-rw-r--r-- | drivers/md/dm.c | 96 | ||||
-rw-r--r-- | drivers/md/dm.h | 2 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-internal.h | 2 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-spine.c | 2 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 32 | ||||
-rw-r--r-- | include/linux/keyslot-manager.h | 11 | ||||
-rw-r--r-- | include/uapi/linux/dm-ioctl.h | 4 |
21 files changed, 868 insertions, 218 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index 1a6753b76dbb..aa2d04d95df6 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -67,7 +67,7 @@ Parameters:: the value passed in <key_size>. <key_type> - Either 'logon', 'user' or 'encrypted' kernel key type. + Either 'logon', 'user', 'encrypted' or 'trusted' kernel key type. <key_description> The kernel keyring key description crypt target should look for diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index cd198ccf4292..8db172efa272 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -186,6 +186,17 @@ fix_padding space-efficient. If this option is not present, large padding is used - that is for compatibility with older kernels. +fix_hmac + Improve security of internal_hash and journal_mac: + + - the section number is mixed to the mac, so that an attacker can't + copy sectors from one journal section to another journal section + - the superblock is protected by journal_mac + - a 16-byte salt stored in the superblock is mixed to the mac, so + that the attacker can't detect that two disks have the same hmac + key and also to disallow the attacker to move sectors from one + disk to another + legacy_recalculate Allow recalculating of volumes with HMAC keys. This is disabled by default for security reasons - an attacker could modify the volume, diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 09fcb18fa778..c5bdaafffa29 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -409,3 +409,4 @@ int blk_crypto_evict_key(struct request_queue *q, */ return blk_crypto_fallback_evict_key(key); } +EXPORT_SYMBOL_GPL(blk_crypto_evict_key); diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c index 324bf4244f5f..2c4a55bea6ca 100644 --- a/block/keyslot-manager.c +++ b/block/keyslot-manager.c @@ -63,6 +63,11 @@ static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) pm_runtime_put_sync(ksm->dev); } +static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm) +{ + return ksm->num_slots == 0; +} + /** * blk_ksm_init() - Initialize a keyslot manager * @ksm: The keyslot_manager to initialize. @@ -234,6 +239,10 @@ blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, int err; *slot_ptr = NULL; + + if (blk_ksm_is_passthrough(ksm)) + return BLK_STS_OK; + down_read(&ksm->lock); slot = blk_ksm_find_and_grab_keyslot(ksm, key); up_read(&ksm->lock); @@ -354,6 +363,16 @@ int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, struct blk_ksm_keyslot *slot; int err = 0; + if (blk_ksm_is_passthrough(ksm)) { + if (ksm->ksm_ll_ops.keyslot_evict) { + blk_ksm_hw_enter(ksm); + err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1); + blk_ksm_hw_exit(ksm); + return err; + } + return 0; + } + blk_ksm_hw_enter(ksm); slot = blk_ksm_find_keyslot(ksm, key); if (!slot) @@ -389,6 +408,9 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) { unsigned int slot; + if (blk_ksm_is_passthrough(ksm)) + return; + /* This is for device initialization, so don't resume the device */ down_write(&ksm->lock); for (slot = 0; slot < ksm->num_slots; slot++) { @@ -430,3 +452,127 @@ void blk_ksm_unregister(struct request_queue *q) { q->ksm = NULL; } + +/** + * blk_ksm_intersect_modes() - restrict supported modes by child device + * @parent: The keyslot manager for parent device + * @child: The keyslot manager for child device, or NULL + * + * Clear any crypto mode support bits in @parent that aren't set in @child. + * If @child is NULL, then all parent bits are cleared. + * + * Only use this when setting up the keyslot manager for a layered device, + * before it's been exposed yet. + */ +void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, + const struct blk_keyslot_manager *child) +{ + if (child) { + unsigned int i; + + parent->max_dun_bytes_supported = + min(parent->max_dun_bytes_supported, + child->max_dun_bytes_supported); + for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported); + i++) { + parent->crypto_modes_supported[i] &= + child->crypto_modes_supported[i]; + } + } else { + parent->max_dun_bytes_supported = 0; + memset(parent->crypto_modes_supported, 0, + sizeof(parent->crypto_modes_supported)); + } +} +EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes); + +/** + * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes + * and DUN bytes that another KSM supports. Here, + * "superset" refers to the mathematical meaning of the + * word - i.e. if two KSMs have the *same* capabilities, + * they *are* considered supersets of each other. + * @ksm_superset: The KSM that we want to verify is a superset + * @ksm_subset: The KSM that we want to verify is a subset + * + * Return: True if @ksm_superset supports a superset of the crypto modes and DUN + * bytes that @ksm_subset supports. + */ +bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, + struct blk_keyslot_manager *ksm_subset) +{ + int i; + + if (!ksm_subset) + return true; + + if (!ksm_superset) + return false; + + for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) { + if (ksm_subset->crypto_modes_supported[i] & + (~ksm_superset->crypto_modes_supported[i])) { + return false; + } + } + + if (ksm_subset->max_dun_bytes_supported > + ksm_superset->max_dun_bytes_supported) { + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_ksm_is_superset); + +/** + * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of + * another KSM + * @target_ksm: The KSM whose restrictions to update. + * @reference_ksm: The KSM to whose restrictions this function will update + * @target_ksm's restrictions to. + * + * Blk-crypto requires that crypto capabilities that were + * advertised when a bio was created continue to be supported by the + * device until that bio is ended. This is turn means that a device cannot + * shrink its advertised crypto capabilities without any explicit + * synchronization with upper layers. So if there's no such explicit + * synchronization, @reference_ksm must support all the crypto capabilities that + * @target_ksm does + * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true). + * + * Note also that as long as the crypto capabilities are being expanded, the + * order of updates becoming visible is not important because it's alright + * for blk-crypto to see stale values - they only cause blk-crypto to + * believe that a crypto capability isn't supported when it actually is (which + * might result in blk-crypto-fallback being used if available, or the bio being + * failed). + */ +void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, + struct blk_keyslot_manager *reference_ksm) +{ + memcpy(target_ksm->crypto_modes_supported, + reference_ksm->crypto_modes_supported, + sizeof(target_ksm->crypto_modes_supported)); + + target_ksm->max_dun_bytes_supported = + reference_ksm->max_dun_bytes_supported; +} +EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities); + +/** + * blk_ksm_init_passthrough() - Init a passthrough keyslot manager + * @ksm: The keyslot manager to init + * + * Initialize a passthrough keyslot manager. + * Called by e.g. storage drivers to set up a keyslot manager in their + * request_queue, when the storage driver wants to manage its keys by itself. + * This is useful for inline encryption hardware that doesn't have the concept + * of keyslots, and for layered devices. + */ +void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm) +{ + memset(ksm, 0, sizeof(*ksm)); + init_rwsem(&ksm->lock); +} +EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9e44c09f6410..f2014385d48b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -270,6 +270,7 @@ config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) + depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 086d293c2b03..5953ff2bd260 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,6 +13,7 @@ #include <linux/ktime.h> #include <linux/genhd.h> #include <linux/blk-mq.h> +#include <linux/keyslot-manager.h> #include <trace/events/block.h> @@ -102,6 +103,10 @@ struct mapped_device { /* kobject and completion */ struct dm_kobject_holder kobj_holder; + int swap_bios; + struct semaphore swap_bios_semaphore; + struct mutex swap_bios_lock; + struct dm_stats stats; /* for blk-mq request-based DM support */ @@ -162,6 +167,10 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct blk_keyslot_manager *ksm; +#endif }; static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5a55617a08e6..11c105ecd165 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -37,6 +37,7 @@ #include <linux/key-type.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> +#include <keys/trusted-type.h> #include <linux/device-mapper.h> @@ -133,7 +134,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_WRITE_INLINE }; enum cipher_flags { - CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ + CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */ }; @@ -2436,7 +2437,6 @@ static int set_key_user(struct crypt_config *cc, struct key *key) return 0; } -#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) static int set_key_encrypted(struct crypt_config *cc, struct key *key) { const struct encrypted_key_payload *ekp; @@ -2452,7 +2452,22 @@ static int set_key_encrypted(struct crypt_config *cc, struct key *key) return 0; } -#endif /* CONFIG_ENCRYPTED_KEYS */ + +static int set_key_trusted(struct crypt_config *cc, struct key *key) +{ + const struct trusted_key_payload *tkp; + + tkp = key->payload.data[0]; + if (!tkp) + return -EKEYREVOKED; + + if (cc->key_size != tkp->key_len) + return -EINVAL; + + memcpy(cc->key, tkp->key, cc->key_size); + + return 0; +} static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) { @@ -2482,11 +2497,14 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) { type = &key_type_user; set_key = set_key_user; -#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) - } else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { + } else if (IS_ENABLED(CONFIG_ENCRYPTED_KEYS) && + !strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { type = &key_type_encrypted; set_key = set_key_encrypted; -#endif + } else if (IS_ENABLED(CONFIG_TRUSTED_KEYS) && + !strncmp(key_string, "trusted:", key_desc - key_string + 1)) { + type = &key_type_trusted; + set_key = set_key_trusted; } else { return -EINVAL; } @@ -3116,7 +3134,6 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar } #ifdef CONFIG_BLK_DEV_ZONED - static int crypt_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { @@ -3127,7 +3144,8 @@ static int crypt_report_zones(struct dm_target *ti, return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, dm_report_zones_cb, args); } - +#else +#define crypt_report_zones NULL #endif /* @@ -3324,6 +3342,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) wake_up_process(cc->write_thread); ti->num_flush_bios = 1; + ti->limit_swap_bios = true; return 0; @@ -3558,14 +3577,12 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 22, 0}, + .version = {1, 23, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -#ifdef CONFIG_BLK_DEV_ZONED .features = DM_TARGET_ZONED_HM, .report_zones = crypt_report_zones, -#endif .map = crypt_map, .status = crypt_status, .postsuspend = crypt_postsuspend, diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 072ea913cebc..cbe1058ee589 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -130,7 +130,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block, dd->badblock_count++; if (!dd->quiet_mode) { - DMINFO("%s: badblock added at block %llu with write fail count %hhu", + DMINFO("%s: badblock added at block %llu with write fail count %u", __func__, block, wr_fail_cnt); } spin_unlock_irqrestore(&dd->dust_lock, flags); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index b24e3839bb3a..d9ac7372108c 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -47,6 +47,7 @@ struct writeset { static void writeset_free(struct writeset *ws) { vfree(ws->bits); + ws->bits = NULL; } static int setup_on_disk_bitset(struct dm_disk_bitset *info, @@ -71,8 +72,6 @@ static size_t bitset_size(unsigned nr_bits) */ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) { - ws->md.nr_bits = nr_blocks; - ws->md.root = INVALID_WRITESET_ROOT; ws->bits = vzalloc(bitset_size(nr_blocks)); if (!ws->bits) { DMERR("%s: couldn't allocate in memory bitset", __func__); @@ -85,12 +84,14 @@ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) /* * Wipes the in-core bitset, and creates a new on disk bitset. */ -static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) +static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws, + dm_block_t nr_blocks) { int r; - memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); + memset(ws->bits, 0, bitset_size(nr_blocks)); + ws->md.nr_bits = nr_blocks; r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); if (r) { DMERR("%s: setup_on_disk_bitset failed", __func__); @@ -134,7 +135,7 @@ static int writeset_test_and_set(struct dm_disk_bitset *info, { int r; - if (!test_and_set_bit(block, ws->bits)) { + if (!test_bit(block, ws->bits)) { r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); if (r) { /* FIXME: fail mode */ @@ -388,7 +389,7 @@ static void ws_dec(void *context, const void *value) static int ws_eq(void *context, const void *value1, const void *value2) { - return !memcmp(value1, value2, sizeof(struct writeset_metadata)); + return !memcmp(value1, value2, sizeof(struct writeset_disk)); } /*----------------------------------------------------------------*/ @@ -564,6 +565,15 @@ static int open_metadata(struct era_metadata *md) } disk = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk->data_block_size) != md->block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk->data_block_size), md->block_size); + r = -EINVAL; + goto bad; + } + r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, disk->metadata_space_map_root, sizeof(disk->metadata_space_map_root), @@ -575,10 +585,10 @@ static int open_metadata(struct era_metadata *md) setup_infos(md); - md->block_size = le32_to_cpu(disk->data_block_size); md->nr_blocks = le32_to_cpu(disk->nr_blocks); md->current_era = le32_to_cpu(disk->current_era); + ws_unpack(&disk->current_writeset, &md->current_writeset->md); md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); md->era_array_root = le64_to_cpu(disk->era_array_root); md->metadata_snap = le64_to_cpu(disk->metadata_snap); @@ -746,6 +756,12 @@ static int metadata_digest_lookup_writeset(struct era_metadata *md, ws_unpack(&disk, &d->writeset); d->value = cpu_to_le32(key); + /* + * We initialise another bitset info to avoid any caching side effects + * with the previous one. + */ + dm_disk_bitset_init(md->tm, &d->info); + d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); d->current_bit = 0; d->step = metadata_digest_transcribe_writeset; @@ -759,12 +775,6 @@ static int metadata_digest_start(struct era_metadata *md, struct digest *d) return 0; memset(d, 0, sizeof(*d)); - - /* - * We initialise another bitset info to avoid any caching side - * effects with the previous one. - */ - dm_disk_bitset_init(md->tm, &d->info); d->step = metadata_digest_lookup_writeset; return 0; @@ -802,6 +812,8 @@ static struct era_metadata *metadata_open(struct block_device *bdev, static void metadata_close(struct era_metadata *md) { + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); destroy_persistent_data_objects(md); kfree(md); } @@ -839,6 +851,7 @@ static int metadata_resize(struct era_metadata *md, void *arg) r = writeset_alloc(&md->writesets[1], *new_size); if (r) { DMERR("%s: writeset_alloc failed for writeset 1", __func__); + writeset_free(&md->writesets[0]); return r; } @@ -849,6 +862,8 @@ static int metadata_resize(struct era_metadata *md, void *arg) &value, &md->era_array_root); if (r) { DMERR("%s: dm_array_resize failed", __func__); + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); return r; } @@ -870,7 +885,6 @@ static int metadata_era_archive(struct era_metadata *md) } ws_pack(&md->current_writeset->md, &value); - md->current_writeset->md.root = INVALID_WRITESET_ROOT; keys[0] = md->current_era; __dm_bless_for_disk(&value); @@ -882,6 +896,7 @@ static int metadata_era_archive(struct era_metadata *md) return r; } + md->current_writeset->md.root = INVALID_WRITESET_ROOT; md->archived_writesets = true; return 0; @@ -898,7 +913,7 @@ static int metadata_new_era(struct era_metadata *md) int r; struct writeset *new_writeset = next_writeset(md); - r = writeset_init(&md->bitset_info, new_writeset); + r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks); if (r) { DMERR("%s: writeset_init failed", __func__); return r; @@ -951,7 +966,7 @@ static int metadata_commit(struct era_metadata *md) int r; struct dm_block *sblock; - if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, &md->current_writeset->md.root); if (r) { @@ -1225,8 +1240,10 @@ static void process_deferred_bios(struct era *era) int r; struct bio_list deferred_bios, marked_bios; struct bio *bio; + struct blk_plug plug; bool commit_needed = false; bool failed = false; + struct writeset *ws = era->md->current_writeset; bio_list_init(&deferred_bios); bio_list_init(&marked_bios); @@ -1236,9 +1253,11 @@ static void process_deferred_bios(struct era *era) bio_list_init(&era->deferred_bios); spin_unlock(&era->deferred_lock); + if (bio_list_empty(&deferred_bios)) + return; + while ((bio = bio_list_pop(&deferred_bios))) { - r = writeset_test_and_set(&era->md->bitset_info, - era->md->current_writeset, + r = writeset_test_and_set(&era->md->bitset_info, ws, get_block(era, bio)); if (r < 0) { /* @@ -1246,7 +1265,6 @@ static void process_deferred_bios(struct era *era) * FIXME: finish. */ failed = true; - } else if (r == 0) commit_needed = true; @@ -1262,9 +1280,19 @@ static void process_deferred_bios(struct era *era) if (failed) while ((bio = bio_list_pop(&marked_bios))) bio_io_error(bio); - else - while ((bio = bio_list_pop(&marked_bios))) + else { + blk_start_plug(&plug); + while ((bio = bio_list_pop(&marked_bios))) { + /* + * Only update the in-core writeset if the on-disk one + * was updated too. + */ + if (commit_needed) + set_bit(get_block(era, bio), ws->bits); submit_bio_noacct(bio); + } + blk_finish_plug(&plug); + } } static void process_rpc_calls(struct era *era) @@ -1473,15 +1501,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) } era->md = md; - era->nr_blocks = calc_nr_blocks(era); - - r = metadata_resize(era->md, &era->nr_blocks); - if (r) { - ti->error = "couldn't resize metadata"; - era_destroy(era); - return -ENOMEM; - } - era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); if (!era->wq) { ti->error = "could not create workqueue for metadata object"; @@ -1556,16 +1575,24 @@ static int era_preresume(struct dm_target *ti) dm_block_t new_size = calc_nr_blocks(era); if (era->nr_blocks != new_size) { - r = in_worker1(era, metadata_resize, &new_size); - if (r) + r = metadata_resize(era->md, &new_size); + if (r) { + DMERR("%s: metadata_resize failed", __func__); + return r; + } + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); return r; + } era->nr_blocks = new_size; } start_worker(era); - r = in_worker0(era, metadata_new_era); + r = in_worker0(era, metadata_era_rollover); if (r) { DMERR("%s: metadata_era_rollover failed", __func__); return r; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index a2cc9e45cbba..b7fee9936f05 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -469,6 +469,8 @@ static int flakey_report_zones(struct dm_target *ti, return blkdev_report_zones(fc->dev->bdev, sector, nr_zones, dm_report_zones_cb, args); } +#else +#define flakey_report_zones NULL #endif static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -481,10 +483,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ static struct target_type flakey_target = { .name = "flakey", .version = {1, 5, 0}, -#ifdef CONFIG_BLK_DEV_ZONED - .features = DM_TARGET_ZONED_HM, + .features = DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, .report_zones = flakey_report_zones, -#endif .module = THIS_MODULE, .ctr = flakey_ctr, .dtr = flakey_dtr, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index b64fede032dc..46b5d542b8fe 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -40,6 +40,7 @@ #define BITMAP_BLOCK_SIZE 4096 /* don't change it */ #define BITMAP_FLUSH_INTERVAL (10 * HZ) #define DISCARD_FILLER 0xf6 +#define SALT_SIZE 16 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -57,6 +58,7 @@ #define SB_VERSION_2 2 #define SB_VERSION_3 3 #define SB_VERSION_4 4 +#define SB_VERSION_5 5 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -72,12 +74,15 @@ struct superblock { __u8 log2_blocks_per_bitmap_bit; __u8 pad[2]; __u64 recalc_sector; + __u8 pad2[8]; + __u8 salt[SALT_SIZE]; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 #define SB_FLAG_DIRTY_BITMAP 0x4 #define SB_FLAG_FIXED_PADDING 0x8 +#define SB_FLAG_FIXED_HMAC 0x10 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -259,6 +264,7 @@ struct dm_integrity_c { bool recalculate_flag; bool discard; bool fix_padding; + bool fix_hmac; bool legacy_recalculate; struct alg_spec internal_hash_alg; @@ -389,8 +395,11 @@ static int dm_integrity_failed(struct dm_integrity_c *ic) static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic) { - if ((ic->internal_hash_alg.key || ic->journal_mac_alg.key) && - !ic->legacy_recalculate) + if (ic->legacy_recalculate) + return false; + if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ? + ic->internal_hash_alg.key || ic->journal_mac_alg.key : + ic->internal_hash_alg.key && !ic->journal_mac_alg.key) return true; return false; } @@ -477,7 +486,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) + ic->sb->version = SB_VERSION_5; + else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) ic->sb->version = SB_VERSION_4; else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) ic->sb->version = SB_VERSION_3; @@ -487,10 +498,58 @@ static void sb_set_version(struct dm_integrity_c *ic) ic->sb->version = SB_VERSION_1; } +static int sb_mac(struct dm_integrity_c *ic, bool wr) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned size = crypto_shash_digestsize(ic->journal_mac); + + if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) { + dm_integrity_io_error(ic, "digest is too long", -EINVAL); + return -EINVAL; + } + + desc->tfm = ic->journal_mac; + + r = crypto_shash_init(desc); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + return r; + } + + r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + return r; + } + + if (likely(wr)) { + r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + return r; + } + } else { + __u8 result[HASH_MAX_DIGESTSIZE]; + r = crypto_shash_final(desc, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + return r; + } + if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) { + dm_integrity_io_error(ic, "superblock mac", -EILSEQ); + return -EILSEQ; + } + } + + return 0; +} + static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) { struct dm_io_request io_req; struct dm_io_region io_loc; + int r; io_req.bi_op = op; io_req.bi_op_flags = op_flags; @@ -502,10 +561,28 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) io_loc.sector = ic->start; io_loc.count = SB_SECTORS; - if (op == REQ_OP_WRITE) + if (op == REQ_OP_WRITE) { sb_set_version(ic); + if (ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = sb_mac(ic, true); + if (unlikely(r)) + return r; + } + } - return dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) + return r; + + if (op == REQ_OP_READ) { + if (ic->mode != 'R' && ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = sb_mac(ic, false); + if (unlikely(r)) + return r; + } + } + + return 0; } #define BITMAP_OP_TEST_ALL_SET 0 @@ -722,15 +799,32 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result desc->tfm = ic->journal_mac; r = crypto_shash_init(desc); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_init", r); goto err; } + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + uint64_t section_le; + + r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + + section_le = cpu_to_le64(section); + r = crypto_shash_update(desc, (__u8 *)§ion_le, sizeof section_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + for (j = 0; j < ic->journal_section_entries; j++) { struct journal_entry *je = access_journal_entry(ic, section, j); r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); goto err; } @@ -740,7 +834,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result if (likely(size <= JOURNAL_MAC_SIZE)) { r = crypto_shash_final(desc, result); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_final", r); goto err; } @@ -753,7 +847,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result goto err; } r = crypto_shash_final(desc, digest); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_final", r); goto err; } @@ -1399,7 +1493,7 @@ static void flush_notify(unsigned long error, void *fr_) { struct flush_request *fr = fr_; if (unlikely(error != 0)) - dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO); + dm_integrity_io_error(fr->ic, "flushing disk cache", -EIO); complete(&fr->comp); } @@ -1556,6 +1650,14 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + } + r = crypto_shash_update(req, (const __u8 *)§or_le, sizeof sector_le); if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); @@ -3149,6 +3251,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0; arg_count += ic->legacy_recalculate; DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start, ic->tag_size, ic->mode, arg_count); @@ -3173,6 +3276,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, } if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) DMEMIT(" fix_padding"); + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0) + DMEMIT(" fix_hmac"); if (ic->legacy_recalculate) DMEMIT(" legacy_recalculate"); @@ -3310,6 +3415,11 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec if (!journal_sections) journal_sections = 1; + if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC); + get_random_bytes(ic->sb->salt, SALT_SIZE); + } + if (!ic->meta_dev) { if (ic->fix_padding) ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); @@ -3804,7 +3914,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned extra_args; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 16, "Invalid number of feature args"}, + {0, 17, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; @@ -3942,7 +4052,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad; } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) { - r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, + r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, "Invalid journal_mac argument"); if (r) goto bad; @@ -3952,6 +4062,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ic->discard = true; } else if (!strcmp(opt_string, "fix_padding")) { ic->fix_padding = true; + } else if (!strcmp(opt_string, "fix_hmac")) { + ic->fix_hmac = true; } else if (!strcmp(opt_string, "legacy_recalculate")) { ic->legacy_recalculate = true; } else { @@ -4110,7 +4222,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_4) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_5) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -4442,7 +4554,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 00774b5d7668..92db0f5e7f28 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -146,6 +146,8 @@ static int linear_report_zones(struct dm_target *ti, return blkdev_report_zones(lc->dev->bdev, sector, nr_zones, dm_report_zones_cb, args); } +#else +#define linear_report_zones NULL #endif static int linear_iterate_devices(struct dm_target *ti, @@ -227,13 +229,9 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, -#ifdef CONFIG_BLK_DEV_ZONED .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ZONED_HM, + DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, .report_zones = linear_report_zones, -#else - .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT, -#endif .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4acf2342f7ad..95391f78b8d5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -187,6 +187,8 @@ static void free_devices(struct list_head *devices, struct mapped_device *md) } } +static void dm_table_destroy_keyslot_manager(struct dm_table *t); + void dm_table_destroy(struct dm_table *t) { unsigned int i; @@ -215,6 +217,8 @@ void dm_table_destroy(struct dm_table *t) dm_free_md_mempools(t->mempools); + dm_table_destroy_keyslot_manager(t); + kfree(t); } @@ -820,24 +824,24 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) EXPORT_SYMBOL_GPL(dm_table_set_type); /* validate the dax capability of the target device span */ -int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { int blocksize = *(int *) data, id; bool rc; id = dax_read_lock(); - rc = dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len); + rc = !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len); dax_read_unlock(id); return rc; } /* Check devices support synchronous DAX */ -static int device_dax_synchronous(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - return dev->dax_dev && dax_synchronous(dev->dax_dev); + return !dev->dax_dev || !dax_synchronous(dev->dax_dev); } bool dm_table_supports_dax(struct dm_table *t, @@ -854,7 +858,7 @@ bool dm_table_supports_dax(struct dm_table *t, return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, iterate_fn, blocksize)) + ti->type->iterate_devices(ti, iterate_fn, blocksize)) return false; } @@ -925,7 +929,7 @@ static int dm_table_determine_type(struct dm_table *t) verify_bio_based: /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; - if (dm_table_supports_dax(t, device_supports_dax, &page_size) || + if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) || (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; } @@ -1203,6 +1207,210 @@ static int dm_table_register_integrity(struct dm_table *t) return 0; } +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +struct dm_keyslot_manager { + struct blk_keyslot_manager ksm; + struct mapped_device *md; +}; + +struct dm_keyslot_evict_args { + const struct blk_crypto_key *key; + int err; +}; + +static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_keyslot_evict_args *args = data; + int err; + + err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + if (!args->err) + args->err = err; + /* Always try to evict the key from all devices. */ + return 0; +} + +/* + * When an inline encryption key is evicted from a device-mapper device, evict + * it from all the underlying devices. + */ +static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, unsigned int slot) +{ + struct dm_keyslot_manager *dksm = container_of(ksm, + struct dm_keyslot_manager, + ksm); + struct mapped_device *md = dksm->md; + struct dm_keyslot_evict_args args = { key }; + struct dm_table *t; + int srcu_idx; + int i; + struct dm_target *ti; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return 0; + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); + } + dm_put_live_table(md, srcu_idx); + return args.err; +} + +static struct blk_ksm_ll_ops dm_ksm_ll_ops = { + .keyslot_evict = dm_keyslot_evict, +}; + +static int device_intersect_crypto_modes(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct blk_keyslot_manager *parent = data; + struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm; + + blk_ksm_intersect_modes(parent, child); + return 0; +} + +void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +{ + struct dm_keyslot_manager *dksm = container_of(ksm, + struct dm_keyslot_manager, + ksm); + + if (!ksm) + return; + + blk_ksm_destroy(ksm); + kfree(dksm); +} + +static void dm_table_destroy_keyslot_manager(struct dm_table *t) +{ + dm_destroy_keyslot_manager(t->ksm); + t->ksm = NULL; +} + +/* + * Constructs and initializes t->ksm with a keyslot manager that + * represents the common set of crypto capabilities of the devices + * described by the dm_table. However, if the constructed keyslot + * manager does not support a superset of the crypto capabilities + * supported by the current keyslot manager of the mapped_device, + * it returns an error instead, since we don't support restricting + * crypto capabilities on table changes. Finally, if the constructed + * keyslot manager doesn't actually support any crypto modes at all, + * it just returns NULL. + */ +static int dm_table_construct_keyslot_manager(struct dm_table *t) +{ + struct dm_keyslot_manager *dksm; + struct blk_keyslot_manager *ksm; + struct dm_target *ti; + unsigned int i; + bool ksm_is_empty = true; + + dksm = kmalloc(sizeof(*dksm), GFP_KERNEL); + if (!dksm) + return -ENOMEM; + dksm->md = t->md; + + ksm = &dksm->ksm; + blk_ksm_init_passthrough(ksm); + ksm->ksm_ll_ops = dm_ksm_ll_ops; + ksm->max_dun_bytes_supported = UINT_MAX; + memset(ksm->crypto_modes_supported, 0xFF, + sizeof(ksm->crypto_modes_supported)); + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (!dm_target_passes_crypto(ti->type)) { + blk_ksm_intersect_modes(ksm, NULL); + break; + } + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, device_intersect_crypto_modes, + ksm); + } + + if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) { + DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); + dm_destroy_keyslot_manager(ksm); + return -EINVAL; + } + + /* + * If the new KSM doesn't actually support any crypto modes, we may as + * well represent it with a NULL ksm. + */ + ksm_is_empty = true; + for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) { + if (ksm->crypto_modes_supported[i]) { + ksm_is_empty = false; + break; + } + } + + if (ksm_is_empty) { + dm_destroy_keyslot_manager(ksm); + ksm = NULL; + } + + /* + * t->ksm is only set temporarily while the table is being set + * up, and it gets set to NULL after the capabilities have + * been transferred to the request_queue. + */ + t->ksm = ksm; + + return 0; +} + +static void dm_update_keyslot_manager(struct request_queue *q, + struct dm_table *t) +{ + if (!t->ksm) + return; + + /* Make the ksm less restrictive */ + if (!q->ksm) { + blk_ksm_register(t->ksm, q); + } else { + blk_ksm_update_capabilities(q->ksm, t->ksm); + dm_destroy_keyslot_manager(t->ksm); + } + t->ksm = NULL; +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static int dm_table_construct_keyslot_manager(struct dm_table *t) +{ + return 0; +} + +void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +{ +} + +static void dm_table_destroy_keyslot_manager(struct dm_table *t) +{ +} + +static void dm_update_keyslot_manager(struct request_queue *q, + struct dm_table *t) +{ +} + +#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ + /* * Prepares the table for use by building the indices, * setting the type, and allocating mempools. @@ -1229,6 +1437,12 @@ int dm_table_complete(struct dm_table *t) return r; } + r = dm_table_construct_keyslot_manager(t); + if (r) { + DMERR("could not construct keyslot manager."); + return r; + } + r = dm_table_alloc_md_mempools(t, t->md); if (r) DMERR("unable to allocate mempools"); @@ -1295,6 +1509,46 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * type->iterate_devices() should be called when the sanity check needs to + * iterate and check all underlying data devices. iterate_devices() will + * iterate all underlying data devices until it encounters a non-zero return + * code, returned by whether the input iterate_devices_callout_fn, or + * iterate_devices() itself internally. + * + * For some target type (e.g. dm-stripe), one call of iterate_devices() may + * iterate multiple underlying devices internally, in which case a non-zero + * return code returned by iterate_devices_callout_fn will stop the iteration + * in advance. + * + * Cases requiring _any_ underlying device supporting some kind of attribute, + * should use the iteration structure like dm_table_any_dev_attr(), or call + * it directly. @func should handle semantics of positive examples, e.g. + * capable of something. + * + * Cases requiring _all_ underlying devices supporting some kind of attribute, + * should use the iteration structure like dm_table_supports_nowait() or + * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that + * uses an @anti_func that handle semantics of counter examples, e.g. not + * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); + */ +static bool dm_table_any_dev_attr(struct dm_table *t, + iterate_devices_callout_fn func, void *data) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, func, data)) + return true; + } + + return false; +} + static int count_device(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1331,13 +1585,13 @@ bool dm_table_has_no_data_devices(struct dm_table *table) return true; } -static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); enum blk_zoned_model *zoned_model = data; - return q && blk_queue_zoned_model(q) == *zoned_model; + return blk_queue_zoned_model(q) != *zoned_model; } static bool dm_table_supports_zoned_model(struct dm_table *t, @@ -1354,37 +1608,20 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model)) + ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model)) return false; } return true; } -static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); unsigned int *zone_sectors = data; - return q && blk_queue_zone_sectors(q) == *zone_sectors; -} - -static bool dm_table_matches_zone_sectors(struct dm_table *t, - unsigned int zone_sectors) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors)) - return false; - } - - return true; + return blk_queue_zone_sectors(q) != *zone_sectors; } static int validate_hardware_zoned_model(struct dm_table *table, @@ -1404,7 +1641,7 @@ static int validate_hardware_zoned_model(struct dm_table *table, if (!zone_sectors || !is_power_of_2(zone_sectors)) return -EINVAL; - if (!dm_table_matches_zone_sectors(table, zone_sectors)) { + if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) { DMERR("%s: zone sectors is not consistent across all devices", dm_device_name(table->md)); return -EINVAL; @@ -1533,7 +1770,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, unsigned long flush = (unsigned long) data; struct request_queue *q = bdev_get_queue(dev->bdev); - return q && (q->queue_flags & flush); + return (q->queue_flags & flush); } static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) @@ -1578,29 +1815,12 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int dm_table_supports_dax_write_cache(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, - device_dax_write_cache_enabled, NULL)) - return true; - } - - return false; -} - -static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_nonrot(q); + return !blk_queue_nonrot(q); } static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, @@ -1608,24 +1828,7 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_add_random(q); -} - -static bool dm_table_all_devices_attribute(struct dm_table *t, - iterate_devices_callout_fn func) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, func, NULL)) - return false; - } - - return true; + return !blk_queue_add_random(q); } static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1633,7 +1836,7 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !q->limits.max_write_same_sectors; + return !q->limits.max_write_same_sectors; } static bool dm_table_supports_write_same(struct dm_table *t) @@ -1660,7 +1863,7 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !q->limits.max_write_zeroes_sectors; + return !q->limits.max_write_zeroes_sectors; } static bool dm_table_supports_write_zeroes(struct dm_table *t) @@ -1687,7 +1890,7 @@ static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_nowait(q); + return !blk_queue_nowait(q); } static bool dm_table_supports_nowait(struct dm_table *t) @@ -1714,7 +1917,7 @@ static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_discard(q); + return !blk_queue_discard(q); } static bool dm_table_supports_discards(struct dm_table *t) @@ -1748,7 +1951,7 @@ static int device_not_secure_erase_capable(struct dm_target *ti, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_secure_erase(q); + return !blk_queue_secure_erase(q); } static bool dm_table_supports_secure_erase(struct dm_table *t) @@ -1776,28 +1979,7 @@ static int device_requires_stable_pages(struct dm_target *ti, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_stable_writes(q); -} - -/* - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ -static bool dm_table_requires_stable_pages(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_requires_stable_pages, NULL)) - return true; - } - - return false; + return blk_queue_stable_writes(q); } void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -1837,22 +2019,22 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (dm_table_supports_dax(t, device_supports_dax, &page_size)) { + if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) { blk_queue_flag_set(QUEUE_FLAG_DAX, q); - if (dm_table_supports_dax(t, device_dax_synchronous, NULL)) + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL)) set_dax_synchronous(t->md->dax_dev); } else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); - if (dm_table_supports_dax_write_cache(t)) + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_all_devices_attribute(t, device_is_nonrot)) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else + if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; @@ -1864,8 +2046,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. + * If any underlying device requires stable pages, a table must require + * them as well. Only targets that support iterate_devices are considered: + * don't want error, zero, etc to require stable pages. */ - if (dm_table_requires_stable_pages(t)) + if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); else blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); @@ -1876,7 +2061,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not * have it set. */ - if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + if (blk_queue_add_random(q) && + dm_table_any_dev_attr(t, device_is_not_random, NULL)) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); /* @@ -1891,6 +2077,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } #endif + dm_update_keyslot_manager(q, t); blk_queue_update_readahead(q); } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d5223a0e5cc5..844c4be11768 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -148,6 +148,7 @@ struct dm_writecache { size_t metadata_sectors; size_t n_blocks; uint64_t seq_count; + sector_t data_device_sectors; void *block_start; struct wc_entry *entries; unsigned block_size; @@ -159,14 +160,22 @@ struct dm_writecache { bool overwrote_committed:1; bool memory_vmapped:1; + bool start_sector_set:1; bool high_wm_percent_set:1; bool low_wm_percent_set:1; bool max_writeback_jobs_set:1; bool autocommit_blocks_set:1; bool autocommit_time_set:1; + bool max_age_set:1; bool writeback_fua_set:1; bool flush_on_suspend:1; bool cleaner:1; + bool cleaner_set:1; + + unsigned high_wm_percent_value; + unsigned low_wm_percent_value; + unsigned autocommit_time_value; + unsigned max_age_value; unsigned writeback_all; struct workqueue_struct *writeback_wq; @@ -523,7 +532,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) region.bdev = wc->ssd_dev->bdev; region.sector = 0; - region.count = PAGE_SIZE; + region.count = PAGE_SIZE >> SECTOR_SHIFT; if (unlikely(region.sector + region.count > wc->metadata_sectors)) region.count = wc->metadata_sectors - region.sector; @@ -969,6 +978,8 @@ static void writecache_resume(struct dm_target *ti) wc_lock(wc); + wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev); + if (WC_MODE_PMEM(wc)) { persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); } else { @@ -1638,6 +1649,10 @@ static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t void *address = memory_data(wc, e); persistent_memory_flush_cache(address, block_size); + + if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) + return true; + return bio_add_page(&wb->bio, persistent_memory_page(address), block_size, persistent_memory_page_offset(address)) != 0; } @@ -1709,6 +1724,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); + } else if (unlikely(!bio_sectors(bio))) { + bio->bi_status = BLK_STS_OK; + bio_endio(bio); } else { submit_bio(bio); } @@ -1752,6 +1770,14 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac e = f; } + if (unlikely(to.sector + to.count > wc->data_device_sectors)) { + if (to.sector >= wc->data_device_sectors) { + writecache_copy_endio(0, 0, c); + continue; + } + from.count = to.count = wc->data_device_sectors - to.sector; + } + dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); __writeback_throttle(wc, wbl); @@ -2004,8 +2030,7 @@ static void writecache_dtr(struct dm_target *ti) if (wc->ssd_dev) dm_put_device(ti, wc->ssd_dev); - if (wc->entries) - vfree(wc->entries); + vfree(wc->entries); if (wc->memory_map) { if (WC_MODE_PMEM(wc)) @@ -2020,8 +2045,7 @@ static void writecache_dtr(struct dm_target *ti) if (wc->dm_io) dm_io_client_destroy(wc->dm_io); - if (wc->dirty_bitmap) - vfree(wc->dirty_bitmap); + vfree(wc->dirty_bitmap); kfree(wc); } @@ -2205,6 +2229,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) goto invalid_optional; wc->start_sector = start_sector; + wc->start_sector_set = true; if (wc->start_sector != start_sector || wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) goto invalid_optional; @@ -2214,6 +2239,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; if (high_wm_percent < 0 || high_wm_percent > 100) goto invalid_optional; + wc->high_wm_percent_value = high_wm_percent; wc->high_wm_percent_set = true; } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; @@ -2221,6 +2247,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; if (low_wm_percent < 0 || low_wm_percent > 100) goto invalid_optional; + wc->low_wm_percent_value = low_wm_percent; wc->low_wm_percent_set = true; } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; @@ -2240,6 +2267,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (autocommit_msecs > 3600000) goto invalid_optional; wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); + wc->autocommit_time_value = autocommit_msecs; wc->autocommit_time_set = true; } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { unsigned max_age_msecs; @@ -2249,7 +2277,10 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (max_age_msecs > 86400000) goto invalid_optional; wc->max_age = msecs_to_jiffies(max_age_msecs); + wc->max_age_set = true; + wc->max_age_value = max_age_msecs; } else if (!strcasecmp(string, "cleaner")) { + wc->cleaner_set = true; wc->cleaner = true; } else if (!strcasecmp(string, "fua")) { if (WC_MODE_PMEM(wc)) { @@ -2455,7 +2486,6 @@ static void writecache_status(struct dm_target *ti, status_type_t type, struct dm_writecache *wc = ti->private; unsigned extra_args; unsigned sz = 0; - uint64_t x; switch (type) { case STATUSTYPE_INFO: @@ -2467,11 +2497,11 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', wc->dev->name, wc->ssd_dev->name, wc->block_size); extra_args = 0; - if (wc->start_sector) + if (wc->start_sector_set) extra_args += 2; - if (wc->high_wm_percent_set && !wc->cleaner) + if (wc->high_wm_percent_set) extra_args += 2; - if (wc->low_wm_percent_set && !wc->cleaner) + if (wc->low_wm_percent_set) extra_args += 2; if (wc->max_writeback_jobs_set) extra_args += 2; @@ -2479,37 +2509,29 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args += 2; if (wc->autocommit_time_set) extra_args += 2; - if (wc->max_age != MAX_AGE_UNSPECIFIED) + if (wc->max_age_set) extra_args += 2; - if (wc->cleaner) + if (wc->cleaner_set) extra_args++; if (wc->writeback_fua_set) extra_args++; DMEMIT("%u", extra_args); - if (wc->start_sector) + if (wc->start_sector_set) DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); - if (wc->high_wm_percent_set && !wc->cleaner) { - x = (uint64_t)wc->freelist_high_watermark * 100; - x += wc->n_blocks / 2; - do_div(x, (size_t)wc->n_blocks); - DMEMIT(" high_watermark %u", 100 - (unsigned)x); - } - if (wc->low_wm_percent_set && !wc->cleaner) { - x = (uint64_t)wc->freelist_low_watermark * 100; - x += wc->n_blocks / 2; - do_div(x, (size_t)wc->n_blocks); - DMEMIT(" low_watermark %u", 100 - (unsigned)x); - } + if (wc->high_wm_percent_set) + DMEMIT(" high_watermark %u", wc->high_wm_percent_value); + if (wc->low_wm_percent_set) + DMEMIT(" low_watermark %u", wc->low_wm_percent_value); if (wc->max_writeback_jobs_set) DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); if (wc->autocommit_blocks_set) DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); if (wc->autocommit_time_set) - DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); - if (wc->max_age != MAX_AGE_UNSPECIFIED) - DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age)); - if (wc->cleaner) + DMEMIT(" autocommit_time %u", wc->autocommit_time_value); + if (wc->max_age_set) + DMEMIT(" max_age %u", wc->max_age_value); + if (wc->cleaner_set) DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); @@ -2519,7 +2541,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 479ec5bea09e..50b693d776d6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -28,6 +28,7 @@ #include <linux/refcount.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> +#include <linux/keyslot-manager.h> #define DM_MSG_PREFIX "core" @@ -105,12 +106,16 @@ struct dm_io { struct dm_target_io tio; }; +#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) +#define DM_IO_BIO_OFFSET \ + (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) + void *dm_per_bio_data(struct bio *bio, size_t data_size) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); if (!tio->inside_dm_io) - return (char *)bio - offsetof(struct dm_target_io, clone) - data_size; - return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size; + return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; + return (char *)bio - DM_IO_BIO_OFFSET - data_size; } EXPORT_SYMBOL_GPL(dm_per_bio_data); @@ -118,9 +123,9 @@ struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) { struct dm_io *io = (struct dm_io *)((char *)data + data_size); if (io->magic == DM_IO_MAGIC) - return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone)); + return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); BUG_ON(io->magic != DM_TIO_MAGIC); - return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone)); + return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); } EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); @@ -148,6 +153,16 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; +#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) +static int swap_bios = DEFAULT_SWAP_BIOS; +static int get_swap_bios(void) +{ + int latch = READ_ONCE(swap_bios); + if (unlikely(latch <= 0)) + latch = DEFAULT_SWAP_BIOS; + return latch; +} + /* * For mempools pre-allocation at the table loading time. */ @@ -969,6 +984,11 @@ void disable_write_zeroes(struct mapped_device *md) limits->max_write_zeroes_sectors = 0; } +static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) +{ + return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); +} + static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; @@ -1020,6 +1040,11 @@ static void clone_endio(struct bio *bio) } } + if (unlikely(swap_bios_limit(tio->ti, bio))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } + free_tio(tio); dec_pending(io, error); } @@ -1129,7 +1154,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd if (!map) goto out; - ret = dm_table_supports_dax(map, device_supports_dax, &blocksize); + ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize); out: dm_put_live_table(md, srcu_idx); @@ -1253,6 +1278,22 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) +{ + mutex_lock(&md->swap_bios_lock); + while (latch < md->swap_bios) { + cond_resched(); + down(&md->swap_bios_semaphore); + md->swap_bios--; + } + while (latch > md->swap_bios) { + cond_resched(); + up(&md->swap_bios_semaphore); + md->swap_bios++; + } + mutex_unlock(&md->swap_bios_lock); +} + static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; @@ -1272,6 +1313,14 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) atomic_inc(&io->io_count); sector = clone->bi_iter.bi_sector; + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + int latch = get_swap_bios(); + if (unlikely(latch != md->swap_bios)) + __set_swap_bios_limit(md, latch); + down(&md->swap_bios_semaphore); + } + r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: @@ -1282,10 +1331,18 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } free_tio(tio); dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } free_tio(tio); dec_pending(io, BLK_STS_DM_REQUEUE); break; @@ -1718,6 +1775,19 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); +#ifdef CONFIG_BLK_INLINE_ENCRYPTION +static void dm_queue_destroy_keyslot_manager(struct request_queue *q) +{ + dm_destroy_keyslot_manager(q->ksm); +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q) +{ +} +#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ + static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) @@ -1739,14 +1809,17 @@ static void cleanup_mapped_device(struct mapped_device *md) put_disk(md->disk); } - if (md->queue) + if (md->queue) { + dm_queue_destroy_keyslot_manager(md->queue); blk_cleanup_queue(md->queue); + } cleanup_srcu_struct(&md->io_barrier); mutex_destroy(&md->suspend_lock); mutex_destroy(&md->type_lock); mutex_destroy(&md->table_devices_lock); + mutex_destroy(&md->swap_bios_lock); dm_mq_cleanup_mapped_device(md); } @@ -1814,6 +1887,10 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->swap_bios = get_swap_bios(); + sema_init(&md->swap_bios_semaphore, md->swap_bios); + mutex_init(&md->swap_bios_lock); + md->disk->major = _major; md->disk->first_minor = minor; md->disk->fops = &dm_blk_dops; @@ -2849,8 +2926,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); - front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio); + front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; + io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); if (ret) goto out; @@ -3097,6 +3174,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); +module_param(swap_bios, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index fffe1e289c53..b441ad772c18 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -73,7 +73,7 @@ void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn, int *blocksize); -int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data); void dm_lock_md_type(struct mapped_device *md); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 564896659dd4..fe073d92f01e 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -100,7 +100,7 @@ struct dm_block *shadow_parent(struct shadow_spine *s); int shadow_has_parent(struct shadow_spine *s); -int shadow_root(struct shadow_spine *s); +dm_block_t shadow_root(struct shadow_spine *s); /* * Some inlines. diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index e03cb9e48773..8a2bfbfb218b 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -235,7 +235,7 @@ int shadow_has_parent(struct shadow_spine *s) return s->count >= 2; } -int shadow_root(struct shadow_spine *s) +dm_block_t shadow_root(struct shadow_spine *s) { return s->root; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 61a66fb8ebb3..7f4ac87c0b32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -93,9 +93,18 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv, typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); +#ifdef CONFIG_BLK_DEV_ZONED typedef int (*dm_report_zones_fn) (struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones); +#else +/* + * Define dm_report_zones_fn so that targets can assign to NULL if + * CONFIG_BLK_DEV_ZONED disabled. Otherwise each target needs to do + * awkward #ifdefs in their target_type, etc. + */ +typedef int (*dm_report_zones_fn) (struct dm_target *dummy); +#endif /* * These iteration functions are typically used to check (and combine) @@ -187,9 +196,7 @@ struct target_type { dm_status_fn status; dm_message_fn message; dm_prepare_ioctl_fn prepare_ioctl; -#ifdef CONFIG_BLK_DEV_ZONED dm_report_zones_fn report_zones; -#endif dm_busy_fn busy; dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; @@ -248,8 +255,13 @@ struct target_type { /* * Indicates that a target supports host-managed zoned block devices. */ +#ifdef CONFIG_BLK_DEV_ZONED #define DM_TARGET_ZONED_HM 0x00000040 #define dm_target_supports_zoned_hm(type) ((type)->features & DM_TARGET_ZONED_HM) +#else +#define DM_TARGET_ZONED_HM 0x00000000 +#define dm_target_supports_zoned_hm(type) (false) +#endif /* * A target handles REQ_NOWAIT @@ -257,6 +269,12 @@ struct target_type { #define DM_TARGET_NOWAIT 0x00000080 #define dm_target_supports_nowait(type) ((type)->features & DM_TARGET_NOWAIT) +/* + * A target supports passing through inline crypto support. + */ +#define DM_TARGET_PASSES_CRYPTO 0x00000100 +#define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO) + struct dm_target { struct dm_table *table; struct target_type *type; @@ -325,6 +343,11 @@ struct dm_target { * whether or not its underlying devices have support. */ bool discards_supported:1; + + /* + * Set if we need to limit the number of in-flight bios when swapping. + */ + bool limit_swap_bios:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); @@ -534,6 +557,11 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *t); /* + * Table keyslot manager functions + */ +void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm); + +/* * A wrapper around vmalloc. */ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h index 443ad817c6c5..a27605e2f826 100644 --- a/include/linux/keyslot-manager.h +++ b/include/linux/keyslot-manager.h @@ -106,4 +106,15 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); void blk_ksm_destroy(struct blk_keyslot_manager *ksm); +void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, + const struct blk_keyslot_manager *child); + +void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); + +bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, + struct blk_keyslot_manager *ksm_subset); + +void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, + struct blk_keyslot_manager *reference_ksm); + #endif /* __LINUX_KEYSLOT_MANAGER_H */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 4933b6b67b85..fcff6669137b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -272,9 +272,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 43 +#define DM_VERSION_MINOR 44 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2020-10-01)" +#define DM_VERSION_EXTRA "-ioctl (2021-02-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ |