diff options
-rw-r--r-- | Documentation/device-mapper/unstriped.txt | 124 | ||||
-rw-r--r-- | drivers/md/Kconfig | 7 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-unstripe.c | 225 |
4 files changed, 357 insertions, 0 deletions
diff --git a/Documentation/device-mapper/unstriped.txt b/Documentation/device-mapper/unstriped.txt new file mode 100644 index 000000000000..0b2a306c54ee --- /dev/null +++ b/Documentation/device-mapper/unstriped.txt @@ -0,0 +1,124 @@ +Introduction +============ + +The device-mapper "unstriped" target provides a transparent mechanism to +unstripe a device-mapper "striped" target to access the underlying disks +without having to touch the true backing block-device. It can also be +used to unstripe a hardware RAID-0 to access backing disks. + +Parameters: +<number of stripes> <chunk size> <stripe #> <dev_path> <offset> + +<number of stripes> + The number of stripes in the RAID 0. + +<chunk size> + The number of 512B sectors in each chunk of the striping. + +<dev_path> + The block device you wish to unstripe. + +<stripe #> + The stripe number within the device that corresponds to the physical + drive you wish to unstripe. This must be 0 indexed. + +<offset> + The offset, in 512B sectors, into <dev_path> at which the striped + data begins. + + +Why use this module? +==================== + +An example of undoing an existing dm-stripe +------------------------------------------- + +This small bash script will set up 4 loop devices and use the existing +striped target to combine the 4 devices into one. It then will use +the unstriped target on top of the striped device to access the +individual backing loop devices. We write data to the newly exposed +unstriped devices and verify the data written matches the correct +underlying device on the striped array. 
+ +#!/bin/bash + +MEMBER_SIZE=$((128 * 1024 * 1024)) +NUM=4 +SEQ_END=$((${NUM}-1)) +CHUNK=256 +BS=4096 + +RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) +DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" +COUNT=$((${MEMBER_SIZE} / ${BS})) + +for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct + losetup /dev/loop${i} member-${i} + DM_PARMS+=" /dev/loop${i} 0" +done + +echo $DM_PARMS | dmsetup create raid0 +for i in $(seq 0 ${SEQ_END}); do + echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} +done; + +for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct + diff /dev/mapper/set-${i} member-${i} +done; + +for i in $(seq 0 ${SEQ_END}); do + dmsetup remove set-${i} +done + +dmsetup remove raid0 + +for i in $(seq 0 ${SEQ_END}); do + losetup -d /dev/loop${i} + rm -f member-${i} +done + +Another example +--------------- + +Intel NVMe drives contain two cores on the physical device. +Each core of the drive has segregated access to its LBA range. +The current LBA model has a RAID 0 128k chunk on each core, resulting +in a 256k stripe across the two cores: + + Core 0: Core 1: + __________ __________ + | LBA 512| | LBA 768| + | LBA 0 | | LBA 256| + ---------- ---------- + +The purpose of this unstriping is to provide better QoS in noisy +neighbor environments. When two partitions are created on the +aggregate drive without this unstriping, reads on one partition +can affect writes on another partition. This is because the partitions +are striped across the two cores. When we unstripe this hardware RAID 0 +and make partitions on each new exposed device the two partitions are now +physically separated. + +With the dm-unstriped target we're able to segregate an fio script that +has read and write jobs that are independent of each other. 
Compared to +when we run the test on a combined drive with partitions, we were able +to get a 92% reduction in read latency using this device mapper target. + + +Example dmsetup usage +===================== + +unstriped on top of Intel NVMe device that has 2 cores +------------------------------------------------------ +dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' +dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' + +There will now be two devices that expose Intel NVMe core 0 and 1 +respectively: +/dev/mapper/nvmset0 +/dev/mapper/nvmset1 + +unstriped on top of striped with 4 drives using 128K chunk size +--------------------------------------------------------------- +dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' +dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' +dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' +dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 83b9362be09c..2c8ac3688815 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,13 @@ config DM_BIO_PRISON source "drivers/md/persistent-data/Kconfig" +config DM_UNSTRIPED + tristate "Unstriped target" + depends on BLK_DEV_DM + ---help--- + Unstripes I/O so it is issued solely on a single drive in a HW + RAID0 or dm-striped target. 
+ config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f701bb211783..63255f3ebd97 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o +obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c new file mode 100644 index 000000000000..061b4f10bf5c --- /dev/null +++ b/drivers/md/dm-unstripe.c @@ -0,0 +1,225 @@ +/* + * Copyright (C) 2017 Intel Corporation. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/device-mapper.h> + +struct unstripe_c { /* per-target context, stored in ti->private by the constructor */ + struct dm_dev *dev; /* striped device, obtained with dm_get_device() */ + sector_t physical_start; /* <offset> argument; added to every remapped sector */ + + uint32_t stripes; /* <number of stripes> argument, never zero */ + + uint32_t unstripe; /* <stripe #> argument: the stripe this target exposes */ + sector_t unstripe_width; /* (stripes - 1) * chunk_size */ + sector_t unstripe_offset; /* unstripe * chunk_size */ + + uint32_t chunk_size; /* <chunk size> argument in 512B sectors, a power of 2 */ + u8 chunk_shift; /* fls(chunk_size) - 1 */ +}; + +#define DM_MSG_PREFIX "unstriped" + /* Drop the device reference if one is held, then free the context. */ +static void cleanup_unstripe(struct unstripe_c *uc, struct dm_target *ti) +{ + if (uc->dev) + dm_put_device(ti, uc->dev); + kfree(uc); +} + +/* + * Construct an unstriped mapping. 
+ * <number of stripes> <chunk size> <stripe #> <dev_path> <offset> + */ +static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct unstripe_c *uc; + sector_t width, tmp_len; + unsigned long long start; + char dummy; + + if (argc != 5) { + ti->error = "Invalid number of arguments"; + return -EINVAL; + } + + uc = kzalloc(sizeof(*uc), GFP_KERNEL); + if (!uc) { + ti->error = "Memory allocation for unstriped context failed"; + return -ENOMEM; + } + + if (kstrtouint(argv[0], 10, &uc->stripes) || !uc->stripes) { + ti->error = "Invalid stripe count"; + goto err; + } + + if (kstrtouint(argv[1], 10, &uc->chunk_size) || !uc->chunk_size) { + ti->error = "Invalid chunk_size"; + goto err; + } + + // FIXME: must support non power of 2 chunk_size, dm-stripe.c does + if (!is_power_of_2(uc->chunk_size)) { + ti->error = "Non power of 2 chunk_size is not supported yet"; + goto err; + } + + if (kstrtouint(argv[2], 10, &uc->unstripe)) { + ti->error = "Invalid stripe number"; + goto err; + } + + if (uc->unstripe > uc->stripes && uc->stripes > 1) { + ti->error = "Please provide stripe between [0, # of stripes]"; + goto err; + } + + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &uc->dev)) { + ti->error = "Couldn't get striped device"; + goto err; + } + + if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1) { + ti->error = "Invalid striped device offset"; + goto err; + } + uc->physical_start = start; + + uc->unstripe_offset = uc->unstripe * uc->chunk_size; + uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; + uc->chunk_shift = fls(uc->chunk_size) - 1; + + width = ti->len; + if (sector_div(width, uc->stripes)) { + ti->error = "Target length not divisible by number of stripes"; + goto err; + } + + tmp_len = width; + if (sector_div(tmp_len, uc->chunk_size)) { + ti->error = "Target length not divisible by chunk size"; + goto err; + } + + if (dm_set_target_max_io_len(ti, uc->chunk_size)) { + ti->error = "Failed to set max io len"; + goto err; 
+ } + + ti->private = uc; + return 0; +err: + cleanup_unstripe(uc, ti); + return -EINVAL; +} + +static void unstripe_dtr(struct dm_target *ti) +{ + struct unstripe_c *uc = ti->private; + + cleanup_unstripe(uc, ti); +} + +static sector_t map_to_core(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + sector_t sector = bio->bi_iter.bi_sector; + + /* Shift us up to the right "row" on the stripe */ + sector += uc->unstripe_width * (sector >> uc->chunk_shift); + + /* Account for what stripe we're operating on */ + sector += uc->unstripe_offset; + + return sector; +} + +static int unstripe_map(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + + bio_set_dev(bio, uc->dev->bdev); + bio->bi_iter.bi_sector = map_to_core(ti, bio) + uc->physical_start; + + return DM_MAPIO_REMAPPED; +} + +static void unstripe_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct unstripe_c *uc = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + break; + + case STATUSTYPE_TABLE: + DMEMIT("%d %llu %d %s %llu", + uc->stripes, (unsigned long long)uc->chunk_size, uc->unstripe, + uc->dev->name, (unsigned long long)uc->physical_start); + break; + } +} + +static int unstripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct unstripe_c *uc = ti->private; + + return fn(ti, uc->dev, uc->physical_start, ti->len, data); +} + +static void unstripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct unstripe_c *uc = ti->private; + + limits->chunk_sectors = uc->chunk_size; +} + +static struct target_type unstripe_target = { + .name = "unstriped", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = unstripe_ctr, + .dtr = unstripe_dtr, + .map = unstripe_map, + .status = unstripe_status, + .iterate_devices = unstripe_iterate_devices, + .io_hints = unstripe_io_hints, +}; + 
+static int __init dm_unstripe_init(void) +{ + int r; + + r = dm_register_target(&unstripe_target); + if (r < 0) + DMERR("target registration failed"); + + return r; +} + +static void __exit dm_unstripe_exit(void) +{ + dm_unregister_target(&unstripe_target); +} + +module_init(dm_unstripe_init); +module_exit(dm_unstripe_exit); + +MODULE_DESCRIPTION(DM_NAME " unstriped target"); +MODULE_AUTHOR("Scott Bauer <scott.bauer@intel.com>"); +MODULE_LICENSE("GPL"); |