author     Linus Torvalds  2024-01-11 13:58:04 -0800
committer  Linus Torvalds  2024-01-11 13:58:04 -0800
commit     01d550f0fcc06c7292f79a6f1453aac122d1d2c8 (patch)
tree       58b58ac1cb833af0469b1942774a382633bc6cda /drivers/md
parent     d05e626603d57936314816433db8bf1d34b5a504 (diff)
parent     587371ed783b046f22ba7a5e1cc9a19ae35123b4 (diff)
Merge tag 'for-6.8/block-2024-01-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
"Pretty quiet round this time around. This contains:
- NVMe updates via Keith:
- nvme fabrics spec updates (Guixin, Max)
- nvme target updates (Guixin, Evan)
- nvme attribute refactoring (Daniel)
- nvme-fc numa fix (Keith)
- MD updates via Song:
- Fix/Cleanup RCU usage from conf->disks[i].rdev (Yu Kuai)
- Fix raid5 hang issue (Junxiao Bi)
- Add Yu Kuai as Reviewer of the md subsystem
- Remove deprecated flavors (Song Liu)
- raid1 read error check support (Li Nan; decay rule sketched below)
- Better handle events off-by-1 case (Alex Lyakas; sketched below)
- Efficiency improvements for passthrough (Kundan)
- Support for mapping integrity data directly (Keith)
- Zoned write fix (Damien)
- rnbd fixes (Kees, Santosh, Supriti)
- Default to a sane discard size granularity (Christoph)
- Make the default max transfer size naming less confusing
(Christoph)
- Remove support for deprecated host aware zoned model (Christoph)
- Misc fixes (me, Li, Matthew, Min, Ming, Randy, liyouhong, Daniel,
Bart, Christoph)"
* tag 'for-6.8/block-2024-01-08' of git://git.kernel.dk/linux: (78 commits)
block: Treat sequential write preferred zone type as invalid
block: remove disk_clear_zoned
sd: remove the !ZBC && blk_queue_is_zoned case in sd_read_block_characteristics
drivers/block/xen-blkback/common.h: Fix spelling typo in comment
blk-cgroup: fix rcu lockdep warning in blkg_lookup()
blk-cgroup: don't use removal safe list iterators
block: floor the discard granularity to the physical block size
mtd_blkdevs: use the default discard granularity
bcache: use the default discard granularity
zram: use the default discard granularity
null_blk: use the default discard granularity
nbd: use the default discard granularity
ubd: use the default discard granularity
block: default the discard granularity to sector size
bcache: discard_granularity should not be smaller than a sector
block: remove two comments in bio_split_discard
block: rename and document BLK_DEF_MAX_SECTORS
loop: don't abuse BLK_DEF_MAX_SECTORS
aoe: don't abuse BLK_DEF_MAX_SECTORS
null_blk: don't cap max_hw_sectors to BLK_DEF_MAX_SECTORS
...
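Several entries above ("block: default the discard granularity to sector
size", "block: floor the discard granularity to the physical block size",
and the per-driver "use the default discard granularity" patches) move the
hand-rolled 512-byte defaults out of individual drivers and into the block
core. A minimal sketch of the combined rule as those subjects describe it
(the helper name is hypothetical):

    #include <stdint.h>

    static uint32_t effective_discard_granularity(uint32_t requested,
                                                  uint32_t logical_block_size,
                                                  uint32_t physical_block_size)
    {
            /* an unset granularity defaults to the logical sector size */
            uint32_t g = requested ? requested : logical_block_size;

            /* and is floored to the physical block size */
            return g < physical_block_size ? physical_block_size : g;
    }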
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig              |   34
-rw-r--r--  drivers/md/Makefile             |   10
-rw-r--r--  drivers/md/bcache/super.c       |    1
-rw-r--r--  drivers/md/dm-kcopyd.c          |    2
-rw-r--r--  drivers/md/dm-table.c           |   45
-rw-r--r--  drivers/md/dm-zoned-metadata.c  |    7
-rw-r--r--  drivers/md/dm-zoned-target.c    |    4
-rw-r--r--  drivers/md/md-autodetect.c      |    8
-rw-r--r--  drivers/md/md-faulty.c          |  365
-rw-r--r--  drivers/md/md-linear.c          |  318
-rw-r--r--  drivers/md/md-multipath.c       |  471
-rw-r--r--  drivers/md/md.c                 |  305
-rw-r--r--  drivers/md/md.h                 |    5
-rw-r--r--  drivers/md/raid1-10.c           |   54
-rw-r--r--  drivers/md/raid1.c              |   91
-rw-r--r--  drivers/md/raid10.c             |  271
-rw-r--r--  drivers/md/raid5-cache.c        |   11
-rw-r--r--  drivers/md/raid5-ppl.c          |   16
-rw-r--r--  drivers/md/raid5.c              |  203
-rw-r--r--  drivers/md/raid5.h              |    4
20 files changed, 400 insertions, 1825 deletions
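The dm-table, dm-kcopyd and dm-zoned hunks in the diff below all follow
from the removal of the host-aware zoned model: the tri-state
enum blk_zoned_model collapses into a boolean, so callers simply test
bdev_is_zoned(). A stand-alone illustration of the new-style check (the
struct and helpers here are stand-ins for the kernel's, for sketching
only):

    #include <stdbool.h>

    struct blockdev { bool zoned; };   /* stand-in for struct block_device */

    /* stand-in for the kernel's bdev_is_zoned() */
    static bool bdev_is_zoned(const struct blockdev *b)
    {
            return b->zoned;
    }

    /* mirrors the dm-kcopyd hunk: force sequential writes when any
     * destination is zoned (previously: zoned model == BLK_ZONED_HM) */
    static bool needs_sequential_writes(const struct blockdev *dests, int n)
    {
            for (int i = 0; i < n; i++)
                    if (bdev_is_zoned(&dests[i]))
                            return true;
            return false;
    }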
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3ff87cb4dc49..a743e2c572fc 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -61,19 +61,6 @@ config MD_BITMAP_FILE various kernel APIs and can only work with files on a file system not actually sitting on the MD device. -config MD_LINEAR - tristate "Linear (append) mode (deprecated)" - depends on BLK_DEV_MD - help - If you say Y here, then your multiple devices driver will be able to - use the so-called linear mode, i.e. it will combine the hard disk - partitions by simply appending one to the other. - - To compile this as a module, choose M here: the module - will be called linear. - - If unsure, say Y. - config MD_RAID0 tristate "RAID-0 (striping) mode" depends on BLK_DEV_MD @@ -172,27 +159,6 @@ config MD_RAID456 If unsure, say Y. -config MD_MULTIPATH - tristate "Multipath I/O support (deprecated)" - depends on BLK_DEV_MD - help - MD_MULTIPATH provides a simple multi-path personality for use - the MD framework. It is not under active development. New - projects should consider using DM_MULTIPATH which has more - features and more testing. - - If unsure, say N. - -config MD_FAULTY - tristate "Faulty test module for MD (deprecated)" - depends on BLK_DEV_MD - help - The "faulty" module allows for a block device that occasionally returns - read or write errors. It is useful for testing. - - In unsure, say N. - - config MD_CLUSTER tristate "Cluster Support for MD" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 84291e38dca8..027d7cfeca3f 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,22 +29,16 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o -linear-y += md-linear.o -multipath-y += md-multipath.o -faulty-y += md-faulty.o # Note: link order is important. All raid personalities -# and must come before md.o, as they each initialise -# themselves, and md.o may use the personalities when it +# and must come before md.o, as they each initialise +# themselves, and md.o may use the personalities when it # auto-initialised. 
-obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o -obj-$(CONFIG_MD_MULTIPATH) += multipath.o -obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_MD_CLUSTER) += md-cluster.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1402096b8076..dc3f50f69714 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -954,7 +954,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, q->limits.max_segment_size = UINT_MAX; q->limits.max_segments = BIO_MAX_VECS; blk_queue_max_discard_sectors(q, UINT_MAX); - q->limits.discard_granularity = 512; q->limits.io_min = block_size; q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d01807c50f20..36bcfdccae04 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -807,7 +807,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, */ if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { for (i = 0; i < job->num_dests; i++) { - if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) { + if (bdev_is_zoned(dests[i].bdev)) { job->flags |= BIT(DM_KCOPYD_WRITE_SEQ); break; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 198d38b53322..260b5b8f2b0d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1579,21 +1579,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t) return true; } -static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - enum blk_zoned_model *zoned_model = data; + bool *zoned = data; - return blk_queue_zoned_model(q) != *zoned_model; + return bdev_is_zoned(dev->bdev) != *zoned; } static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return blk_queue_zoned_model(q) != BLK_ZONED_NONE; + return bdev_is_zoned(dev->bdev); } /* @@ -1603,8 +1600,7 @@ static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any * zoned model with all zoned devices having the same zone size. */ -static bool dm_table_supports_zoned_model(struct dm_table *t, - enum blk_zoned_model zoned_model) +static bool dm_table_supports_zoned(struct dm_table *t, bool zoned) { for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1623,11 +1619,11 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, if (dm_target_supports_zoned_hm(ti->type)) { if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_zoned_model, - &zoned_model)) + ti->type->iterate_devices(ti, device_not_zoned, + &zoned)) return false; } else if (!dm_target_supports_mixed_zoned_model(ti->type)) { - if (zoned_model == BLK_ZONED_HM) + if (zoned) return false; } } @@ -1650,14 +1646,13 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev * * zone sectors, if the destination device is a zoned block device, it shall * have the specified zone_sectors. 
*/ -static int validate_hardware_zoned_model(struct dm_table *t, - enum blk_zoned_model zoned_model, - unsigned int zone_sectors) +static int validate_hardware_zoned(struct dm_table *t, bool zoned, + unsigned int zone_sectors) { - if (zoned_model == BLK_ZONED_NONE) + if (!zoned) return 0; - if (!dm_table_supports_zoned_model(t, zoned_model)) { + if (!dm_table_supports_zoned(t, zoned)) { DMERR("%s: zoned model is not consistent across all devices", dm_device_name(t->md)); return -EINVAL; @@ -1683,8 +1678,8 @@ int dm_calculate_queue_limits(struct dm_table *t, struct queue_limits *limits) { struct queue_limits ti_limits; - enum blk_zoned_model zoned_model = BLK_ZONED_NONE; unsigned int zone_sectors = 0; + bool zoned = false; blk_set_stacking_limits(limits); @@ -1706,12 +1701,12 @@ int dm_calculate_queue_limits(struct dm_table *t, ti->type->iterate_devices(ti, dm_set_device_limits, &ti_limits); - if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { + if (!zoned && ti_limits.zoned) { /* * After stacking all limits, validate all devices * in table support this zoned model and zone sectors. */ - zoned_model = ti_limits.zoned; + zoned = ti_limits.zoned; zone_sectors = ti_limits.chunk_sectors; } @@ -1744,18 +1739,18 @@ combine_limits: * Verify that the zoned model and zone sectors, as determined before * any .io_hints override, are the same across all devices in the table. * - this is especially relevant if .io_hints is emulating a disk-managed - * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices. + * zoned model on host-managed zoned block devices. * BUT... */ - if (limits->zoned != BLK_ZONED_NONE) { + if (limits->zoned) { /* * ...IF the above limits stacking determined a zoned model * validate that all of the table's devices conform to it. */ - zoned_model = limits->zoned; + zoned = limits->zoned; zone_sectors = limits->chunk_sectors; } - if (validate_hardware_zoned_model(t, zoned_model, zone_sectors)) + if (validate_hardware_zoned(t, zoned, zone_sectors)) return -EINVAL; return validate_hardware_logical_block_alignment(t, limits); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 60a4dc01ea18..fdfe30f7b697 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2836,12 +2836,11 @@ static void dmz_print_dev(struct dmz_metadata *zmd, int num) { struct dmz_dev *dev = &zmd->dev[num]; - if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) + if (!bdev_is_zoned(dev->bdev)) dmz_dev_info(dev, "Regular block device"); else - dmz_dev_info(dev, "Host-%s zoned block device", - bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? 
- "aware" : "managed"); + dmz_dev_info(dev, "Host-managed zoned block device"); + if (zmd->sb_version > 1) { sector_t sector_offset = dev->zone_offset << zmd->zone_nr_sectors_shift; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b487f7acc860..621794a9edd6 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -702,7 +702,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path, } bdev = ddev->bdev; - if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) { + if (!bdev_is_zoned(bdev)) { if (nr_devs == 1) { ti->error = "Invalid regular device"; goto err; @@ -1010,7 +1010,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_sectors = chunk_sectors; /* We are exposing a drive-managed zoned block device */ - limits->zoned = BLK_ZONED_NONE; + limits->zoned = false; } /* diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 4b80165afd23..b2a00f213c2c 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -49,7 +49,6 @@ static int md_setup_ents __initdata; * instead of just one. -- KTK * 18May2000: Added support for persistent-superblock arrays: * md=n,0,factor,fault,device-list uses RAID0 for device n - * md=n,-1,factor,fault,device-list uses LINEAR for device n * md=n,device-list reads a RAID superblock from the devices * elements in device-list are read by name_to_kdev_t so can be * a hex number or something like /dev/hda1 /dev/sdb @@ -88,7 +87,7 @@ static int __init md_setup(char *str) md_setup_ents++; switch (get_option(&str, &level)) { /* RAID level */ case 2: /* could be 0 or -1.. */ - if (level == 0 || level == LEVEL_LINEAR) { + if (level == 0) { if (get_option(&str, &factor) != 2 || /* Chunk Size */ get_option(&str, &fault) != 2) { printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); @@ -96,10 +95,7 @@ static int __init md_setup(char *str) } md_setup_args[ent].level = level; md_setup_args[ent].chunk = 1 << (factor+12); - if (level == LEVEL_LINEAR) - pername = "linear"; - else - pername = "raid0"; + pername = "raid0"; break; } fallthrough; diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c deleted file mode 100644 index a039e8e20f55..000000000000 --- a/drivers/md/md-faulty.c +++ /dev/null @@ -1,365 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * faulty.c : Multiple Devices driver for Linux - * - * Copyright (C) 2004 Neil Brown - * - * fautly-device-simulator personality for md - */ - - -/* - * The "faulty" personality causes some requests to fail. - * - * Possible failure modes are: - * reads fail "randomly" but succeed on retry - * writes fail "randomly" but succeed on retry - * reads for some address fail and then persist until a write - * reads for some address fail and then persist irrespective of write - * writes for some address fail and persist - * all writes fail - * - * Different modes can be active at a time, but only - * one can be set at array creation. Others can be added later. - * A mode can be one-shot or recurrent with the recurrence being - * once in every N requests. - * The bottom 5 bits of the "layout" indicate the mode. The - * remainder indicate a period, or 0 for one-shot. - * - * There is an implementation limit on the number of concurrently - * persisting-faulty blocks. When a new fault is requested that would - * exceed the limit, it is ignored. - * All current faults can be clear using a layout of "0". - * - * Requests are always sent to the device. 
If they are to fail, - * we clone the bio and insert a new b_end_io into the chain. - */ - -#define WriteTransient 0 -#define ReadTransient 1 -#define WritePersistent 2 -#define ReadPersistent 3 -#define WriteAll 4 /* doesn't go to device */ -#define ReadFixable 5 -#define Modes 6 - -#define ClearErrors 31 -#define ClearFaults 30 - -#define AllPersist 100 /* internal use only */ -#define NoPersist 101 - -#define ModeMask 0x1f -#define ModeShift 5 - -#define MaxFault 50 -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/raid/md_u.h> -#include <linux/slab.h> -#include "md.h" -#include <linux/seq_file.h> - - -static void faulty_fail(struct bio *bio) -{ - struct bio *b = bio->bi_private; - - b->bi_iter.bi_size = bio->bi_iter.bi_size; - b->bi_iter.bi_sector = bio->bi_iter.bi_sector; - - bio_put(bio); - - bio_io_error(b); -} - -struct faulty_conf { - int period[Modes]; - atomic_t counters[Modes]; - sector_t faults[MaxFault]; - int modes[MaxFault]; - int nfaults; - struct md_rdev *rdev; -}; - -static int check_mode(struct faulty_conf *conf, int mode) -{ - if (conf->period[mode] == 0 && - atomic_read(&conf->counters[mode]) <= 0) - return 0; /* no failure, no decrement */ - - - if (atomic_dec_and_test(&conf->counters[mode])) { - if (conf->period[mode]) - atomic_set(&conf->counters[mode], conf->period[mode]); - return 1; - } - return 0; -} - -static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) -{ - /* If we find a ReadFixable sector, we fix it ... */ - int i; - for (i=0; i<conf->nfaults; i++) - if (conf->faults[i] >= start && - conf->faults[i] < end) { - /* found it ... */ - switch (conf->modes[i] * 2 + dir) { - case WritePersistent*2+WRITE: return 1; - case ReadPersistent*2+READ: return 1; - case ReadFixable*2+READ: return 1; - case ReadFixable*2+WRITE: - conf->modes[i] = NoPersist; - return 0; - case AllPersist*2+READ: - case AllPersist*2+WRITE: return 1; - default: - return 0; - } - } - return 0; -} - -static void add_sector(struct faulty_conf *conf, sector_t start, int mode) -{ - int i; - int n = conf->nfaults; - for (i=0; i<conf->nfaults; i++) - if (conf->faults[i] == start) { - switch(mode) { - case NoPersist: conf->modes[i] = mode; return; - case WritePersistent: - if (conf->modes[i] == ReadPersistent || - conf->modes[i] == ReadFixable) - conf->modes[i] = AllPersist; - else - conf->modes[i] = WritePersistent; - return; - case ReadPersistent: - if (conf->modes[i] == WritePersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadPersistent; - return; - case ReadFixable: - if (conf->modes[i] == WritePersistent || - conf->modes[i] == ReadPersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadFixable; - return; - } - } else if (conf->modes[i] == NoPersist) - n = i; - - if (n >= MaxFault) - return; - conf->faults[n] = start; - conf->modes[n] = mode; - if (conf->nfaults == n) - conf->nfaults = n+1; -} - -static bool faulty_make_request(struct mddev *mddev, struct bio *bio) -{ - struct faulty_conf *conf = mddev->private; - int failit = 0; - - if (bio_data_dir(bio) == WRITE) { - /* write request */ - if (atomic_read(&conf->counters[WriteAll])) { - /* special case - don't decrement, don't submit_bio_noacct, - * just fail immediately - */ - bio_io_error(bio); - return true; - } - - if (check_sector(conf, bio->bi_iter.bi_sector, - bio_end_sector(bio), WRITE)) - failit = 1; - if (check_mode(conf, WritePersistent)) { - add_sector(conf, bio->bi_iter.bi_sector, - WritePersistent); - failit = 1; - } - if 
(check_mode(conf, WriteTransient)) - failit = 1; - } else { - /* read request */ - if (check_sector(conf, bio->bi_iter.bi_sector, - bio_end_sector(bio), READ)) - failit = 1; - if (check_mode(conf, ReadTransient)) - failit = 1; - if (check_mode(conf, ReadPersistent)) { - add_sector(conf, bio->bi_iter.bi_sector, - ReadPersistent); - failit = 1; - } - if (check_mode(conf, ReadFixable)) { - add_sector(conf, bio->bi_iter.bi_sector, - ReadFixable); - failit = 1; - } - } - - md_account_bio(mddev, &bio); - if (failit) { - struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, - &mddev->bio_set); - - b->bi_private = bio; - b->bi_end_io = faulty_fail; - bio = b; - } else - bio_set_dev(bio, conf->rdev->bdev); - - submit_bio_noacct(bio); - return true; -} - -static void faulty_status(struct seq_file *seq, struct mddev *mddev) -{ - struct faulty_conf *conf = mddev->private; - int n; - - if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) - seq_printf(seq, " WriteTransient=%d(%d)", - n, conf->period[WriteTransient]); - - if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) - seq_printf(seq, " ReadTransient=%d(%d)", - n, conf->period[ReadTransient]); - - if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) - seq_printf(seq, " WritePersistent=%d(%d)", - n, conf->period[WritePersistent]); - - if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) - seq_printf(seq, " ReadPersistent=%d(%d)", - n, conf->period[ReadPersistent]); - - - if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) - seq_printf(seq, " ReadFixable=%d(%d)", - n, conf->period[ReadFixable]); - - if ((n=atomic_read(&conf->counters[WriteAll])) != 0) - seq_printf(seq, " WriteAll"); - - seq_printf(seq, " nfaults=%d", conf->nfaults); -} - - -static int faulty_reshape(struct mddev *mddev) -{ - int mode = mddev->new_layout & ModeMask; - int count = mddev->new_layout >> ModeShift; - struct faulty_conf *conf = mddev->private; - - if (mddev->new_layout < 0) - return 0; - - /* new layout */ - if (mode == ClearFaults) - conf->nfaults = 0; - else if (mode == ClearErrors) { - int i; - for (i=0 ; i < Modes ; i++) { - conf->period[i] = 0; - atomic_set(&conf->counters[i], 0); - } - } else if (mode < Modes) { - conf->period[mode] = count; - if (!count) count++; - atomic_set(&conf->counters[mode], count); - } else - return -EINVAL; - mddev->new_layout = -1; - mddev->layout = -1; /* makes sure further changes come through */ - return 0; -} - -static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(raid_disks, - "%s does not support generic reshape\n", __func__); - - if (sectors == 0) - return mddev->dev_sectors; - - return sectors; -} - -static int faulty_run(struct mddev *mddev) -{ - struct md_rdev *rdev; - int i; - struct faulty_conf *conf; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - conf = kmalloc(sizeof(*conf), GFP_KERNEL); - if (!conf) - return -ENOMEM; - - for (i=0; i<Modes; i++) { - atomic_set(&conf->counters[i], 0); - conf->period[i] = 0; - } - conf->nfaults = 0; - - rdev_for_each(rdev, mddev) { - conf->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } - - md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); - mddev->private = conf; - - faulty_reshape(mddev); - - return 0; -} - -static void faulty_free(struct mddev *mddev, void *priv) -{ - struct faulty_conf *conf = priv; - - kfree(conf); -} - -static struct md_personality faulty_personality = -{ - .name = "faulty", - .level = LEVEL_FAULTY, - .owner = THIS_MODULE, - 
.make_request = faulty_make_request, - .run = faulty_run, - .free = faulty_free, - .status = faulty_status, - .check_reshape = faulty_reshape, - .size = faulty_size, -}; - -static int __init raid_init(void) -{ - return register_md_personality(&faulty_personality); -} - -static void raid_exit(void) -{ - unregister_md_personality(&faulty_personality); -} - -module_init(raid_init); -module_exit(raid_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)"); -MODULE_ALIAS("md-personality-10"); /* faulty */ -MODULE_ALIAS("md-faulty"); -MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c deleted file mode 100644 index 8eca7693b793..000000000000 --- a/drivers/md/md-linear.c +++ /dev/null @@ -1,318 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - linear.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> - - Linear mode management functions. - -*/ - -#include <linux/blkdev.h> -#include <linux/raid/md_u.h> -#include <linux/seq_file.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <trace/events/block.h> -#include "md.h" -#include "md-linear.h" - -/* - * find which device holds a particular offset - */ -static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) -{ - int lo, mid, hi; - struct linear_conf *conf; - - lo = 0; - hi = mddev->raid_disks - 1; - conf = mddev->private; - - /* - * Binary Search - */ - - while (hi > lo) { - - mid = (hi + lo) / 2; - if (sector < conf->disks[mid].end_sector) - hi = mid; - else - lo = mid + 1; - } - - return conf->disks + lo; -} - -static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - struct linear_conf *conf; - sector_t array_sectors; - - conf = mddev->private; - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - array_sectors = conf->array_sectors; - - return array_sectors; -} - -static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) -{ - struct linear_conf *conf; - struct md_rdev *rdev; - int i, cnt; - - conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); - if (!conf) - return NULL; - - /* - * conf->raid_disks is copy of mddev->raid_disks. The reason to - * keep a copy of mddev->raid_disks in struct linear_conf is, - * mddev->raid_disks may not be consistent with pointers number of - * conf->disks[] when it is updated in linear_add() and used to - * iterate old conf->disks[] earray in linear_congested(). - * Here conf->raid_disks is always consitent with number of - * pointers in conf->disks[] array, and mddev->private is updated - * with rcu_assign_pointer() in linear_addr(), such race can be - * avoided. - */ - conf->raid_disks = raid_disks; - - cnt = 0; - conf->array_sectors = 0; - - rdev_for_each(rdev, mddev) { - int j = rdev->raid_disk; - struct dev_info *disk = conf->disks + j; - sector_t sectors; - - if (j < 0 || j >= raid_disks || disk->rdev) { - pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", - mdname(mddev)); - goto out; - } - - disk->rdev = rdev; - if (mddev->chunk_sectors) { - sectors = rdev->sectors; - sector_div(sectors, mddev->chunk_sectors); - rdev->sectors = sectors * mddev->chunk_sectors; - } - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - conf->array_sectors += rdev->sectors; - cnt++; - } - if (cnt != raid_disks) { - pr_warn("md/linear:%s: not enough drives present. 
Aborting!\n", - mdname(mddev)); - goto out; - } - - /* - * Here we calculate the device offsets. - */ - conf->disks[0].end_sector = conf->disks[0].rdev->sectors; - - for (i = 1; i < raid_disks; i++) - conf->disks[i].end_sector = - conf->disks[i-1].end_sector + - conf->disks[i].rdev->sectors; - - return conf; - -out: - kfree(conf); - return NULL; -} - -static int linear_run (struct mddev *mddev) -{ - struct linear_conf *conf; - int ret; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - conf = linear_conf(mddev, mddev->raid_disks); - - if (!conf) - return 1; - mddev->private = conf; - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - - ret = md_integrity_register(mddev); - if (ret) { - kfree(conf); - mddev->private = NULL; - } - return ret; -} - -static int linear_add(struct mddev *mddev, struct md_rdev *rdev) -{ - /* Adding a drive to a linear array allows the array to grow. - * It is permitted if the new drive has a matching superblock - * already on it, with raid_disk equal to raid_disks. - * It is achieved by creating a new linear_private_data structure - * and swapping it in in-place of the current one. - * The current one is never freed until the array is stopped. - * This avoids races. - */ - struct linear_conf *newconf, *oldconf; - - if (rdev->saved_raid_disk != mddev->raid_disks) - return -EINVAL; - - rdev->raid_disk = rdev->saved_raid_disk; - rdev->saved_raid_disk = -1; - - newconf = linear_conf(mddev,mddev->raid_disks+1); - - if (!newconf) - return -ENOMEM; - - /* newconf->raid_disks already keeps a copy of * the increased - * value of mddev->raid_disks, WARN_ONCE() is just used to make - * sure of this. It is possible that oldconf is still referenced - * in linear_congested(), therefore kfree_rcu() is used to free - * oldconf until no one uses it anymore. 
- */ - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held(&mddev->reconfig_mutex)); - mddev->raid_disks++; - WARN_ONCE(mddev->raid_disks != newconf->raid_disks, - "copied raid_disks doesn't match mddev->raid_disks"); - rcu_assign_pointer(mddev->private, newconf); - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - kfree_rcu(oldconf, rcu); - return 0; -} - -static void linear_free(struct mddev *mddev, void *priv) -{ - struct linear_conf *conf = priv; - - kfree(conf); -} - -static bool linear_make_request(struct mddev *mddev, struct bio *bio) -{ - struct dev_info *tmp_dev; - sector_t start_sector, end_sector, data_offset; - sector_t bio_sector = bio->bi_iter.bi_sector; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH) - && md_flush_request(mddev, bio)) - return true; - - tmp_dev = which_dev(mddev, bio_sector); - start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - end_sector = tmp_dev->end_sector; - data_offset = tmp_dev->rdev->data_offset; - - if (unlikely(bio_sector >= end_sector || - bio_sector < start_sector)) - goto out_of_bounds; - - if (unlikely(is_rdev_broken(tmp_dev->rdev))) { - md_error(mddev, tmp_dev->rdev); - bio_io_error(bio); - return true; - } - - if (unlikely(bio_end_sector(bio) > end_sector)) { - /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, &mddev->bio_set); - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; - } - - md_account_bio(mddev, &bio); - bio_set_dev(bio, tmp_dev->rdev->bdev); - bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - - start_sector + data_offset; - - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) { - /* Just ignore it */ - bio_endio(bio); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bio, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_write_zeroes(mddev, bio); - submit_bio_noacct(bio); - } - return true; - -out_of_bounds: - pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n", - mdname(mddev), - (unsigned long long)bio->bi_iter.bi_sector, - tmp_dev->rdev->bdev, - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); - bio_io_error(bio); - return true; -} - -static void linear_status (struct seq_file *seq, struct mddev *mddev) -{ - seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); -} - -static void linear_error(struct mddev *mddev, struct md_rdev *rdev) -{ - if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { - char *md_name = mdname(mddev); - - pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n", - md_name, rdev->bdev); - } -} - -static void linear_quiesce(struct mddev *mddev, int state) -{ -} - -static struct md_personality linear_personality = -{ - .name = "linear", - .level = LEVEL_LINEAR, - .owner = THIS_MODULE, - .make_request = linear_make_request, - .run = linear_run, - .free = linear_free, - .status = linear_status, - .hot_add_disk = linear_add, - .size = linear_size, - .quiesce = linear_quiesce, - .error_handler = linear_error, -}; - -static int __init linear_init (void) -{ - return register_md_personality (&linear_personality); -} - -static void linear_exit (void) -{ - unregister_md_personality (&linear_personality); -} - -module_init(linear_init); -module_exit(linear_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Linear device concatenation personality for MD 
(deprecated)"); -MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ -MODULE_ALIAS("md-linear"); -MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c deleted file mode 100644 index d22276870283..000000000000 --- a/drivers/md/md-multipath.c +++ /dev/null @@ -1,471 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * multipath.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * MULTIPATH management functions. - * - * derived from raid1.c. - */ - -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/raid/md_u.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include "md.h" -#include "md-multipath.h" - -#define MAX_WORK_PER_DISK 128 - -#define NR_RESERVED_BUFS 32 - -static int multipath_map (struct mpconf *conf) -{ - int i, disks = conf->raid_disks; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. - */ - - rcu_read_lock(); - for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - return i; - } - } - rcu_read_unlock(); - - pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); - return (-1); -} - -static void multipath_reschedule_retry (struct multipath_bh *mp_bh) -{ - unsigned long flags; - struct mddev *mddev = mp_bh->mddev; - struct mpconf *conf = mddev->private; - - spin_lock_irqsave(&conf->device_lock, flags); - list_add(&mp_bh->retry_list, &conf->retry_list); - spin_unlock_irqrestore(&conf->device_lock, flags); - md_wakeup_thread(mddev->thread); -} - -/* - * multipath_end_bh_io() is called when we have finished servicing a multipathed - * operation and are ready to return a success/failure code to the buffer - * cache layer. 
- */ -static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) -{ - struct bio *bio = mp_bh->master_bio; - struct mpconf *conf = mp_bh->mddev->private; - - bio->bi_status = status; - bio_endio(bio); - mempool_free(mp_bh, &conf->pool); -} - -static void multipath_end_request(struct bio *bio) -{ - struct multipath_bh *mp_bh = bio->bi_private; - struct mpconf *conf = mp_bh->mddev->private; - struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - - if (!bio->bi_status) - multipath_end_bh_io(mp_bh, 0); - else if (!(bio->bi_opf & REQ_RAHEAD)) { - /* - * oops, IO error: - */ - md_error (mp_bh->mddev, rdev); - pr_info("multipath: %pg: rescheduling sector %llu\n", - rdev->bdev, - (unsigned long long)bio->bi_iter.bi_sector); - multipath_reschedule_retry(mp_bh); - } else - multipath_end_bh_io(mp_bh, bio->bi_status); - rdev_dec_pending(rdev, conf->mddev); -} - -static bool multipath_make_request(struct mddev *mddev, struct bio * bio) -{ - struct mpconf *conf = mddev->private; - struct multipath_bh * mp_bh; - struct multipath_info *multipath; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH) - && md_flush_request(mddev, bio)) - return true; - - md_account_bio(mddev, &bio); - mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); - - mp_bh->master_bio = bio; - mp_bh->mddev = mddev; - - mp_bh->path = multipath_map(conf); - if (mp_bh->path < 0) { - bio_io_error(bio); - mempool_free(mp_bh, &conf->pool); - return true; - } - multipath = conf->multipaths + mp_bh->path; - - bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO); - - mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; - mp_bh->bio.bi_end_io = multipath_end_request; - mp_bh->bio.bi_private = mp_bh; - mddev_check_write_zeroes(mddev, &mp_bh->bio); - submit_bio_noacct(&mp_bh->bio); - return true; -} - -static void multipath_status(struct seq_file *seq, struct mddev *mddev) -{ - struct mpconf *conf = mddev->private; - int i; - - seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); - } - rcu_read_unlock(); - seq_putc(seq, ']'); -} - -/* - * Careful, this can execute in IRQ contexts as well! - */ -static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - - if (conf->raid_disks - mddev->degraded <= 1) { - /* - * Uh oh, we can do nothing if this is our last path, but - * first check if this is a queued request for a device - * which has just failed. - */ - pr_warn("multipath: only one IO path left and IO error.\n"); - /* leave it active... 
it's all we have */ - return; - } - /* - * Mark disk as unusable - */ - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - set_bit(Faulty, &rdev->flags); - set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("multipath: IO failure on %pg, disabling IO path.\n" - "multipath: Operation continuing on %d IO paths.\n", - rdev->bdev, - conf->raid_disks - mddev->degraded); -} - -static void print_multipath_conf (struct mpconf *conf) -{ - int i; - struct multipath_info *tmp; - - pr_debug("MULTIPATH conf printout:\n"); - if (!conf) { - pr_debug("(conf==NULL)\n"); - return; - } - pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); - - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->multipaths + i; - if (tmp->rdev) - pr_debug(" disk%d, o:%d, dev:%pg\n", - i,!test_bit(Faulty, &tmp->rdev->flags), - tmp->rdev->bdev); - } -} - -static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - int err = -EEXIST; - int path; - struct multipath_info *p; - int first = 0; - int last = mddev->raid_disks - 1; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - print_multipath_conf(conf); - - for (path = first; path <= last; path++) - if ((p=conf->multipaths+path)->rdev == NULL) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - err = md_integrity_add_rdev(rdev, mddev); - if (err) - break; - spin_lock_irq(&conf->device_lock); - mddev->degraded--; - rdev->raid_disk = path; - set_bit(In_sync, &rdev->flags); - spin_unlock_irq(&conf->device_lock); - rcu_assign_pointer(p->rdev, rdev); - err = 0; - break; - } - - print_multipath_conf(conf); - - return err; -} - -static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct multipath_info *p = conf->multipaths + number; - - print_multipath_conf(conf); - - if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } - err = md_integrity_register(mddev); - } -abort: - - print_multipath_conf(conf); - return err; -} - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working multipaths. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. 
- */ - -static void multipathd(struct md_thread *thread) -{ - struct mddev *mddev = thread->mddev; - struct multipath_bh *mp_bh; - struct bio *bio; - unsigned long flags; - struct mpconf *conf = mddev->private; - struct list_head *head = &conf->retry_list; - - md_check_recovery(mddev); - for (;;) { - spin_lock_irqsave(&conf->device_lock, flags); - if (list_empty(head)) - break; - mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); - list_del(head->prev); - spin_unlock_irqrestore(&conf->device_lock, flags); - - bio = &mp_bh->bio; - bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; - - if ((mp_bh->path = multipath_map (conf))<0) { - pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n", - bio->bi_bdev, - (unsigned long long)bio->bi_iter.bi_sector); - multipath_end_bh_io(mp_bh, BLK_STS_IOERR); - } else { - pr_err("multipath: %pg: redirecting sector %llu to another IO path\n", - bio->bi_bdev, - (unsigned long long)bio->bi_iter.bi_sector); - *bio = *(mp_bh->master_bio); - bio->bi_iter.bi_sector += - conf->multipaths[mp_bh->path].rdev->data_offset; - bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); - bio->bi_opf |= REQ_FAILFAST_TRANSPORT; - bio->bi_end_io = multipath_end_request; - bio->bi_private = mp_bh; - submit_bio_noacct(bio); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -} - -static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - - return mddev->dev_sectors; -} - -static int multipath_run (struct mddev *mddev) -{ - struct mpconf *conf; - int disk_idx; - struct multipath_info *disk; - struct md_rdev *rdev; - int working_disks; - int ret; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - if (mddev->level != LEVEL_MULTIPATH) { - pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", - mdname(mddev), mddev->level); - goto out; - } - /* - * copy the already verified devices into our private MULTIPATH - * bookkeeping area. 
[whatever we allocate in multipath_run(), - * should be freed in multipath_free()] - */ - - conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); - mddev->private = conf; - if (!conf) - goto out; - - conf->multipaths = kcalloc(mddev->raid_disks, - sizeof(struct multipath_info), - GFP_KERNEL); - if (!conf->multipaths) - goto out_free_conf; - - working_disks = 0; - rdev_for_each(rdev, mddev) { - disk_idx = rdev->raid_disk; - if (disk_idx < 0 || - disk_idx >= mddev->raid_disks) - continue; - - disk = conf->multipaths + disk_idx; - disk->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - if (!test_bit(Faulty, &rdev->flags)) - working_disks++; - } - - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - spin_lock_init(&conf->device_lock); - INIT_LIST_HEAD(&conf->retry_list); - - if (!working_disks) { - pr_warn("multipath: no operational IO paths for %s\n", - mdname(mddev)); - goto out_free_conf; - } - mddev->degraded = conf->raid_disks - working_disks; - - ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS, - sizeof(struct multipath_bh)); - if (ret) - goto out_free_conf; - - rcu_assign_pointer(mddev->thread, - md_register_thread(multipathd, mddev, "multipath")); - if (!mddev->thread) - goto out_free_conf; - - pr_info("multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->raid_disks - mddev->degraded, - mddev->raid_disks); - /* - * Ok, everything is just fine now - */ - md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - - if (md_integrity_register(mddev)) - goto out_free_conf; - - return 0; - -out_free_conf: - mempool_exit(&conf->pool); - kfree(conf->multipaths); - kfree(conf); - mddev->private = NULL; -out: - return -EIO; -} - -static void multipath_free(struct mddev *mddev, void *priv) -{ - struct mpconf *conf = priv; - - mempool_exit(&conf->pool); - kfree(conf->multipaths); - kfree(conf); -} - -static struct md_personality multipath_personality = -{ - .name = "multipath", - .level = LEVEL_MULTIPATH, - .owner = THIS_MODULE, - .make_request = multipath_make_request, - .run = multipath_run, - .free = multipath_free, - .status = multipath_status, - .error_handler = multipath_error, - .hot_add_disk = multipath_add_disk, - .hot_remove_disk= multipath_remove_disk, - .size = multipath_size, -}; - -static int __init multipath_init (void) -{ - return register_md_personality (&multipath_personality); -} - -static void __exit multipath_exit (void) -{ - unregister_md_personality (&multipath_personality); -} - -module_init(multipath_init); -module_exit(multipath_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)"); -MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ -MODULE_ALIAS("md-multipath"); -MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/md.c b/drivers/md/md.c index 9bdd57324c37..0a2bd72a6d76 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -543,6 +543,9 @@ static void md_end_flush(struct bio *bio) rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); + /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); } @@ -562,12 +565,8 @@ static void submit_flushes(struct work_struct *ws) rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { - /* Take two references, one is dropped - * when request finishes, one after - * we reclaim rcu_read_lock - */ struct bio *bi; - 
atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc_bioset(rdev->bdev, 0, @@ -578,7 +577,6 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); - rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); if (atomic_dec_and_test(&mddev->flush_pending)) @@ -631,6 +629,18 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) /* new request after previous flush is completed */ if (ktime_after(req_start, mddev->prev_flush_start)) { WARN_ON(mddev->flush_bio); + /* + * Grab a reference to make sure mddev_suspend() will wait for + * this flush to be done. + * + * md_flush_reqeust() is called under md_handle_request() and + * 'active_io' is already grabbed, hence percpu_ref_is_zero() + * won't pass, percpu_ref_tryget_live() can't be used because + * percpu_ref_kill() can be called by mddev_suspend() + * concurrently. + */ + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); + percpu_ref_get(&mddev->active_io); mddev->flush_bio = bio; bio = NULL; } @@ -1027,9 +1037,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, return; bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, - 1, - REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, - GFP_NOIO, &mddev->sync_set); + 1, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META + | REQ_PREFLUSH | REQ_FUA, + GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); @@ -1209,6 +1220,7 @@ struct super_type { struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, + struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); @@ -1289,17 +1301,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; - if (sb->level == LEVEL_MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; - - /* not spare disk, or LEVEL_MULTIPATH */ - if (sb->level == LEVEL_MULTIPATH || - (rdev->desc_nr >= 0 && - rdev->desc_nr < MD_SB_DISKS && - sb->disks[rdev->desc_nr].state & - ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) + rdev->desc_nr = sb->this_disk.number; + + /* not spare disk */ + if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && + sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) spare_disk = false; if (!refdev) { @@ -1346,8 +1352,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor /* * validate_super for 0.90.0 + * note: we are not using "freshest" for 0.9 superblock */ -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); @@ -1445,31 +1452,28 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) return 0; } - if (mddev->level != LEVEL_MULTIPATH) { - desc = sb->disks + rdev->desc_nr; + desc = sb->disks + rdev->desc_nr; - if (desc->state & (1<<MD_DISK_FAULTY)) - set_bit(Faulty, &rdev->flags); - else if (desc->state & (1<<MD_DISK_SYNC) /* && - desc->raid_disk < mddev->raid_disks */) { - set_bit(In_sync, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAULTY)) + set_bit(Faulty, &rdev->flags); + else if (desc->state & (1<<MD_DISK_SYNC)) { + set_bit(In_sync, &rdev->flags); + rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; + } else if (desc->state & 
(1<<MD_DISK_ACTIVE)) { + /* active but not in sync implies recovery up to + * reshape position. We don't know exactly where + * that is, so set to zero for now + */ + if (mddev->minor_version >= 91) { + rdev->recovery_offset = 0; rdev->raid_disk = desc->raid_disk; - rdev->saved_raid_disk = desc->raid_disk; - } else if (desc->state & (1<<MD_DISK_ACTIVE)) { - /* active but not in sync implies recovery up to - * reshape position. We don't know exactly where - * that is, so set to zero for now */ - if (mddev->minor_version >= 91) { - rdev->recovery_offset = 0; - rdev->raid_disk = desc->raid_disk; - } } - if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) - set_bit(WriteMostly, &rdev->flags); - if (desc->state & (1<<MD_DISK_FAILFAST)) - set_bit(FailFast, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); + } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); return 0; } @@ -1759,10 +1763,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; - if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) - rdev->desc_nr = -1; - else - rdev->desc_nr = le32_to_cpu(sb->dev_number); + rdev->desc_nr = le32_to_cpu(sb->dev_number); if (!rdev->bb_page) { rdev->bb_page = alloc_page(GFP_KERNEL); @@ -1815,12 +1816,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != 0) return -EINVAL; - /* not spare disk, or LEVEL_MULTIPATH */ - if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || - (rdev->desc_nr >= 0 && - rdev->desc_nr < le32_to_cpu(sb->max_dev) && - (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + /* not spare disk */ + if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) spare_disk = false; if (!refdev) { @@ -1859,10 +1858,11 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return ret; } -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); + int role; rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); @@ -1955,13 +1955,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for - * spares (which don't need an event count) */ - ++ev1; + * spares (which don't need an event count). + * Similar to mdadm, we allow event counter difference of 1 + * from the freshest device. 
+ */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) - if (ev1 < mddev->events) + if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an @@ -1976,58 +1978,85 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } - if (mddev->level != LEVEL_MULTIPATH) { - int role; - if (rdev->desc_nr < 0 || - rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = MD_DISK_ROLE_SPARE; - rdev->desc_nr = -1; - } else - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); - switch(role) { - case MD_DISK_ROLE_SPARE: /* spare */ - break; - case MD_DISK_ROLE_FAULTY: /* faulty */ - set_bit(Faulty, &rdev->flags); - break; - case MD_DISK_ROLE_JOURNAL: /* journal device */ - if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { - /* journal device without journal feature */ - pr_warn("md: journal device provided without journal feature, ignoring the device\n"); - return -EINVAL; - } - set_bit(Journal, &rdev->flags); - rdev->journal_tail = le64_to_cpu(sb->journal_tail); - rdev->raid_disk = 0; - break; - default: - rdev->saved_raid_disk = role; - if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) { - rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - if (!(le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_BITMAP)) - rdev->saved_raid_disk = -1; - } else { - /* - * If the array is FROZEN, then the device can't - * be in_sync with rest of array. - */ - if (!test_bit(MD_RECOVERY_FROZEN, - &mddev->recovery)) - set_bit(In_sync, &rdev->flags); - } - rdev->raid_disk = role; - break; + + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = MD_DISK_ROLE_SPARE; + rdev->desc_nr = -1; + } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { + /* + * If we are assembling, and our event counter is smaller than the + * highest event counter, we cannot trust our superblock about the role. + * It could happen that our rdev was marked as Faulty, and all other + * superblocks were updated with +1 event counter. + * Then, before the next superblock update, which typically happens when + * remove_and_add_spares() removes the device from the array, there was + * a crash or reboot. + * If we allow current rdev without consulting the freshest superblock, + * we could cause data corruption. + * Note that in this case our event counter is smaller by 1 than the + * highest, otherwise, this rdev would not be allowed into array; + * both kernel and mdadm allow event counter difference of 1. 
+ */ + struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); + u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); + + if (rdev->desc_nr >= freshest_max_dev) { + /* this is unexpected, better not proceed */ + pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", + mdname(mddev), rdev->bdev, rdev->desc_nr, + freshest->bdev, freshest_max_dev); + return -EUCLEAN; } - if (sb->devflags & WriteMostly1) - set_bit(WriteMostly, &rdev->flags); - if (sb->devflags & FailFast1) - set_bit(FailFast, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) - set_bit(Replacement, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); + + role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); + pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", + mdname(mddev), rdev->bdev, role, role, freshest->bdev); + } else { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + } + switch (role) { + case MD_DISK_ROLE_SPARE: /* spare */ + break; + case MD_DISK_ROLE_FAULTY: /* faulty */ + set_bit(Faulty, &rdev->flags); + break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + pr_warn("md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + rdev->raid_disk = 0; + break; + default: + rdev->saved_raid_disk = role; + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) { + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else { + /* + * If the array is FROZEN, then the device can't + * be in_sync with rest of array. + */ + if (!test_bit(MD_RECOVERY_FROZEN, + &mddev->recovery)) + set_bit(In_sync, &rdev->flags); + } + rdev->raid_disk = role; + break; + } + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); + if (sb->devflags & FailFast1) + set_bit(FailFast, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); return 0; } @@ -2845,10 +2874,6 @@ rewrite: } else pr_debug("md: %pg (skipping faulty)\n", rdev->bdev); - - if (mddev->level == LEVEL_MULTIPATH) - /* only need to write one superblock... */ - break; } if (md_super_wait(mddev) < 0) goto rewrite; @@ -2890,7 +2915,7 @@ static int add_bound_rdev(struct md_rdev *rdev) * and should be added immediately. */ super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); if (err) { md_kick_rdev_from_array(rdev); @@ -3827,7 +3852,7 @@ static int analyze_sbs(struct mddev *mddev) } super_types[mddev->major_version]. - validate_super(mddev, freshest); + validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { @@ -3842,20 +3867,15 @@ static int analyze_sbs(struct mddev *mddev) } if (rdev != freshest) { if (super_types[mddev->major_version]. 
- validate_super(mddev, rdev)) { + validate_super(mddev, freshest, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); continue; } } - if (mddev->level == LEVEL_MULTIPATH) { - rdev->desc_nr = i++; - rdev->raid_disk = rdev->desc_nr; - set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= - (mddev->raid_disks - min(0, mddev->delta_disks)) && - !test_bit(Journal, &rdev->flags)) { + if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } @@ -6833,7 +6853,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't @@ -8076,7 +8096,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + if (mddev->pers->level == 0) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -9240,44 +9260,19 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; - bool remove_some = false; if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) /* Mustn't remove devices when resync thread is running */ return 0; rdev_for_each(rdev, mddev) { - if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - /* Faulty non-Blocked devices with nr_pending == 0 - * never get nr_pending incremented, - * never get Faulty cleared, and never get Blocked set. - * So we can synchronize_rcu now rather than once per device - */ - remove_some = true; - set_bit(RemoveSynchronized, &rdev->flags); - } - } - - if (remove_some) - synchronize_rcu(); - rdev_for_each(rdev, mddev) { - if ((this == NULL || rdev == this) && - (test_bit(RemoveSynchronized, &rdev->flags) || - rdev_removeable(rdev))) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->saved_raid_disk = rdev->raid_disk; - rdev->raid_disk = -1; - removed++; - } + if ((this == NULL || rdev == this) && rdev_removeable(rdev) && + !mddev->pers->hot_remove_disk(mddev, rdev)) { + sysfs_unlink_rdev(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + removed++; } - if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) - clear_bit(RemoveSynchronized, &rdev->flags); } if (removed && mddev->kobj.sd) diff --git a/drivers/md/md.h b/drivers/md/md.h index ade83af123a2..8d881cc59799 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -190,11 +190,6 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, - RemoveSynchronized, /* synchronize_rcu() was called after - * this device was known to be faulty, - * so it is safe to remove without - * another synchronize_rcu() call. 
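With RemoveSynchronized and its batched synchronize_rcu() gone, the protection that remains is the one visible throughout the rest of this diff: I/O paths pin an rdev by elevating nr_pending before dereferencing it, and removal paths, which run under the reconfig mutex, clear the slot with WRITE_ONCE() only once nr_pending is zero. As the deleted comment notes, a Faulty, non-Blocked device with nr_pending == 0 never has nr_pending raised again, which is what makes the unsynchronized clear safe. A single-threaded C11 sketch of the pinning discipline, with hypothetical rdev/slot types (it deliberately elides the load-versus-pin window that the Faulty rule closes in the kernel):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct rdev {
	atomic_int nr_pending;		/* md: atomic_t rdev->nr_pending */
	const char *name;
};

static struct rdev disk0 = { .name = "sda" };
static struct rdev *_Atomic slot = &disk0;	/* plays the role of p->rdev */

/* I/O path: pin the device before using it, drop the pin when done. */
static void submit_io(void)
{
	struct rdev *rdev = atomic_load(&slot);

	if (!rdev)
		return;
	atomic_fetch_add(&rdev->nr_pending, 1);	/* atomic_inc(&rdev->nr_pending) */
	printf("I/O to %s\n", rdev->name);	/* safe to dereference while pinned */
	atomic_fetch_sub(&rdev->nr_pending, 1);	/* rdev_dec_pending() */
}

/* Removal path: in md this runs under the reconfig mutex; idle slots only. */
static int try_remove(struct rdev *rdev)
{
	if (atomic_load(&rdev->nr_pending) != 0)
		return -1;			/* -EBUSY: try again later */
	atomic_store(&slot, NULL);		/* WRITE_ONCE(p->rdev, NULL) */
	return 0;
}

int main(void)
{
	submit_io();
	printf("remove: %d\n", try_remove(&disk0));	/* 0: nothing pending */
	submit_io();					/* slot now NULL: no-op */
	return 0;
}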
- */ ExternalBbl, /* External metadata provides bad * block management for a disk */ diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 3f22edec70e7..512746551f36 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -173,3 +173,57 @@ static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) else md_bitmap_unplug(bitmap); } + +/* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. + */ +static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + long cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + cur_time_mon = ktime_get_seconds(); + + if (rdev->last_read_error == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (long)(cur_time_mon - + rdev->last_read_error) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. + */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); + else + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); +} + +static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int read_errors; + + check_decay_read_errors(mddev, rdev); + read_errors = atomic_inc_return(&rdev->read_errors); + if (read_errors > max_read_errors) { + pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", + mdname(mddev), rdev->bdev, read_errors, max_read_errors); + pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n", + mdname(mddev), rdev->bdev); + md_error(mddev, rdev); + return true; + } + + return false; +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 35d12948e0a9..aaa434f0c175 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -49,6 +49,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) +#define RAID_1_10_NAME "raid1" #include "raid1-10.c" #define START(node) ((node)->start) @@ -609,7 +610,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int choose_first; int choose_next_idle; - rcu_read_lock(); /* * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. 
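Two helpers move into raid1-10.c so raid1 can share raid10's read-error accounting: check_decay_read_errors() halves the stored count once per elapsed hour via a right shift, clamped because shifting an N-bit integer by N or more bits is undefined behaviour in C, and exceed_read_errors() then increments and compares against the configured ceiling. The arithmetic in isolation, assuming plain unsigned counters rather than the kernel's atomic_t:

#include <stdbool.h>
#include <stdio.h>

/* Halve 'errors' once per elapsed hour, as check_decay_read_errors() does. */
static unsigned int decay(unsigned int errors, unsigned long hours)
{
	if (hours >= 8 * sizeof(errors))	/* avoid an undefined shift */
		return 0;
	return errors >> hours;
}

/* Decayed count plus this new error above the limit: fail the device. */
static bool exceeds(unsigned int errors, unsigned long hours, unsigned int max)
{
	return decay(errors, hours) + 1 > max;
}

int main(void)
{
	printf("%u\n", decay(40, 2));		/* 10: halved twice       */
	printf("%u\n", decay(40, 64));		/* 0: clamped, not UB     */
	printf("%d\n", exceeds(40, 0, 20));	/* 1: fail the device     */
	printf("%d\n", exceeds(40, 2, 20));	/* 0: decayed below limit */
	return 0;
}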
@@ -642,7 +642,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect unsigned int pending; bool nonrot; - rdev = rcu_dereference(conf->mirrors[disk].rdev); + rdev = conf->mirrors[disk].rdev; if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL || test_bit(Faulty, &rdev->flags)) @@ -773,7 +773,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } if (best_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[best_disk].rdev); + rdev = conf->mirrors[best_disk].rdev; if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); @@ -784,7 +784,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } - rcu_read_unlock(); *max_sectors = sectors; return best_disk; @@ -1126,8 +1125,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, &r1_bio->mddev->bio_set); - if (!behind_bio) - return; /* discard op, we don't support writezero/writesame yet */ if (!bio_has_data(bio)) { @@ -1235,14 +1232,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (r1bio_existed) { /* Need to get the block device name carefully */ - struct md_rdev *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (rdev) snprintf(b, sizeof(b), "%pg", rdev->bdev); else strcpy(b, "???"); - rcu_read_unlock(); } /* @@ -1396,10 +1391,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, disks = conf->raid_disks * 2; blocked_rdev = NULL; - rcu_read_lock(); max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; /* * The write-behind io is only attempted on drives marked as @@ -1465,7 +1459,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } r1_bio->bios[i] = bio; } - rcu_read_unlock(); if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ @@ -1617,15 +1610,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) struct r1conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); } - rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1691,16 +1685,15 @@ static void print_conf(struct r1conf *conf) pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); - rcu_read_lock(); + lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; if (rdev) pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), rdev->bdev); } - rcu_read_unlock(); } static void close_sync(struct r1conf *conf) @@ -1810,7 +1803,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + WRITE_ONCE(p->rdev, rdev); break; } if (test_bit(WantReplacement, &p->rdev->flags) && @@ -1826,7 +1819,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = repl_slot; err = 0; conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + WRITE_ONCE(p[conf->raid_disks].rdev, rdev); } print_conf(conf); @@ -1862,16 +1855,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } + WRITE_ONCE(p->rdev, NULL); if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before @@ -1892,7 +1876,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } clear_bit(Replacement, &repl->flags); - p->rdev = repl; + WRITE_ONCE(p->rdev, repl); conf->mirrors[conf->raid_disks + number].rdev = NULL; unfreeze_array(conf); } @@ -2272,16 +2256,24 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) * 3. Performs writes following reads for array synchronising. 
*/ -static void fix_read_error(struct r1conf *conf, int read_disk, - sector_t sect, int sectors) +static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) { + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int read_disk = r1_bio->read_disk; struct mddev *mddev = conf->mddev; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[read_disk].rdev); + + if (exceed_read_errors(mddev, rdev)) { + r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; + return; + } + while(sectors) { int s = sectors; int d = read_disk; int success = 0; int start; - struct md_rdev *rdev; if (s > (PAGE_SIZE>>9)) s = PAGE_SIZE >> 9; @@ -2290,8 +2282,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, sector_t first_bad; int bad_sectors; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && @@ -2299,15 +2290,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk, is_badblock(rdev, sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) success = 1; rdev_dec_pending(rdev, mddev); if (success) break; - } else - rcu_read_unlock(); + } + d++; if (d == conf->raid_disks * 2) d = 0; @@ -2326,29 +2316,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); rdev_dec_pending(rdev, mddev); - } else - rcu_read_unlock(); + } } d = start; while (d != read_disk) { if (d==0) d = conf->raid_disks * 2; d--; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2359,8 +2344,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, rdev->bdev); } rdev_dec_pending(rdev, mddev); - } else - rcu_read_unlock(); + } } sectors -= s; sect += s; @@ -2530,8 +2514,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) if (mddev->ro == 0 && !test_bit(FailFast, &rdev->flags)) { freeze_array(conf, 1); - fix_read_error(conf, r1_bio->read_disk, - r1_bio->sector, r1_bio->sectors); + fix_read_error(conf, r1_bio); unfreeze_array(conf); } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) { md_error(mddev, rdev); @@ -2741,7 +2724,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, r1_bio = raid1_alloc_init_r1buf(conf); - rcu_read_lock(); /* * If we get a correctably read error during resync or recovery, * we might want to read from a different device. 
So we @@ -2762,7 +2744,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, struct md_rdev *rdev; bio = r1_bio->bios[i]; - rdev = rcu_dereference(conf->mirrors[i].rdev); + rdev = conf->mirrors[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) @@ -2820,7 +2802,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_opf |= MD_FAILFAST; } } - rcu_read_unlock(); if (disk < 0) disk = wonly; r1_bio->read_disk = disk; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a5927e98dc67..7412066ea22c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -19,6 +19,8 @@ #include <linux/raid/md_p.h> #include <trace/events/block.h> #include "md.h" + +#define RAID_1_10_NAME "raid10" #include "raid10.h" #include "raid0.h" #include "md-bitmap.h" @@ -743,7 +745,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); - rcu_read_lock(); best_dist_slot = -1; min_pending = UINT_MAX; best_dist_rdev = NULL; @@ -775,18 +776,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; disk = r10_bio->devs[slot].devnum; - rdev = rcu_dereference(conf->mirrors[disk].replacement); + rdev = conf->mirrors[disk].replacement; if (rdev == NULL || test_bit(Faulty, &rdev->flags) || r10_bio->devs[slot].addr + sectors > - rdev->recovery_offset) { - /* - * Read replacement first to prevent reading both rdev - * and replacement as NULL during replacement replace - * rdev. - */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[disk].rdev); - } + rdev->recovery_offset) + rdev = conf->mirrors[disk].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; @@ -876,7 +870,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, r10_bio->read_slot = slot; } else rdev = NULL; - rcu_read_unlock(); *max_sectors = best_good_sectors; return rdev; @@ -1198,9 +1191,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ gfp = GFP_NOIO | __GFP_HIGH; - rcu_read_lock(); disk = r10_bio->devs[slot].devnum; - err_rdev = rcu_dereference(conf->mirrors[disk].rdev); + err_rdev = conf->mirrors[disk].rdev; if (err_rdev) snprintf(b, sizeof(b), "%pg", err_rdev->bdev); else { @@ -1208,7 +1200,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, /* This never gets dereferenced */ err_rdev = r10_bio->devs[slot].rdev; } - rcu_read_unlock(); } if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) @@ -1279,15 +1270,8 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, int devnum = r10_bio->devs[n_copy].devnum; struct bio *mbio; - if (replacement) { - rdev = conf->mirrors[devnum].replacement; - if (rdev == NULL) { - /* Replacement just got moved to main 'rdev' */ - smp_mb(); - rdev = conf->mirrors[devnum].rdev; - } - } else - rdev = conf->mirrors[devnum].rdev; + rdev = replacement ? 
conf->mirrors[devnum].replacement : + conf->mirrors[devnum].rdev; mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); if (replacement) @@ -1321,25 +1305,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } -static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, - struct md_rdev **prrdev) -{ - struct md_rdev *rdev, *rrdev; - - rrdev = rcu_dereference(mirror->replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. - */ - smp_mb(); - rdev = rcu_dereference(mirror->rdev); - if (rdev == rrdev) - rrdev = NULL; - - *prrdev = rrdev; - return rdev; -} - static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1348,11 +1313,11 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) retry_wait: blocked_rdev = NULL; - rcu_read_lock(); for (i = 0; i < conf->copies; i++) { struct md_rdev *rdev, *rrdev; - rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); + rdev = conf->mirrors[i].rdev; + rrdev = conf->mirrors[i].replacement; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1391,7 +1356,6 @@ retry_wait: } } } - rcu_read_unlock(); if (unlikely(blocked_rdev)) { /* Have to wait for this device to get unblocked, then retry */ @@ -1474,14 +1438,14 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, wait_blocked_dev(mddev, r10_bio); - rcu_read_lock(); max_sectors = r10_bio->sectors; for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); + rdev = conf->mirrors[d].rdev; + rrdev = conf->mirrors[d].replacement; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1535,7 +1499,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&rrdev->nr_pending); } } - rcu_read_unlock(); if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1625,17 +1588,8 @@ static void raid10_end_discard_request(struct bio *bio) set_bit(R10BIO_Uptodate, &r10_bio->state); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[dev].replacement; - if (!rdev) { - /* - * raid10_remove_disk uses smp_mb to make sure rdev is set to - * replacement before setting replacement to NULL. It can read - * rdev first without barrier protect even replacement is NULL - */ - smp_rmb(); - rdev = conf->mirrors[dev].rdev; - } + rdev = repl ? conf->mirrors[dev].replacement : + conf->mirrors[dev].rdev; raid_end_discard_bio(r10_bio); rdev_dec_pending(rdev, conf->mddev); @@ -1785,11 +1739,11 @@ retry_discard: * inc refcount on their rdev. 
Record them by setting * bios[x] to bio */ - rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev, *rrdev; - rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); + rdev = conf->mirrors[disk].rdev; + rrdev = conf->mirrors[disk].replacement; r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; @@ -1809,7 +1763,6 @@ retry_discard: atomic_inc(&rrdev->nr_pending); } } - rcu_read_unlock(); atomic_set(&r10_bio->remaining, 1); for (disk = 0; disk < geo->raid_disks; disk++) { @@ -1939,6 +1892,8 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev) struct r10conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + if (conf->geo.near_copies < conf->geo.raid_disks) seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); if (conf->geo.near_copies > 1) @@ -1953,12 +1908,11 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev) } seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, conf->geo.raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->geo.raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); } - rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1980,7 +1934,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore) ncopies = conf->geo.near_copies; } - rcu_read_lock(); do { int n = conf->copies; int cnt = 0; @@ -1988,7 +1941,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore) while (n--) { struct md_rdev *rdev; if (this != ignore && - (rdev = rcu_dereference(conf->mirrors[this].rdev)) && + (rdev = conf->mirrors[this].rdev) && test_bit(In_sync, &rdev->flags)) cnt++; this = (this+1) % disks; @@ -1999,7 +1952,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore) } while (first != 0); has_enough = 1; out: - rcu_read_unlock(); return has_enough; } @@ -2072,8 +2024,7 @@ static void print_conf(struct r10conf *conf) pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, conf->geo.raid_disks); - /* This is only called with ->reconfix_mutex held, so - * rcu protection of rdev is not needed */ + lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->geo.raid_disks; i++) { rdev = conf->mirrors[i].rdev; if (rdev) @@ -2190,7 +2141,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + WRITE_ONCE(p->rdev, rdev); break; } @@ -2204,7 +2155,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); + WRITE_ONCE(p->replacement, rdev); } print_conf(conf); @@ -2246,24 +2197,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - *rdevp = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } - } + WRITE_ONCE(*rdevp, NULL); if (p->replacement) { /* We must have just cleared 'rdev' */ - p->rdev = p->replacement; + WRITE_ONCE(p->rdev, p->replacement); clear_bit(Replacement, &p->replacement->flags); - smp_mb(); /* Make sure other CPUs may see both as identical - * but 
will never see neither -- if they are careful. - */ - p->replacement = NULL; + WRITE_ONCE(p->replacement, NULL); } clear_bit(WantReplacement, &rdev->flags); @@ -2655,42 +2594,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) } } -/* - * Used by fix_read_error() to decay the per rdev read_errors. - * We halve the read error count for every hour that has elapsed - * since the last recorded read error. - * - */ -static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) -{ - long cur_time_mon; - unsigned long hours_since_last; - unsigned int read_errors = atomic_read(&rdev->read_errors); - - cur_time_mon = ktime_get_seconds(); - - if (rdev->last_read_error == 0) { - /* first time we've seen a read error */ - rdev->last_read_error = cur_time_mon; - return; - } - - hours_since_last = (long)(cur_time_mon - - rdev->last_read_error) / 3600; - - rdev->last_read_error = cur_time_mon; - - /* - * if hours_since_last is > the number of bits in read_errors - * just set read errors to 0. We do this to avoid - * overflowing the shift of read_errors by hours_since_last. - */ - if (hours_since_last >= 8 * sizeof(read_errors)) - atomic_set(&rdev->read_errors, 0); - else - atomic_set(&rdev->read_errors, read_errors >> hours_since_last); -} - static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, int sectors, struct page *page, enum req_op op) { @@ -2728,7 +2631,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; - int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot @@ -2741,15 +2643,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 more fix_read_error() attempts */ return; - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - if (atomic_read(&rdev->read_errors) > max_read_errors) { - pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", - mdname(mddev), rdev->bdev, - atomic_read(&rdev->read_errors), max_read_errors); - pr_notice("md/raid10:%s: %pg: Failing raid device\n", - mdname(mddev), rdev->bdev); - md_error(mddev, rdev); + if (exceed_read_errors(mddev, rdev)) { r10_bio->devs[slot].bio = IO_BLOCKED; return; } @@ -2763,20 +2657,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (s > (PAGE_SIZE>>9)) s = PAGE_SIZE >> 9; - rcu_read_lock(); do { sector_t first_bad; int bad_sectors; d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -2784,7 +2676,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 conf->tmppage, REQ_OP_READ, false); rdev_dec_pending(rdev, mddev); - rcu_read_lock(); if (success) break; } @@ -2792,7 +2683,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (sl == conf->copies) sl = 0; } while (sl != slot); - rcu_read_unlock(); if (!success) { /* Cannot read from anywhere, just mark the block @@ -2816,20 +2706,18 @@ static void 
fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 start = sl; /* write it back and re-read */ - rcu_read_lock(); while (sl != slot) { if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -2848,7 +2736,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev->bdev); } rdev_dec_pending(rdev, mddev); - rcu_read_lock(); } sl = start; while (sl != slot) { @@ -2856,14 +2743,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -2891,9 +2777,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } rdev_dec_pending(rdev, mddev); - rcu_read_lock(); } - rcu_read_unlock(); sectors -= s; sect += s; @@ -3367,14 +3251,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Completed a full sync so the replacements * are now fully recovered. */ - rcu_read_lock(); for (i = 0; i < conf->geo.raid_disks; i++) { struct md_rdev *rdev = - rcu_dereference(conf->mirrors[i].replacement); + conf->mirrors[i].replacement; + if (rdev) rdev->recovery_offset = MaxSector; } - rcu_read_unlock(); } conf->fullsync = 0; } @@ -3455,9 +3338,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; - rcu_read_lock(); - mrdev = rcu_dereference(mirror->rdev); - mreplace = rcu_dereference(mirror->replacement); + mrdev = mirror->rdev; + mreplace = mirror->replacement; if (mrdev && (test_bit(Faulty, &mrdev->flags) || test_bit(In_sync, &mrdev->flags))) @@ -3465,22 +3347,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mreplace && test_bit(Faulty, &mreplace->flags)) mreplace = NULL; - if (!mrdev && !mreplace) { - rcu_read_unlock(); + if (!mrdev && !mreplace) continue; - } still_degraded = 0; /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); - if (sect >= mddev->resync_max_sectors) { + if (sect >= mddev->resync_max_sectors) /* last stripe is not complete - don't * try to recover this sector. 
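Stripped of the RCU churn, fix_read_error()'s strategy is unchanged: starting from the failing slot, try the remaining copies until one read succeeds, then rewrite the other copies with the good data and re-read to verify, failing any device whose rewrite fails. A heavily compressed sketch of that control flow over an in-memory set of copies; sync_read and the arrays are stand-ins, and it merges the kernel's separate write-back and re-read passes, which really work in PAGE_SIZE chunks with each rdev pinned via nr_pending:

#include <stdbool.h>
#include <stdio.h>

#define COPIES 3

static char data[COPIES]       = { '?', 'A', 'A' };	/* copy 0 is corrupt */
static bool read_fails[COPIES] = { true, false, false };

static bool sync_read(int d, char *out)	/* stands in for sync_page_io() */
{
	if (read_fails[d])
		return false;
	*out = data[d];
	return true;
}

int main(void)
{
	int slot = 0, d = slot;
	char good;

	/* 1. try the other copies until one read succeeds */
	do {
		d = (d + 1) % COPIES;
		if (sync_read(d, &good))
			break;
	} while (d != slot);

	if (d == slot) {
		puts("no copy readable: record a bad block or fail the device");
		return 1;
	}

	/* 2. rewrite the other copies with the good data, then re-read to
	 *    verify; the kernel fails a device whose rewrite also fails */
	for (int w = 0; w < COPIES; w++) {
		char check;

		if (w == d)
			continue;
		data[w] = good;			/* write-back             */
		read_fails[w] = false;		/* assume sector remapped */
		if (sync_read(w, &check) && check == good)
			printf("copy %d: corrected, re-read ok\n", w);
	}
	return 0;
}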
*/ - rcu_read_unlock(); continue; - } /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -3496,14 +3374,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * that there will never be anything to do here */ chunks_skipped = -1; - rcu_read_unlock(); continue; } if (mrdev) atomic_inc(&mrdev->nr_pending); if (mreplace) atomic_inc(&mreplace->nr_pending); - rcu_read_unlock(); r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; @@ -3522,10 +3398,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to check if the array will still be * degraded */ - rcu_read_lock(); for (j = 0; j < conf->geo.raid_disks; j++) { - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[j].rdev); + struct md_rdev *rdev = conf->mirrors[j].rdev; + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { still_degraded = 1; break; @@ -3540,8 +3415,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int k; int d = r10_bio->devs[j].devnum; sector_t from_addr, to_addr; - struct md_rdev *rdev = - rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t sector, first_bad; int bad_sectors; if (!rdev || @@ -3620,7 +3494,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&r10_bio->remaining); break; } - rcu_read_unlock(); if (j == conf->copies) { /* Cannot recover, so abort the recovery or * record a bad block */ @@ -3747,12 +3620,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[i].bio; bio->bi_status = BLK_STS_IOERR; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); + rdev = conf->mirrors[d].rdev; + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; - } + sector = r10_bio->devs[i].addr; if (is_badblock(rdev, sector, max_sync, &first_bad, &bad_sectors)) { @@ -3762,7 +3633,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) max_sync = bad_sectors; - rcu_read_unlock(); continue; } } @@ -3778,11 +3648,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_dev(bio, rdev->bdev); count++; - rdev = rcu_dereference(conf->mirrors[d].replacement); - if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); + rdev = conf->mirrors[d].replacement; + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; - } + atomic_inc(&rdev->nr_pending); /* Need to set up for writing to the replacement */ @@ -3799,7 +3668,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_set_dev(bio, rdev->bdev); count++; - rcu_read_unlock(); } if (count < 2) { @@ -4509,11 +4377,11 @@ static int calc_degraded(struct r10conf *conf) int degraded, degraded2; int i; - rcu_read_lock(); degraded = 0; /* 'prev' section first */ for (i = 0; i < conf->prev.raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) degraded++; else if (!test_bit(In_sync, &rdev->flags)) @@ -4523,13 +4391,12 @@ static int calc_degraded(struct r10conf *conf) */ degraded++; } - rcu_read_unlock(); if (conf->geo.raid_disks == 
conf->prev.raid_disks) return degraded; - rcu_read_lock(); degraded2 = 0; for (i = 0; i < conf->geo.raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) degraded2++; else if (!test_bit(In_sync, &rdev->flags)) { @@ -4542,7 +4409,6 @@ static int calc_degraded(struct r10conf *conf) degraded2++; } } - rcu_read_unlock(); if (degraded2 > degraded) return degraded2; return degraded; @@ -4974,16 +4840,15 @@ read_more: blist = read_bio; read_bio->bi_next = NULL; - rcu_read_lock(); for (s = 0; s < conf->copies*2; s++) { struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev2; if (s&1) { - rdev2 = rcu_dereference(conf->mirrors[d].replacement); + rdev2 = conf->mirrors[d].replacement; b = r10_bio->devs[s/2].repl_bio; } else { - rdev2 = rcu_dereference(conf->mirrors[d].rdev); + rdev2 = conf->mirrors[d].rdev; b = r10_bio->devs[s/2].bio; } if (!rdev2 || test_bit(Faulty, &rdev2->flags)) @@ -5017,7 +4882,6 @@ read_more: sector_nr += len >> 9; nr_sectors += len >> 9; } - rcu_read_unlock(); r10_bio->sectors = nr_sectors; /* Now submit the read */ @@ -5070,20 +4934,17 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev; - rcu_read_lock(); if (s&1) { - rdev = rcu_dereference(conf->mirrors[d].replacement); + rdev = conf->mirrors[d].replacement; b = r10_bio->devs[s/2].repl_bio; } else { - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; b = r10_bio->devs[s/2].bio; } - if (!rdev || test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); + if (!rdev || test_bit(Faulty, &rdev->flags)) continue; - } + atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; @@ -5154,10 +5015,9 @@ static int handle_reshape_read_error(struct mddev *mddev, if (s > (PAGE_SIZE >> 9)) s = PAGE_SIZE >> 9; - rcu_read_lock(); while (!success) { int d = r10b->devs[slot].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t addr; if (rdev == NULL || test_bit(Faulty, &rdev->flags) || @@ -5166,14 +5026,12 @@ static int handle_reshape_read_error(struct mddev *mddev, addr = r10b->devs[slot].addr + idx * PAGE_SIZE; atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); success = sync_page_io(rdev, addr, s << 9, pages[idx], REQ_OP_READ, false); rdev_dec_pending(rdev, mddev); - rcu_read_lock(); if (success) break; failed: @@ -5183,7 +5041,6 @@ static int handle_reshape_read_error(struct mddev *mddev, if (slot == first_slot) break; } - rcu_read_unlock(); if (!success) { /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, @@ -5209,12 +5066,8 @@ static void end_reshape_write(struct bio *bio) struct md_rdev *rdev = NULL; d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[d].replacement; - if (!rdev) { - smp_mb(); - rdev = conf->mirrors[d].rdev; - } + rdev = repl ? 
conf->mirrors[d].replacement : + conf->mirrors[d].rdev; if (bio->bi_status) { /* FIXME should record badblock */ @@ -5249,18 +5102,16 @@ static void raid10_finish_reshape(struct mddev *mddev) mddev->resync_max_sectors = mddev->array_sectors; } else { int d; - rcu_read_lock(); for (d = conf->geo.raid_disks ; d < conf->geo.raid_disks - mddev->delta_disks; d++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = conf->mirrors[d].rdev; if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[d].replacement); + rdev = conf->mirrors[d].replacement; if (rdev) clear_bit(In_sync, &rdev->flags); } - rcu_read_unlock(); } mddev->layout = mddev->new_layout; mddev->chunk_sectors = 1 << conf->geo.chunk_shift; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 6157f5beb9fe..874874fe4fa1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1890,28 +1890,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf, continue; /* in case device is broken */ - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[disk_index].rdev); + rdev = conf->disks[disk_index].rdev; if (rdev) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); sync_page_io(rdev, sh->sector, PAGE_SIZE, sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rdev, rdev->mddev); - rcu_read_lock(); } - rrdev = rcu_dereference(conf->disks[disk_index].replacement); + rrdev = conf->disks[disk_index].replacement; if (rrdev) { atomic_inc(&rrdev->nr_pending); - rcu_read_unlock(); sync_page_io(rrdev, sh->sector, PAGE_SIZE, sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rrdev, rrdev->mddev); - rcu_read_lock(); } - rcu_read_unlock(); } ctx->data_parity_stripes++; out: @@ -2948,7 +2942,6 @@ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) if (!log) return false; - WARN_ON_ONCE(!rcu_read_lock_held()); tree_index = r5c_tree_index(conf, sect); slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); return slot != NULL; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index eaea57aee602..da4ba736c4f0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -620,11 +620,9 @@ static void ppl_do_flush(struct ppl_io_unit *io) struct md_rdev *rdev; struct block_device *bdev = NULL; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) bdev = rdev->bdev; - rcu_read_unlock(); if (bdev) { struct bio *bio; @@ -882,9 +880,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)r_sector, dd_idx, (unsigned long long)sector); - /* Array has not started so rcu dereference is safe */ - rdev = rcu_dereference_protected( - conf->disks[dd_idx].rdev, 1); + rdev = conf->disks[dd_idx].rdev; if (!rdev || (!test_bit(In_sync, &rdev->flags) && sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", @@ -936,9 +932,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, 0, &disk, &sh); BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); - /* Array has not started so rcu dereference is safe */ - parity_rdev = rcu_dereference_protected( - conf->disks[sh.pd_idx].rdev, 1); + parity_rdev = conf->disks[sh.pd_idx].rdev; BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); pr_debug("%s:%*s write parity at sector %llu, disk %pg\n", @@ -1404,9 +1398,7 @@ int ppl_init_log(struct r5conf *conf) for (i = 0; i < ppl_conf->count; i++) { 
struct ppl_log *log = &ppl_conf->child_logs[i]; - /* Array has not started so rcu dereference is safe */ - struct md_rdev *rdev = - rcu_dereference_protected(conf->disks[i].rdev, 1); + struct md_rdev *rdev = conf->disks[i].rdev; mutex_init(&log->io_mutex); spin_lock_init(&log->io_list_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 26e1e8a5e941..8497880135ee 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,7 +36,6 @@ */ #include <linux/blkdev.h> -#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -694,12 +693,12 @@ int raid5_calc_degraded(struct r5conf *conf) int degraded, degraded2; int i; - rcu_read_lock(); degraded = 0; for (i = 0; i < conf->previous_raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = READ_ONCE(conf->disks[i].replacement); if (!rdev || test_bit(Faulty, &rdev->flags)) degraded++; else if (test_bit(In_sync, &rdev->flags)) @@ -717,15 +716,14 @@ int raid5_calc_degraded(struct r5conf *conf) if (conf->raid_disks >= conf->previous_raid_disks) degraded++; } - rcu_read_unlock(); if (conf->raid_disks == conf->previous_raid_disks) return degraded; - rcu_read_lock(); degraded2 = 0; for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = READ_ONCE(conf->disks[i].replacement); if (!rdev || test_bit(Faulty, &rdev->flags)) degraded2++; else if (test_bit(In_sync, &rdev->flags)) @@ -739,7 +737,6 @@ int raid5_calc_degraded(struct r5conf *conf) if (conf->raid_disks <= conf->previous_raid_disks) degraded2++; } - rcu_read_unlock(); if (degraded2 > degraded) return degraded2; return degraded; @@ -1184,14 +1181,8 @@ again: bi = &dev->req; rbi = &dev->rreq; /* For writing to replacement */ - rcu_read_lock(); - rrdev = rcu_dereference(conf->disks[i].replacement); - smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ - rdev = rcu_dereference(conf->disks[i].rdev); - if (!rdev) { - rdev = rrdev; - rrdev = NULL; - } + rdev = conf->disks[i].rdev; + rrdev = conf->disks[i].replacement; if (op_is_write(op)) { if (replace_only) rdev = NULL; @@ -1212,7 +1203,6 @@ again: rrdev = NULL; if (rrdev) atomic_inc(&rrdev->nr_pending); - rcu_read_unlock(); /* We have already checked bad blocks for reads. Now * need to check for writes. We never accept write errors @@ -2731,28 +2721,6 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = NULL; } -/* - * This helper wraps rcu_dereference_protected() and can be used when - * it is known that the nr_pending of the rdev is elevated. - */ -static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) -{ - return rcu_dereference_protected(rdev, - atomic_read(&rcu_access_pointer(rdev)->nr_pending)); -} - -/* - * This helper wraps rcu_dereference_protected() and should be used - * when it is known that the mddev_lock() is held. This is safe - * seeing raid5_remove_disk() has the same lock held. 
- */ -static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev, - struct md_rdev __rcu *rdev) -{ - return rcu_dereference_protected(rdev, - lockdep_is_held(&mddev->reconfig_mutex)); -} - static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; @@ -2778,9 +2746,9 @@ static void raid5_end_read_request(struct bio * bi) * In that case it moved down to 'rdev'. * rdev is not removed until all requests are finished. */ - rdev = rdev_pend_deref(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (!rdev) - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (use_new_offset(conf, sh)) s = sh->sector + rdev->new_data_offset; @@ -2893,11 +2861,11 @@ static void raid5_end_write_request(struct bio *bi) for (i = 0 ; i < disks; i++) { if (bi == &sh->dev[i].req) { - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; break; } if (bi == &sh->dev[i].rreq) { - rdev = rdev_pend_deref(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (rdev) replacement = 1; else @@ -2905,7 +2873,7 @@ static void raid5_end_write_request(struct bio *bi) * replaced it. rdev is not removed * until all requests are finished. */ - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; break; } } @@ -3667,15 +3635,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - struct md_rdev *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev && test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) atomic_inc(&rdev->nr_pending); else rdev = NULL; - rcu_read_unlock(); if (rdev) { if (!rdev_set_badblocks( rdev, @@ -3793,16 +3759,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, /* During recovery devices cannot be removed, so * locking and refcounting of rdevs is not needed */ - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; + if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) @@ -3810,7 +3777,6 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } - rcu_read_unlock(); if (abort) conf->recovery_disabled = conf->mddev->recovery_disabled; @@ -3823,15 +3789,13 @@ static int want_replace(struct stripe_head *sh, int disk_idx) struct md_rdev *rdev; int rv = 0; - rcu_read_lock(); - rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); + rdev = sh->raid_conf->disks[disk_idx].replacement; if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector || rdev->mddev->recovery_cp <= sh->sector)) rv = 1; - rcu_read_unlock(); return rv; } @@ -4708,7 +4672,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->log_failed = r5l_log_disk_error(conf); /* Now to look around and see what can be done */ - rcu_read_lock(); for (i=disks; i--; ) { struct md_rdev *rdev; sector_t first_bad; @@ -4753,7 +4716,7 @@ static void analyse_stripe(struct stripe_head *sh, struct 
stripe_head_state *s) /* Prefer to use the replacement for reads, but only * if it is recovered enough and has no bad blocks. */ - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (rdev && !test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), @@ -4764,7 +4727,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) set_bit(R5_NeedReplace, &dev->flags); else clear_bit(R5_NeedReplace, &dev->flags); - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; clear_bit(R5_ReadRepl, &dev->flags); } if (rdev && test_bit(Faulty, &rdev->flags)) @@ -4811,8 +4774,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_WriteError, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].rdev); + struct md_rdev *rdev2 = conf->disks[i].rdev; + if (rdev2 == rdev) clear_bit(R5_Insync, &dev->flags); if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { @@ -4824,8 +4787,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_MadeGood, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].rdev); + struct md_rdev *rdev2 = conf->disks[i].rdev; + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { s->handle_bad_blocks = 1; atomic_inc(&rdev2->nr_pending); @@ -4833,8 +4796,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) clear_bit(R5_MadeGood, &dev->flags); } if (test_bit(R5_MadeGoodRepl, &dev->flags)) { - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].replacement); + struct md_rdev *rdev2 = conf->disks[i].replacement; + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { s->handle_bad_blocks = 1; atomic_inc(&rdev2->nr_pending); @@ -4855,8 +4818,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; else if (!rdev) { - rdev = rcu_dereference( - conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; } @@ -4883,7 +4845,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) else s->replacing = 1; } - rcu_read_unlock(); } /* @@ -5340,23 +5301,23 @@ finish: struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_WriteError, &dev->flags)) { /* We own a safe reference to the rdev */ - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (!rdev_set_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { - rdev = rdev_pend_deref(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (!rdev) /* rdev have been moved down */ - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, 
conf->mddev); @@ -5515,24 +5476,22 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) &dd_idx, NULL); end_sector = sector + bio_sectors(raid_bio); - rcu_read_lock(); if (r5c_big_stripe_cached(conf, sector)) - goto out_rcu_unlock; + return 0; - rdev = rcu_dereference(conf->disks[dd_idx].replacement); + rdev = conf->disks[dd_idx].replacement; if (!rdev || test_bit(Faulty, &rdev->flags) || rdev->recovery_offset < end_sector) { - rdev = rcu_dereference(conf->disks[dd_idx].rdev); + rdev = conf->disks[dd_idx].rdev; if (!rdev) - goto out_rcu_unlock; + return 0; if (test_bit(Faulty, &rdev->flags) || !(test_bit(In_sync, &rdev->flags) || rdev->recovery_offset >= end_sector)) - goto out_rcu_unlock; + return 0; } atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, &bad_sectors)) { @@ -5576,10 +5535,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bio); return 1; - -out_rcu_unlock: - rcu_read_unlock(); - return 0; } static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) @@ -6582,14 +6537,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n * Note in case of > 1 drive failures it's possible we're rebuilding * one drive while leaving another faulty drive in array. */ - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = conf->disks[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; } - rcu_read_unlock(); md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); @@ -6820,18 +6773,7 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); - - /* - * Waiting on MD_SB_CHANGE_PENDING below may deadlock - * seeing md_check_recovery() is needed to clear - * the flag when using mdmon. - */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -7911,18 +7853,10 @@ static int raid5_run(struct mddev *mddev) for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; i++) { - rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); - if (!rdev && conf->disks[i].replacement) { - /* The replacement is all we have yet */ - rdev = rdev_mdlock_deref(mddev, - conf->disks[i].replacement); - conf->disks[i].replacement = NULL; - clear_bit(Replacement, &rdev->flags); - rcu_assign_pointer(conf->disks[i].rdev, rdev); - } + rdev = conf->disks[i].rdev; if (!rdev) continue; - if (rcu_access_pointer(conf->disks[i].replacement) && + if (conf->disks[i].replacement && conf->reshape_progress != MaxSector) { /* replacements and reshape simply do not mix. 
*/ pr_warn("md: cannot handle concurrent replacement and reshape.\n"); @@ -8106,15 +8040,16 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) struct r5conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); } - rcu_read_unlock(); seq_printf (seq, "]"); } @@ -8152,9 +8087,8 @@ static int raid5_spare_active(struct mddev *mddev) unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { - rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); - replacement = rdev_mdlock_deref(mddev, - conf->disks[i].replacement); + rdev = conf->disks[i].rdev; + replacement = conf->disks[i].replacement; if (replacement && replacement->recovery_offset == MaxSector && !test_bit(Faulty, &replacement->flags) @@ -8193,7 +8127,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r5conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct md_rdev __rcu **rdevp; + struct md_rdev **rdevp; struct disk_info *p; struct md_rdev *tmp; @@ -8216,9 +8150,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (unlikely(number >= conf->pool_size)) return 0; p = conf->disks + number; - if (rdev == rcu_access_pointer(p->rdev)) + if (rdev == p->rdev) rdevp = &p->rdev; - else if (rdev == rcu_access_pointer(p->replacement)) + else if (rdev == p->replacement) rdevp = &p->replacement; else return 0; @@ -8238,37 +8172,24 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (!test_bit(Faulty, &rdev->flags) && mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && - (!rcu_access_pointer(p->replacement) || - rcu_access_pointer(p->replacement) == rdev) && + (!p->replacement || p->replacement == rdev) && number < conf->raid_disks) { err = -EBUSY; goto abort; } - *rdevp = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - lockdep_assert_held(&mddev->reconfig_mutex); - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - rcu_assign_pointer(*rdevp, rdev); - } - } + WRITE_ONCE(*rdevp, NULL); if (!err) { err = log_modify(conf, rdev, false); if (err) goto abort; } - tmp = rcu_access_pointer(p->replacement); + tmp = p->replacement; if (tmp) { /* We must have just cleared 'rdev' */ - rcu_assign_pointer(p->rdev, tmp); + WRITE_ONCE(p->rdev, tmp); clear_bit(Replacement, &tmp->flags); - smp_mb(); /* Make sure other CPUs may see both as identical - * but will never see neither - if they are careful - */ - rcu_assign_pointer(p->replacement, NULL); + WRITE_ONCE(p->replacement, NULL); if (!err) err = log_modify(conf, tmp, true); @@ -8336,7 +8257,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = disk; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + WRITE_ONCE(p->rdev, rdev); err = log_modify(conf, rdev, true); @@ -8345,7 +8266,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) } for (disk = first; disk <= last; disk++) { p = conf->disks + disk; - tmp = 
rdev_mdlock_deref(mddev, p->rdev); + tmp = p->rdev; if (test_bit(WantReplacement, &tmp->flags) && mddev->reshape_position == MaxSector && p->replacement == NULL) { @@ -8354,7 +8275,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = disk; err = 0; conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); + WRITE_ONCE(p->replacement, rdev); break; } } @@ -8487,7 +8408,7 @@ static int raid5_start_reshape(struct mddev *mddev) if (mddev->recovery_cp < MaxSector) return -EBUSY; for (i = 0; i < conf->raid_disks; i++) - if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) + if (conf->disks[i].replacement) return -EBUSY; rdev_for_each(rdev, mddev) { @@ -8658,12 +8579,10 @@ static void raid5_finish_reshape(struct mddev *mddev) for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; d++) { - rdev = rdev_mdlock_deref(mddev, - conf->disks[d].rdev); + rdev = conf->disks[d].rdev; if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = rdev_mdlock_deref(mddev, - conf->disks[d].replacement); + rdev = conf->disks[d].replacement; if (rdev) clear_bit(In_sync, &rdev->flags); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 22bea20eccbd..9b5a7dc3f2a0 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -473,8 +473,8 @@ enum { */ struct disk_info { - struct md_rdev __rcu *rdev; - struct md_rdev __rcu *replacement; + struct md_rdev *rdev; + struct md_rdev *replacement; struct page *extra_page; /* extra page to use in prexor */ };
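The __rcu marker dropped in this last hunk is a sparse annotation with no runtime effect; removing it records that these pointers are no longer RCU-managed. After this series the convention across raid1, raid10, and raid5 is: writers update the slots with WRITE_ONCE() under the reconfig mutex, and the readers that hold only mddev->lock, the *_status() seq_file paths per the new lockdep_assert_held() calls, use READ_ONCE() for a single untorn load and tolerate NULL. A minimal userspace sketch of that reader shape, with READ_ONCE/WRITE_ONCE mapped onto C11 relaxed atomics and invented types:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(). */
#define READ_ONCE(x)	 atomic_load_explicit(&(x), memory_order_relaxed)
#define WRITE_ONCE(x, v) atomic_store_explicit(&(x), (v), memory_order_relaxed)

struct rdev { int in_sync; };

struct mirror { struct rdev *_Atomic rdev; };	/* was: struct md_rdev __rcu * */

static struct rdev d0 = { 1 }, d1 = { 1 };
static struct mirror mirrors[2];

/* Status line shaped like raid1_status()/raid5_status() after this diff. */
static void print_status(void)
{
	for (int i = 0; i < 2; i++) {
		struct rdev *rdev = READ_ONCE(mirrors[i].rdev);	/* one untorn load */

		printf("%s", rdev && rdev->in_sync ? "U" : "_");
	}
	printf("\n");
}

int main(void)
{
	mirrors[0].rdev = &d0;
	mirrors[1].rdev = &d1;
	print_status();				/* UU */
	WRITE_ONCE(mirrors[1].rdev, NULL);	/* remover holds reconfig_mutex in md */
	print_status();				/* U_: a NULL slot just prints "_" */
	return 0;
}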