path: root/drivers/md
author     Linus Torvalds    2022-05-23 14:04:14 -0700
committer  Linus Torvalds    2022-05-23 14:04:14 -0700
commit     5dc921868c507c1f0835932d3f255cf1b7415618 (patch)
tree       3b458e2a75c36722fdb71f47d73e6f27fceae02a /drivers/md
parent     115cd47132d71bd7e4aa1093e15d861a59e73a94 (diff)
parent     537b9f2bf60f4bbd8ab89cea16aaab70f0c1560d (diff)
Merge tag 'for-5.19/drivers-2022-05-22' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
 "Here are the driver updates queued up for 5.19. This contains:

   - NVMe pull requests via Christoph:
       - tighten the PCI presence check (Stefan Roese)
       - fix a potential NULL pointer dereference in an error path (Kyle Miller Smith)
       - fix interpretation of the DMRSL field (Tom Yan)
       - relax the data transfer alignment (Keith Busch)
       - verbose error logging improvements (Max Gurtovoy, Chaitanya Kulkarni)
       - misc cleanups (Chaitanya Kulkarni, Christoph)
       - set non-mdts limits in nvme_scan_work (Chaitanya Kulkarni)
       - add support for TP4084 - Time-to-Ready Enhancements (Christoph)

   - MD pull request via Song:
       - Improve annotation in raid5 code, by Logan Gunthorpe
       - Support MD_BROKEN flag in raid-1/5/10, by Mariusz Tkaczyk
       - Other small fixes/cleanups

   - null_blk series making the configfs side much saner (Damien)

   - Various minor drbd cleanups and fixes (Haowen, Uladzislau, Jiapeng, Arnd, Cai)

   - Avoid using the system workqueue (and hence flushing it) in rnbd (Jack)

   - Avoid using the system workqueue (and hence flushing it) in aoe (Tetsuo)

   - Series fixing discard_alignment issues in drivers (Christoph)

   - Small series fixing drivers poking at disk->part0 for openers information (Christoph)

   - Series fixing deadlocks in loop (Christoph, Tetsuo)

   - Remove loop.h and add SPDX headers (Christoph)

   - Various fixes and cleanups (Julia, Xie, Yu)"

* tag 'for-5.19/drivers-2022-05-22' of git://git.kernel.dk/linux-block: (72 commits)
  mtip32xx: fix typo in comment
  nvme: set non-mdts limits in nvme_scan_work
  nvme: add support for TP4084 - Time-to-Ready Enhancements
  nvme: split the enum used for various register constants
  nbd: Fix hung on disconnect request if socket is closed before
  nvme-fabrics: add a request timeout helper
  nvme-pci: harden drive presence detect in nvme_dev_disable()
  nvme-pci: fix a NULL pointer dereference in nvme_alloc_admin_tags
  nvme: mark internal passthru request RQF_QUIET
  nvme: remove unneeded include from constants file
  nvme: add missing status values to verbose logging
  nvme: set dma alignment to dword
  nvme: fix interpretation of DMRSL
  loop: remove most the top-of-file boilerplate comment from the UAPI header
  loop: remove most the top-of-file boilerplate comment
  loop: add a SPDX header
  loop: remove loop.h
  block: null_blk: Improve device creation with configfs
  block: null_blk: Cleanup messages
  block: null_blk: Cleanup device creation and deletion
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-zoned-target.c |   2
-rw-r--r--  drivers/md/md-bitmap.c       |  45
-rw-r--r--  drivers/md/md-cluster.c      |   2
-rw-r--r--  drivers/md/md.c              |  62
-rw-r--r--  drivers/md/md.h              |  62
-rw-r--r--  drivers/md/raid0.c           |  31
-rw-r--r--  drivers/md/raid1.c           |  43
-rw-r--r--  drivers/md/raid10.c          |  40
-rw-r--r--  drivers/md/raid5-ppl.c       |  13
-rw-r--r--  drivers/md/raid5.c           | 227
-rw-r--r--  drivers/md/raid5.h           |  23
11 files changed, 322 insertions, 228 deletions
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index cac295cc8840..0ec5d8b9b1a4 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1001,7 +1001,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);
- limits->discard_alignment = DMZ_BLOCK_SIZE;
+ limits->discard_alignment = 0;
limits->discard_granularity = DMZ_BLOCK_SIZE;
limits->max_discard_sectors = chunk_sectors;
limits->max_hw_discard_sectors = chunk_sectors;
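
The dm-zoned hunk belongs to the discard_alignment series noted in the merge message: discard_alignment is an offset (where the first discard-aligned byte sits relative to the start of the device), not a second granularity, so mirroring DMZ_BLOCK_SIZE into it was wrong and 0 is the correct value here. A rough sketch of how the two limits combine, with an illustrative helper name rather than the in-tree code:

/* Illustrative only: how discard_granularity/discard_alignment relate.
 * The first aligned byte is at 'alignment', then every 'granularity'. */
static bool sector_discard_aligned(sector_t sector,
				   const struct queue_limits *lim)
{
	u64 bytes = (u64)sector << SECTOR_SHIFT;

	if (!lim->discard_granularity)
		return false;
	return (bytes - lim->discard_alignment) % lim->discard_granularity == 0;
}
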
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index bfd6026d7809..d87f674ab762 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -639,14 +639,6 @@ re_read:
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
- /* Setup nodes/clustername only if bitmap version is
- * cluster-compatible
- */
- if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
- nodes = le32_to_cpu(sb->nodes);
- strlcpy(bitmap->mddev->bitmap_info.cluster_name,
- sb->cluster_name, 64);
- }
/* verify that the bitmap-specific fields are valid */
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -668,6 +660,16 @@ re_read:
goto out;
}
+ /*
+ * Setup nodes/clustername only if bitmap version is
+ * cluster-compatible
+ */
+ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
+ nodes = le32_to_cpu(sb->nodes);
+ strscpy(bitmap->mddev->bitmap_info.cluster_name,
+ sb->cluster_name, 64);
+ }
+
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
@@ -695,14 +697,13 @@ re_read:
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
- strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
err = 0;
out:
kunmap_atomic(sb);
- /* Assigning chunksize is required for "re_read" */
- bitmap->mddev->bitmap_info.chunksize = chunksize;
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
+ /* Assigning chunksize is required for "re_read" */
+ bitmap->mddev->bitmap_info.chunksize = chunksize;
err = md_setup_cluster(bitmap->mddev, nodes);
if (err) {
pr_warn("%s: Could not setup cluster service (%d)\n",
@@ -713,18 +714,18 @@ out:
goto re_read;
}
-
out_no_sb:
- if (test_bit(BITMAP_STALE, &bitmap->flags))
- bitmap->events_cleared = bitmap->mddev->events;
- bitmap->mddev->bitmap_info.chunksize = chunksize;
- bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
- bitmap->mddev->bitmap_info.max_write_behind = write_behind;
- bitmap->mddev->bitmap_info.nodes = nodes;
- if (bitmap->mddev->bitmap_info.space == 0 ||
- bitmap->mddev->bitmap_info.space > sectors_reserved)
- bitmap->mddev->bitmap_info.space = sectors_reserved;
- if (err) {
+ if (err == 0) {
+ if (test_bit(BITMAP_STALE, &bitmap->flags))
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->mddev->bitmap_info.chunksize = chunksize;
+ bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+ bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+ bitmap->mddev->bitmap_info.nodes = nodes;
+ if (bitmap->mddev->bitmap_info.space == 0 ||
+ bitmap->mddev->bitmap_info.space > sectors_reserved)
+ bitmap->mddev->bitmap_info.space = sectors_reserved;
+ } else {
md_bitmap_print_sb(bitmap);
if (bitmap->cluster_slot < 0)
md_cluster_stop(bitmap->mddev);
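
The strlcpy() to strscpy() conversions in this file (and in md-cluster.c and md.c below) track the tree-wide strlcpy deprecation. The behavioural difference, sketched under the assumption of the lib/string.c semantics: strlcpy() returns strlen(src) and so must read all of the source even when truncating, while strscpy() bounds the read and signals truncation explicitly.

/* Sketch: strscpy() returns the number of characters copied,
 * or -E2BIG if the source did not fit. */
char name[64];
ssize_t n = strscpy(name, sb->cluster_name, sizeof(name));

if (n == -E2BIG)
	pr_warn("cluster name truncated to %zu bytes\n", sizeof(name) - 1);
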
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1c8a06b77c85..37cbcce3cc66 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -201,7 +201,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
goto out_err;
}
- strlcpy(res->name, name, namelen + 1);
+ strscpy(res->name, name, namelen + 1);
if (with_lvb) {
res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
if (!res->lksb.sb_lvbptr) {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2587f872c088..707e802d0082 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2627,14 +2627,16 @@ static void sync_sbs(struct mddev *mddev, int nospares)
static bool does_sb_need_changing(struct mddev *mddev)
{
- struct md_rdev *rdev;
+ struct md_rdev *rdev = NULL, *iter;
struct mdp_superblock_1 *sb;
int role;
/* Find a good rdev */
- rdev_for_each(rdev, mddev)
- if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+ rdev_for_each(iter, mddev)
+ if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
+ rdev = iter;
break;
+ }
/* No good device found. */
if (!rdev)
@@ -2645,11 +2647,11 @@ static bool does_sb_need_changing(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
/* Device activated? */
- if (role == 0xffff && rdev->raid_disk >=0 &&
+ if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags))
return true;
/* Device turned faulty? */
- if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
+ if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
return true;
}
@@ -2984,10 +2986,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
md_error(rdev->mddev, rdev);
- if (test_bit(Faulty, &rdev->flags))
- err = 0;
- else
+
+ if (test_bit(MD_BROKEN, &rdev->mddev->flags))
err = -EBUSY;
+ else
+ err = 0;
} else if (cmd_match(buf, "remove")) {
if (rdev->mddev->pers) {
clear_bit(Blocked, &rdev->flags);
@@ -4028,7 +4031,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
oldpriv = mddev->private;
mddev->pers = pers;
mddev->private = priv;
- strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
mddev->level = mddev->new_level;
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
@@ -4353,10 +4356,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* like active, but no writes have been seen for a while (100msec).
*
* broken
- * RAID0/LINEAR-only: same as clean, but array is missing a member.
- * It's useful because RAID0/LINEAR mounted-arrays aren't stopped
- * when a member is gone, so this state will at least alert the
- * user that something is wrong.
+ * Array is failed. It's useful because mounted arrays aren't stopped
+ * when the array fails, so this state will at least alert the user that
+ * something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
write_pending, active_idle, broken, bad_word};
@@ -5763,7 +5765,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
len--;
if (len >= DISK_NAME_LEN)
return -E2BIG;
- strlcpy(buf, val, len+1);
+ strscpy(buf, val, len+1);
if (strncmp(buf, "md_", 3) == 0)
return md_alloc(0, buf);
if (strncmp(buf, "md", 2) == 0 &&
@@ -5896,7 +5898,7 @@ int md_run(struct mddev *mddev)
mddev->level = pers->level;
mddev->new_level = pers->level;
}
- strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
@@ -7443,7 +7445,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
err = -ENODEV;
else {
md_error(mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
+ if (test_bit(MD_BROKEN, &mddev->flags))
err = -EBUSY;
}
rcu_read_unlock();
@@ -7984,13 +7986,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
if (!mddev->pers || !mddev->pers->error_handler)
return;
- mddev->pers->error_handler(mddev,rdev);
- if (mddev->degraded)
+ mddev->pers->error_handler(mddev, rdev);
+
+ if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
+ if (!test_bit(MD_BROKEN, &mddev->flags)) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work);
md_new_event();
@@ -9670,7 +9675,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
if (test_bit(Candidate, &rdev2->flags)) {
- if (role == 0xfffe) {
+ if (role == MD_DISK_ROLE_FAULTY) {
pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
md_kick_rdev_from_array(rdev2);
continue;
@@ -9683,7 +9688,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
/*
* got activated except reshape is happening.
*/
- if (rdev2->raid_disk == -1 && role != 0xffff &&
+ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) {
rdev2->saved_raid_disk = role;
@@ -9700,7 +9705,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
* as faulty. The recovery is performed by the
* one who initiated the error.
*/
- if ((role == 0xfffe) || (role == 0xfffd)) {
+ if (role == MD_DISK_ROLE_FAULTY ||
+ role == MD_DISK_ROLE_JOURNAL) {
md_error(mddev, rdev2);
clear_bit(Blocked, &rdev2->flags);
}
@@ -9790,16 +9796,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
void md_reload_sb(struct mddev *mddev, int nr)
{
- struct md_rdev *rdev;
+ struct md_rdev *rdev = NULL, *iter;
int err;
/* Find the rdev */
- rdev_for_each_rcu(rdev, mddev) {
- if (rdev->desc_nr == nr)
+ rdev_for_each_rcu(iter, mddev) {
+ if (iter->desc_nr == nr) {
+ rdev = iter;
break;
+ }
}
- if (!rdev || rdev->desc_nr != nr) {
+ if (!rdev) {
pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
return;
}
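
The does_sb_need_changing() and md_reload_sb() rewrites above apply the same list-iterator hardening pattern: after rdev_for_each()/rdev_for_each_rcu() completes without a break, the iterator variable does not point at a real md_rdev, so only a separately assigned result pointer may be dereferenced. The pattern in isolation:

/* Sketch of the find pattern: only 'found' is valid after the loop; on
 * no match, 'iter' is container_of() applied to the list head itself. */
struct md_rdev *found = NULL, *iter;

rdev_for_each(iter, mddev) {
	if (iter->desc_nr == nr) {
		found = iter;
		break;
	}
}
if (!found)
	return;
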
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6ac283864533..cf2cbb17acbd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
struct md_cluster_info;
-/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */
+/**
+ * enum mddev_flags - md device flags.
+ * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
+ * @MD_CLOSING: If set, we are closing the array, do not open it then.
+ * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
+ * @MD_HAS_JOURNAL: The raid array has journal feature set.
+ * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only; the node has already taken
+ *                            the resync lock and must release it.
+ * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
+ * calls to md_error() will never cause the array to
+ * become failed.
+ * @MD_HAS_PPL: The raid array has PPL feature set.
+ * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
+ * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
+ * without taking reconfig_mutex.
+ * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
+ * explicitly holding reconfig_mutex.
+ * @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report
+ *                that the array is ready yet.
+ * @MD_BROKEN: Used to stop writes and mark the array as failed.
+ *
+ * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
+ */
enum mddev_flags {
- MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
- MD_CLOSING, /* If set, we are closing the array, do not open
- * it then */
- MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
- MD_HAS_JOURNAL, /* The raid array has journal feature set */
- MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
- * already took resync lock, need to
- * release the lock */
- MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is
- * supported as calls to md_error() will
- * never cause the array to become failed.
- */
- MD_HAS_PPL, /* The raid array has PPL feature set */
- MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */
- MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update
- * the metadata without taking reconfig_mutex.
- */
- MD_UPDATING_SB, /* md_check_recovery is updating the metadata
- * without explicitly holding reconfig_mutex.
- */
- MD_NOT_READY, /* do_md_run() is active, so 'array_state'
- * must not report that array is ready yet
- */
- MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
- * I/O in case an array member is gone/failed.
- */
+ MD_ARRAY_FIRST_USE,
+ MD_CLOSING,
+ MD_JOURNAL_CLEAN,
+ MD_HAS_JOURNAL,
+ MD_CLUSTER_RESYNC_LOCKED,
+ MD_FAILFAST_SUPPORTED,
+ MD_HAS_PPL,
+ MD_HAS_MULTIPLE_PPLS,
+ MD_ALLOW_SB_UPDATE,
+ MD_UPDATING_SB,
+ MD_NOT_READY,
+ MD_BROKEN,
};
enum mddev_sb_flags {
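
With this series, MD_BROKEN becomes the generic "array has failed" marker rather than a RAID0/LINEAR special case: the personality error handlers below set it, and core md paths such as state_store() and set_disk_faulty() above consult it. A condensed sketch of the contract, with the failure condition left abstract:

/* Personality side (see raid1_error()/raid10_error()/raid5_error()): */
if (failure_would_fail_array)		/* illustrative condition */
	set_bit(MD_BROKEN, &mddev->flags);

/* Core side: report failure instead of scheduling recovery */
if (test_bit(MD_BROKEN, &mddev->flags))
	err = -EBUSY;
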
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7231f5e1eaa7..e11701e394ca 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -128,21 +128,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
- if (conf->nr_strip_zones == 1) {
- conf->layout = RAID0_ORIG_LAYOUT;
- } else if (mddev->layout == RAID0_ORIG_LAYOUT ||
- mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
- conf->layout = mddev->layout;
- } else if (default_layout == RAID0_ORIG_LAYOUT ||
- default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
- conf->layout = default_layout;
- } else {
- pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
- mdname(mddev));
- pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
- err = -ENOTSUPP;
- goto abort;
- }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
@@ -273,6 +258,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
(unsigned long long)smallest->sectors);
}
+ if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (mddev->layout == RAID0_ORIG_LAYOUT ||
+ mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = mddev->layout;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
+ err = -EOPNOTSUPP;
+ goto abort;
+ }
+
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
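
Besides moving the layout selection after the zones are built (so it can also pick RAID0_ORIG_LAYOUT when the second zone has a single device), the raid0 hunk replaces -ENOTSUPP with -EOPNOTSUPP: ENOTSUPP (524) is kernel-internal and has no name in userspace headers. A tiny userspace sketch of why a leaked value matters:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 524 is the kernel-internal ENOTSUPP; glibc cannot name it */
	printf("524:        %s\n", strerror(524));
	printf("EOPNOTSUPP: %s\n", strerror(EOPNOTSUPP));
	return 0;
}
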
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5aed2c8b746e..99d5af1362d7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1641,30 +1641,39 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "]");
}
+/**
+ * raid1_error() - RAID1 error handler.
+ * @mddev: affected md device.
+ * @rdev: member device to fail.
+ *
+ * The routine acknowledges &rdev failure and determines new @mddev state.
+ * If the array is now failed:
+ *   - the &MD_BROKEN flag is set in &mddev->flags.
+ *   - recovery is disabled.
+ * Otherwise, the array must be degraded:
+ *   - recovery is interrupted.
+ *   - &mddev->degraded is bumped.
+ *
+ * @rdev is marked as &Faulty except when the array is failed and
+ * &mddev->fail_last_dev is off.
+ */
static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private;
unsigned long flags;
- /*
- * If it is not operational, then we have already marked it as dead
- * else if it is the last working disks with "fail_last_dev == false",
- * ignore the error, let the next level up know.
- * else mark the drive as failed
- */
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
- && (conf->raid_disks - mddev->degraded) == 1) {
- /*
- * Don't fail the drive, act as though we were just a
- * normal single drive.
- * However don't try a recovery from this drive as
- * it is very likely to fail.
- */
- conf->recovery_disabled = mddev->recovery_disabled;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- return;
+
+ if (test_bit(In_sync, &rdev->flags) &&
+ (conf->raid_disks - mddev->degraded) == 1) {
+ set_bit(MD_BROKEN, &mddev->flags);
+
+ if (!mddev->fail_last_dev) {
+ conf->recovery_disabled = mddev->recovery_disabled;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ return;
+ }
}
set_bit(Blocked, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags))
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 834eb3ba95a6..dfa576cdf11c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore)
_enough(conf, 1, ignore);
}
+/**
+ * raid10_error() - RAID10 error handler.
+ * @mddev: affected md device.
+ * @rdev: member device to fail.
+ *
+ * The routine acknowledges &rdev failure and determines new @mddev state.
+ * If the array is now failed:
+ *   - the &MD_BROKEN flag is set in &mddev->flags.
+ * Otherwise, the array must be degraded:
+ *   - recovery is interrupted.
+ *   - &mddev->degraded is bumped.
+ *
+ * @rdev is marked as &Faulty except when the array is failed and
+ * &mddev->fail_last_dev is off.
+ */
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r10conf *conf = mddev->private;
unsigned long flags;
- /*
- * If it is not operational, then we have already marked it as dead
- * else if it is the last working disks with "fail_last_dev == false",
- * ignore the error, let the next level up know.
- * else mark the drive as failed
- */
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
- && !enough(conf, rdev->raid_disk)) {
- /*
- * Don't fail the drive, just return an IO error.
- */
- spin_unlock_irqrestore(&conf->device_lock, flags);
- return;
+
+ if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
+ set_bit(MD_BROKEN, &mddev->flags);
+
+ if (!mddev->fail_last_dev) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ return;
+ }
}
if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- /*
- * If recovery is running, make sure it aborts.
- */
+
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
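
raid1_error() and raid10_error() now share one shape: under device_lock, decide whether this failure fails the whole array, set MD_BROKEN if so, and only skip marking the member Faulty when fail_last_dev is off. Condensed, with the per-level array-failure predicate abstracted (raid1 uses the last-working-disk test, raid10 uses !enough()):

/* Condensed sketch; array_would_fail() stands in for the per-level test */
spin_lock_irqsave(&conf->device_lock, flags);

if (test_bit(In_sync, &rdev->flags) && array_would_fail(conf, rdev)) {
	set_bit(MD_BROKEN, &mddev->flags);

	if (!mddev->fail_last_dev) {
		/* keep the last member; MD_BROKEN alone alerts the user */
		spin_unlock_irqrestore(&conf->device_lock, flags);
		return;
	}
}
/* otherwise degrade: bump mddev->degraded, mark rdev Blocked+Faulty */
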
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index d3962d92df18..55d065a87b89 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -883,7 +883,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)r_sector, dd_idx,
(unsigned long long)sector);
- rdev = conf->disks[dd_idx].rdev;
+ /* Array has not started so rcu dereference is safe */
+ rdev = rcu_dereference_protected(
+ conf->disks[dd_idx].rdev, 1);
if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n",
@@ -934,7 +936,10 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
parity_sector = raid5_compute_sector(conf, r_sector_first + i,
0, &disk, &sh);
BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
- parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+ /* Array has not started so rcu dereference is safe */
+ parity_rdev = rcu_dereference_protected(
+ conf->disks[sh.pd_idx].rdev, 1);
BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
@@ -1404,7 +1409,9 @@ int ppl_init_log(struct r5conf *conf)
for (i = 0; i < ppl_conf->count; i++) {
struct ppl_log *log = &ppl_conf->child_logs[i];
- struct md_rdev *rdev = conf->disks[i].rdev;
+ /* Array has not started so rcu dereference is safe */
+ struct md_rdev *rdev =
+ rcu_dereference_protected(conf->disks[i].rdev, 1);
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
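
Because conf->disks[i].rdev becomes an __rcu pointer in this series (see the raid5.h hunk at the end), even single-threaded paths like PPL recovery and ppl_init_log() must state why a plain load is safe; rcu_dereference_protected(ptr, 1) is the idiom for "no concurrent updaters can exist yet". A sketch of the three access forms used across the series:

/* Sketch: the three ways an __rcu rdev pointer is read in this series */
struct md_rdev *r;

rcu_read_lock();
r = rcu_dereference(disk->rdev);		/* plain reader */
rcu_read_unlock();

r = rcu_dereference_protected(disk->rdev, 1);	/* array not started yet */

r = rcu_dereference_protected(disk->rdev,	/* updater, lock held */
			      lockdep_is_held(&mddev->reconfig_mutex));
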
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59f91e392a2a..39038fa8b1c8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -79,18 +79,21 @@ static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+ __acquires(&conf->device_lock)
{
spin_lock_irq(conf->hash_locks + hash);
spin_lock(&conf->device_lock);
}
static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+ __releases(&conf->device_lock)
{
spin_unlock(&conf->device_lock);
spin_unlock_irq(conf->hash_locks + hash);
}
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+ __acquires(&conf->device_lock)
{
int i;
spin_lock_irq(conf->hash_locks);
@@ -100,6 +103,7 @@ static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
}
static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+ __releases(&conf->device_lock)
{
int i;
spin_unlock(&conf->device_lock);
@@ -164,6 +168,7 @@ static bool stripe_is_lowprio(struct stripe_head *sh)
}
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+ __must_hold(&sh->raid_conf->device_lock)
{
struct r5conf *conf = sh->raid_conf;
struct r5worker_group *group;
@@ -211,6 +216,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
int i;
int injournal = 0; /* number of data pages with R5_InJournal */
@@ -296,6 +302,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
if (atomic_dec_and_test(&sh->count))
do_release_stripe(conf, sh, temp_inactive_list);
@@ -350,9 +357,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
}
}
-/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *sh, *t;
int count = 0;
@@ -629,6 +636,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* This is because some failed devices may only affect one
* of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices.
+ *
+ * Most calls to this function hold &conf->device_lock. Calls
+ * in raid5_run() do not require the lock as no other threads
+ * have been started yet.
*/
int raid5_calc_degraded(struct r5conf *conf)
{
@@ -686,17 +697,17 @@ int raid5_calc_degraded(struct r5conf *conf)
return degraded;
}
-static int has_failed(struct r5conf *conf)
+static bool has_failed(struct r5conf *conf)
{
- int degraded;
+ int degraded = conf->mddev->degraded;
- if (conf->mddev->reshape_position == MaxSector)
- return conf->mddev->degraded > conf->max_degraded;
+ if (test_bit(MD_BROKEN, &conf->mddev->flags))
+ return true;
- degraded = raid5_calc_degraded(conf);
- if (degraded > conf->max_degraded)
- return 1;
- return 0;
+ if (conf->mddev->reshape_position != MaxSector)
+ degraded = raid5_calc_degraded(conf);
+
+ return degraded > conf->max_degraded;
}
struct stripe_head *
@@ -2648,6 +2659,28 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
+/*
+ * This helper wraps rcu_dereference_protected() and can be used when
+ * it is known that the nr_pending of the rdev is elevated.
+ */
+static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
+{
+ return rcu_dereference_protected(rdev,
+ atomic_read(&rcu_access_pointer(rdev)->nr_pending));
+}
+
+/*
+ * This helper wraps rcu_dereference_protected() and should be used
+ * when it is known that the mddev_lock() is held. This is safe
+ * seeing raid5_remove_disk() has the same lock held.
+ */
+static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
+ struct md_rdev __rcu *rdev)
+{
+ return rcu_dereference_protected(rdev,
+ lockdep_is_held(&mddev->reconfig_mutex));
+}
+
static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
@@ -2674,9 +2707,9 @@ static void raid5_end_read_request(struct bio * bi)
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
@@ -2790,11 +2823,11 @@ static void raid5_end_write_request(struct bio *bi)
for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
if (bi == &sh->dev[i].rreq) {
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (rdev)
replacement = 1;
else
@@ -2802,7 +2835,7 @@ static void raid5_end_write_request(struct bio *bi)
* replaced it. rdev is not removed
* until all requests are finished.
*/
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
}
@@ -2863,34 +2896,31 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
unsigned long flags;
pr_debug("raid456: error called\n");
+ pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
+ mdname(mddev), bdevname(rdev->bdev, b));
+
spin_lock_irqsave(&conf->device_lock, flags);
+ set_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ mddev->degraded = raid5_calc_degraded(conf);
- if (test_bit(In_sync, &rdev->flags) &&
- mddev->degraded == conf->max_degraded) {
- /*
- * Don't allow to achieve failed state
- * Don't try to recover this device
- */
+ if (has_failed(conf)) {
+ set_bit(MD_BROKEN, &conf->mddev->flags);
conf->recovery_disabled = mddev->recovery_disabled;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- return;
+
+ pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
+ mdname(mddev), mddev->degraded, conf->raid_disks);
+ } else {
+ pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), conf->raid_disks - mddev->degraded);
}
- set_bit(Faulty, &rdev->flags);
- clear_bit(In_sync, &rdev->flags);
- mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
- pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
- "md/raid:%s: Operation continuing on %d devices.\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- mdname(mddev),
- conf->raid_disks - mddev->degraded);
r5c_update_on_rdev_error(mddev, rdev);
}
@@ -5213,23 +5243,23 @@ finish:
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
/* rdev have been moved down */
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
@@ -5256,6 +5286,7 @@ finish:
}
static void raid5_activate_delayed(struct r5conf *conf)
+ __must_hold(&conf->device_lock)
{
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
while (!list_empty(&conf->delayed_list)) {
@@ -5273,9 +5304,9 @@ static void raid5_activate_delayed(struct r5conf *conf)
}
static void activate_bit_delay(struct r5conf *conf,
- struct list_head *temp_inactive_list)
+ struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
- /* device_lock is held */
struct list_head head;
list_add(&head, &conf->bitmap_list);
list_del_init(&conf->bitmap_list);
@@ -5500,6 +5531,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
* handle_list.
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
@@ -6288,7 +6320,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
*/
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
@@ -6371,8 +6403,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
static int handle_active_stripes(struct r5conf *conf, int group,
struct r5worker *worker,
struct list_head *temp_inactive_list)
- __releases(&conf->device_lock)
- __acquires(&conf->device_lock)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0, hash;
@@ -7166,7 +7197,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
int i;
int group_cnt;
struct r5worker_group *new_group;
- int ret;
+ int ret = -ENOMEM;
if (mddev->new_level != 5
&& mddev->new_level != 4
@@ -7225,6 +7256,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
+
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
@@ -7302,11 +7334,13 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
- if (raid5_alloc_percpu(conf) != 0)
+ ret = raid5_alloc_percpu(conf);
+ if (ret)
goto abort;
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
+ ret = -EIO;
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
@@ -7317,11 +7351,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
- disk->replacement = rdev;
+ RCU_INIT_POINTER(disk->replacement, rdev);
} else {
if (disk->rdev)
goto abort;
- disk->rdev = rdev;
+ RCU_INIT_POINTER(disk->rdev, rdev);
}
if (test_bit(In_sync, &rdev->flags)) {
@@ -7370,6 +7404,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (grow_stripes(conf, conf->min_nr_stripes)) {
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
+ ret = -ENOMEM;
goto abort;
} else
pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
@@ -7383,7 +7418,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
- if (register_shrinker(&conf->shrinker)) {
+ ret = register_shrinker(&conf->shrinker);
+ if (ret) {
pr_warn("md/raid:%s: couldn't register shrinker.\n",
mdname(mddev));
goto abort;
@@ -7394,17 +7430,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (!conf->thread) {
pr_warn("md/raid:%s: couldn't allocate thread.\n",
mdname(mddev));
+ ret = -ENOMEM;
goto abort;
}
return conf;
abort:
- if (conf) {
+ if (conf)
free_conf(conf);
- return ERR_PTR(-EIO);
- } else
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
@@ -7621,17 +7656,18 @@ static int raid5_run(struct mddev *mddev)
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
if (!rdev && conf->disks[i].replacement) {
/* The replacement is all we have yet */
- rdev = conf->disks[i].replacement;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[i].replacement);
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
- conf->disks[i].rdev = rdev;
+ rcu_assign_pointer(conf->disks[i].rdev, rdev);
}
if (!rdev)
continue;
- if (conf->disks[i].replacement &&
+ if (rcu_access_pointer(conf->disks[i].replacement) &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
@@ -7749,7 +7785,6 @@ static int raid5_run(struct mddev *mddev)
*/
stripe = stripe * PAGE_SIZE;
stripe = roundup_pow_of_two(stripe);
- mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
@@ -7828,8 +7863,8 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
static void print_raid5_conf (struct r5conf *conf)
{
+ struct md_rdev *rdev;
int i;
- struct disk_info *tmp;
pr_debug("RAID conf printout:\n");
if (!conf) {
@@ -7840,50 +7875,54 @@ static void print_raid5_conf (struct r5conf *conf)
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
+ rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
- tmp = conf->disks + i;
- if (tmp->rdev)
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev)
pr_debug(" disk %d, o:%d, dev:%s\n",
- i, !test_bit(Faulty, &tmp->rdev->flags),
- bdevname(tmp->rdev->bdev, b));
+ i, !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev, b));
}
+ rcu_read_unlock();
}
static int raid5_spare_active(struct mddev *mddev)
{
int i;
struct r5conf *conf = mddev->private;
- struct disk_info *tmp;
+ struct md_rdev *rdev, *replacement;
int count = 0;
unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->disks + i;
- if (tmp->replacement
- && tmp->replacement->recovery_offset == MaxSector
- && !test_bit(Faulty, &tmp->replacement->flags)
- && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
+ replacement = rdev_mdlock_deref(mddev,
+ conf->disks[i].replacement);
+ if (replacement
+ && replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &replacement->flags)
+ && !test_and_set_bit(In_sync, &replacement->flags)) {
/* Replacement has just become active. */
- if (!tmp->rdev
- || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ if (!rdev
+ || !test_and_clear_bit(In_sync, &rdev->flags))
count++;
- if (tmp->rdev) {
+ if (rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
- set_bit(Faulty, &tmp->rdev->flags);
+ set_bit(Faulty, &rdev->flags);
sysfs_notify_dirent_safe(
- tmp->rdev->sysfs_state);
+ rdev->sysfs_state);
}
- sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
- } else if (tmp->rdev
- && tmp->rdev->recovery_offset == MaxSector
- && !test_bit(Faulty, &tmp->rdev->flags)
- && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ sysfs_notify_dirent_safe(replacement->sysfs_state);
+ } else if (rdev
+ && rdev->recovery_offset == MaxSector
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
- sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -7898,8 +7937,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r5conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct md_rdev **rdevp;
+ struct md_rdev __rcu **rdevp;
struct disk_info *p = conf->disks + number;
+ struct md_rdev *tmp;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
@@ -7917,9 +7957,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
log_exit(conf);
return 0;
}
- if (rdev == p->rdev)
+ if (rdev == rcu_access_pointer(p->rdev))
rdevp = &p->rdev;
- else if (rdev == p->replacement)
+ else if (rdev == rcu_access_pointer(p->replacement))
rdevp = &p->replacement;
else
return 0;
@@ -7939,18 +7979,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
- (!p->replacement || p->replacement == rdev) &&
+ (!rcu_access_pointer(p->replacement) ||
+ rcu_access_pointer(p->replacement) == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
*rdevp = NULL;
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+ lockdep_assert_held(&mddev->reconfig_mutex);
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
err = -EBUSY;
- *rdevp = rdev;
+ rcu_assign_pointer(*rdevp, rdev);
}
}
if (!err) {
@@ -7958,17 +8000,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err)
goto abort;
}
- if (p->replacement) {
+
+ tmp = rcu_access_pointer(p->replacement);
+ if (tmp) {
/* We must have just cleared 'rdev' */
- p->rdev = p->replacement;
- clear_bit(Replacement, &p->replacement->flags);
+ rcu_assign_pointer(p->rdev, tmp);
+ clear_bit(Replacement, &tmp->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither - if they are careful
*/
- p->replacement = NULL;
+ rcu_assign_pointer(p->replacement, NULL);
if (!err)
- err = log_modify(conf, p->rdev, true);
+ err = log_modify(conf, tmp, true);
}
clear_bit(WantReplacement, &rdev->flags);
@@ -7984,6 +8028,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int ret, err = -EEXIST;
int disk;
struct disk_info *p;
+ struct md_rdev *tmp;
int first = 0;
int last = conf->raid_disks - 1;
@@ -8041,7 +8086,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
}
for (disk = first; disk <= last; disk++) {
p = conf->disks + disk;
- if (test_bit(WantReplacement, &p->rdev->flags) &&
+ tmp = rdev_mdlock_deref(mddev, p->rdev);
+ if (test_bit(WantReplacement, &tmp->flags) &&
p->replacement == NULL) {
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
@@ -8332,6 +8378,7 @@ static void end_reshape(struct r5conf *conf)
static void raid5_finish_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
+ struct md_rdev *rdev;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8343,10 +8390,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++) {
- struct md_rdev *rdev = conf->disks[d].rdev;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[d].rdev);
if (rdev)
clear_bit(In_sync, &rdev->flags);
- rdev = conf->disks[d].replacement;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[d].replacement);
if (rdev)
clear_bit(In_sync, &rdev->flags);
}
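
The __must_hold()/__acquires()/__releases() markers added throughout raid5.c are sparse context annotations: they expand to nothing in normal builds and are only checked under 'make C=1'. Their definitions, as found in include/linux/compiler_types.h when __CHECKER__ is defined:

# define __must_hold(x)	__attribute__((context(x, 1, 1))) /* held on entry and exit */
# define __acquires(x)	__attribute__((context(x, 0, 1))) /* taken by the function */
# define __releases(x)	__attribute__((context(x, 1, 0))) /* dropped by the function */
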
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9e8486a9e445..638d29863503 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -473,7 +473,8 @@ enum {
*/
struct disk_info {
- struct md_rdev *rdev, *replacement;
+ struct md_rdev __rcu *rdev;
+ struct md_rdev __rcu *replacement;
struct page *extra_page; /* extra page to use in prexor */
};
@@ -560,6 +561,16 @@ struct r5pending_data {
struct bio_list bios;
};
+struct raid5_percpu {
+ struct page *spare_page; /* Used when checking P/Q in raid6 */
+ void *scribble; /* space for constructing buffer
+ * lists and performing address
+ * conversions
+ */
+ int scribble_obj_size;
+ local_lock_t lock;
+};
+
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
@@ -635,15 +646,7 @@ struct r5conf {
*/
int recovery_disabled;
/* per cpu variables */
- struct raid5_percpu {
- struct page *spare_page; /* Used when checking P/Q in raid6 */
- void *scribble; /* space for constructing buffer
- * lists and performing address
- * conversions
- */
- int scribble_obj_size;
- local_lock_t lock;
- } __percpu *percpu;
+ struct raid5_percpu __percpu *percpu;
int scribble_disks;
int scribble_sectors;
struct hlist_node node;
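
The __rcu marking on struct disk_info is what makes the accessor churn in raid5.c mandatory: under sparse, __rcu is a noderef address-space attribute, so a direct dereference now warns and every read must go through the rcu_* helpers (with RCU_INIT_POINTER()/rcu_assign_pointer() on the write side). Sketched:

/* From include/linux/compiler_types.h when __CHECKER__ is defined */
# define __rcu	__attribute__((noderef, address_space(__rcu)))

/* warns under 'make C=1': direct deref of a noderef pointer */
struct md_rdev *bad = conf->disks[i].rdev;
/* accepted forms: */
struct md_rdev *ok = rcu_dereference(conf->disks[i].rdev);
rcu_assign_pointer(conf->disks[i].rdev, new_rdev);
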