diff options
author | Linus Torvalds | 2016-05-19 17:25:13 -0700 |
---|---|---|
committer | Linus Torvalds | 2016-05-19 17:25:13 -0700 |
commit | feaa7cb5c59416143432829b15826be76605b8fe (patch) | |
tree | c8dce351ab8f14f62648e7410da9182797cae426 | |
parent | e0fb1b36398487475e0d2c50264e4ec1eaed3e11 (diff) | |
parent | 1fa9a1ad0a9db3c745fe0c1bfa73fd87901fd7f3 (diff) |
Merge tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD updates from Shaohua Li:
"Several patches from Guoqing fixing md-cluster bugs and several
patches from Heinz fixing dm-raid bugs"
* tag 'md/4.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
md-cluster: check the return value of process_recvd_msg
md-cluster: gather resync infos and enable recv_thread after bitmap is ready
md: set MD_CHANGE_PENDING in a atomic region
md: raid5: add prerequisite to run underneath dm-raid
md: raid10: add prerequisite to run underneath dm-raid
md: md.c: fix oops in mddev_suspend for raid0
md-cluster: fix ifnullfree.cocci warnings
md-cluster/bitmap: unplug bitmap to sync dirty pages to disk
md-cluster/bitmap: fix wrong page num in bitmap_file_clear_bit and bitmap_file_set_bit
md-cluster/bitmap: fix wrong calcuation of offset
md-cluster: sync bitmap when node received RESYNCING msg
md-cluster: always setup in-memory bitmap
md-cluster: wakeup thread if activated a spare disk
md-cluster: change array_sectors and update size are not supported
md-cluster: fix locking when node joins cluster during message broadcast
md-cluster: unregister thread if err happened
md-cluster: wake up thread to continue recovery
md-cluser: make resync_finish only called after pers->sync_request
md-cluster: change resync lock from asynchronous to synchronous
-rw-r--r-- | Documentation/md-cluster.txt | 6 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 88 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 3 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 96 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 1 | ||||
-rw-r--r-- | drivers/md/md.c | 86 | ||||
-rw-r--r-- | drivers/md/raid1.c | 4 | ||||
-rw-r--r-- | drivers/md/raid10.c | 20 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 4 | ||||
-rw-r--r-- | drivers/md/raid5.c | 10 | ||||
-rw-r--r-- | include/linux/bitops.h | 16 |
11 files changed, 257 insertions, 77 deletions
diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt index c100c7163507..38883276d31c 100644 --- a/Documentation/md-cluster.txt +++ b/Documentation/md-cluster.txt @@ -316,3 +316,9 @@ The algorithm is: nodes are using the raid which is achieved by lock all bitmap locks within the cluster, and also those locks are unlocked accordingly. + +7. Unsupported features + +There are somethings which are not supported by cluster MD yet. + +- update size and change array_sectors. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3fe86b54d50b..d8129ec93ebd 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -46,7 +46,7 @@ static inline char *bmname(struct bitmap *bitmap) * allocated while we're using it */ static int bitmap_checkpage(struct bitmap_counts *bitmap, - unsigned long page, int create) + unsigned long page, int create, int no_hijack) __releases(bitmap->lock) __acquires(bitmap->lock) { @@ -90,6 +90,9 @@ __acquires(bitmap->lock) if (mappage == NULL) { pr_debug("md/bitmap: map page allocation failed, hijacking\n"); + /* We don't support hijack for cluster raid */ + if (no_hijack) + return -ENOMEM; /* failed - set the hijacked flag so that we can use the * pointer as a counter */ if (!bitmap->bp[page].map) @@ -756,7 +759,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, bytes += sizeof(bitmap_super_t); num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); - offset = slot_number * (num_pages - 1); + offset = slot_number * num_pages; store->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); @@ -900,6 +903,11 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) struct page *page; void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; page = filemap_get_page(&bitmap->storage, chunk); if (!page) @@ -915,7 +923,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); } static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) @@ -924,6 +932,11 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) struct page *page; void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; page = filemap_get_page(&bitmap->storage, chunk); if (!page) @@ -935,8 +948,8 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) else clear_bit_le(bit, paddr); kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); + if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } @@ -1321,7 +1334,7 @@ __acquires(bitmap->lock) sector_t csize; int err; - err = bitmap_checkpage(bitmap, page, create); + err = bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) @@ -1594,6 +1607,27 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) } EXPORT_SYMBOL(bitmap_cond_end_sync); +void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) +{ + struct bitmap *bitmap = mddev->bitmap; + sector_t sector, blocks = 0; + + for (sector = old_lo; sector < new_lo; ) { + bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); + + for (sector = old_hi; sector < new_hi; ) { + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); +} +EXPORT_SYMBOL(bitmap_sync_with_cluster); + static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the @@ -1814,6 +1848,9 @@ int bitmap_load(struct mddev *mddev) if (!bitmap) goto out; + if (mddev_is_clustered(mddev)) + md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + /* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, * so we should forget old cached info. @@ -1890,14 +1927,14 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, if (clear_bits) { bitmap_update_sb(bitmap); - /* Setting this for the ev_page should be enough. - * And we do not require both write_all and PAGE_DIRT either - */ + /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs + * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); - bitmap_write_all(bitmap); + if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap_unplug(bitmap); } + bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; err: @@ -2032,6 +2069,35 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, chunks << chunkshift); spin_lock_irq(&bitmap->counts.lock); + /* For cluster raid, need to pre-allocate bitmap */ + if (mddev_is_clustered(bitmap->mddev)) { + unsigned long page; + for (page = 0; page < pages; page++) { + ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); + if (ret) { + unsigned long k; + + /* deallocate the page memory */ + for (k = 0; k < page; k++) { + kfree(new_bp[k].map); + } + + /* restore some fields from old_counts */ + bitmap->counts.bp = old_counts.bp; + bitmap->counts.pages = old_counts.pages; + bitmap->counts.missing_pages = old_counts.pages; + bitmap->counts.chunkshift = old_counts.chunkshift; + bitmap->counts.chunks = old_counts.chunks; + bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + + BITMAP_BLOCK_SHIFT); + blocks = old_counts.chunks << old_counts.chunkshift; + pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); + break; + } else + bitmap->counts.bp[page].count += 1; + } + } + for (block = 0; block < blocks; ) { bitmap_counter_t *bmc_old, *bmc_new; int set; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 5e3fcd6ecf77..5b6dd63dda91 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -258,6 +258,9 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); +void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index dd97d4245822..41573f1f626f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -61,6 +61,10 @@ struct resync_info { * the lock. */ #define MD_CLUSTER_SEND_LOCKED_ALREADY 5 +/* We should receive message after node joined cluster and + * set up all the related infos such as bitmap and personality */ +#define MD_CLUSTER_ALREADY_IN_CLUSTER 6 +#define MD_CLUSTER_PENDING_RECV_EVENT 7 struct md_cluster_info { @@ -85,6 +89,9 @@ struct md_cluster_info { struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; + /* record the region in RESYNCING message */ + sector_t sync_low; + sector_t sync_hi; }; enum msg_type { @@ -284,11 +291,14 @@ static void recover_bitmaps(struct md_thread *thread) goto dlm_unlock; } if (hi > 0) { - /* TODO:Wait for current resync to get over */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; - md_check_recovery(mddev); + /* wake up thread to continue resync in case resync + * is not finished */ + if (mddev->recovery_cp != MaxSector) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } } dlm_unlock: dlm_unlock_sync(bm_lockres); @@ -370,8 +380,12 @@ static void ack_bast(void *arg, int mode) struct dlm_lock_resource *res = arg; struct md_cluster_info *cinfo = res->mddev->cluster_info; - if (mode == DLM_LOCK_EX) - md_wakeup_thread(cinfo->recv_thread); + if (mode == DLM_LOCK_EX) { + if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state)) + md_wakeup_thread(cinfo->recv_thread); + else + set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state); + } } static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) @@ -408,6 +422,30 @@ static void process_suspend_info(struct mddev *mddev, md_wakeup_thread(mddev->thread); return; } + + /* + * The bitmaps are not same for different nodes + * if RESYNCING is happening in one node, then + * the node which received the RESYNCING message + * probably will perform resync with the region + * [lo, hi] again, so we could reduce resync time + * a lot if we can ensure that the bitmaps among + * different nodes are match up well. + * + * sync_low/hi is used to record the region which + * arrived in the previous RESYNCING message, + * + * Call bitmap_sync_with_cluster to clear + * NEEDED_MASK and set RESYNC_MASK since + * resync thread is running in another node, + * so we don't need to do the resync again + * with the same section */ + bitmap_sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, + lo, hi); + cinfo->sync_low = lo; + cinfo->sync_hi = hi; + s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); if (!s) return; @@ -482,11 +520,13 @@ static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } -static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) +static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { + int ret = 0; + if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), "node %d received it's own msg\n", le32_to_cpu(msg->slot))) - return; + return -1; switch (le32_to_cpu(msg->type)) { case METADATA_UPDATED: process_metadata_update(mddev, msg); @@ -509,9 +549,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) __recover_slot(mddev, le32_to_cpu(msg->slot)); break; default: + ret = -1; pr_warn("%s:%d Received unknown message from %d\n", __func__, __LINE__, msg->slot); } + return ret; } /* @@ -535,7 +577,9 @@ static void recv_daemon(struct md_thread *thread) /* read lvb and wake up thread to process this message_lockres */ memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); - process_recvd_msg(thread->mddev, &msg); + ret = process_recvd_msg(thread->mddev, &msg); + if (ret) + goto out; /*release CR on ack_lockres*/ ret = dlm_unlock_sync(ack_lockres); @@ -549,6 +593,7 @@ static void recv_daemon(struct md_thread *thread) ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); if (unlikely(ret != 0)) pr_info("lock CR on ack failed return %d\n", ret); +out: /*release CR on message_lockres*/ ret = dlm_unlock_sync(message_lockres); if (unlikely(ret != 0)) @@ -778,17 +823,24 @@ static int join(struct mddev *mddev, int nodes) cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); if (!cinfo->token_lockres) goto err; - cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); - if (!cinfo->ack_lockres) - goto err; cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); if (!cinfo->no_new_dev_lockres) goto err; + ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (ret) { + ret = -EAGAIN; + pr_err("md-cluster: can't join cluster to avoid lock issue\n"); + goto err; + } + cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); + if (!cinfo->ack_lockres) + goto err; /* get sync CR lock on ACK. */ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", ret); + dlm_unlock_sync(cinfo->token_lockres); /* get sync CR lock on no-new-dev. */ if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); @@ -809,12 +861,10 @@ static int join(struct mddev *mddev, int nodes) if (!cinfo->resync_lockres) goto err; - ret = gather_all_resync_info(mddev, nodes); - if (ret) - goto err; - return 0; err: + md_unregister_thread(&cinfo->recovery_thread); + md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); @@ -828,6 +878,19 @@ err: return ret; } +static void load_bitmaps(struct mddev *mddev, int total_slots) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + /* load all the node's bitmap info for resync */ + if (gather_all_resync_info(mddev, total_slots)) + pr_err("md-cluster: failed to gather all resyn infos\n"); + set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state); + /* wake up recv thread in case something need to be handled */ + if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state)) + md_wakeup_thread(cinfo->recv_thread); +} + static void resync_bitmap(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -937,7 +1000,6 @@ static void metadata_update_cancel(struct mddev *mddev) static int resync_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); } @@ -967,7 +1029,6 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) static int resync_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; dlm_unlock_sync(cinfo->resync_lockres); return resync_info_update(mddev, 0, 0); } @@ -1171,6 +1232,7 @@ static struct md_cluster_operations cluster_ops = { .add_new_disk_cancel = add_new_disk_cancel, .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, + .load_bitmaps = load_bitmaps, .gather_bitmaps = gather_bitmaps, .lock_all_bitmaps = lock_all_bitmaps, .unlock_all_bitmaps = unlock_all_bitmaps, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 45ce6c97d8bd..e765499ba591 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -23,6 +23,7 @@ struct md_cluster_operations { void (*add_new_disk_cancel)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*load_bitmaps)(struct mddev *mddev, int total_slots); int (*gather_bitmaps)(struct md_rdev *rdev); int (*lock_all_bitmaps)(struct mddev *mddev); void (*unlock_all_bitmaps)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index c9a475c33cc7..866825f10b4c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,7 +307,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(current == mddev->thread->tsk); + WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); if (mddev->suspended++) return; synchronize_rcu(); @@ -2291,19 +2291,24 @@ void md_update_sb(struct mddev *mddev, int force_change) return; } +repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + nospares = 1; ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) md_cluster_ops->metadata_update_cancel(mddev); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), + BIT(MD_CHANGE_DEVS) | + BIT(MD_CHANGE_CLEAN)); return; } } -repeat: + /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && @@ -2430,15 +2435,14 @@ repeat: md_super_wait(mddev); /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - spin_lock(&mddev->lock); + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); + if (mddev->in_sync != sync_req || - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) /* have to write it out again */ - spin_unlock(&mddev->lock); goto repeat; - } - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -2452,9 +2456,6 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } - - if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -4816,6 +4817,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; + /* cluster raid doesn't support change array_sectors */ + if (mddev_is_clustered(mddev)) + return -EINVAL; + if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) sectors = mddev->pers->size(mddev, 0, 0); @@ -6437,6 +6442,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) int rv; int fit = (num_sectors == 0); + /* cluster raid doesn't support update size */ + if (mddev_is_clustered(mddev)) + return -EINVAL; + if (mddev->pers->resize == NULL) return -EINVAL; /* The "num_sectors" is the number of sectors of each device that @@ -7785,7 +7794,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; - bool cluster_resync_finished = false; + int ret; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7795,6 +7804,19 @@ void md_do_sync(struct md_thread *thread) return; } + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) + goto skip; + + if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + && ((unsigned long long)mddev->curr_resync_completed + < (unsigned long long)mddev->resync_max_sectors)) + goto skip; + } + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; @@ -8089,11 +8111,6 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality and other nodes that we are finished */ - if (mddev_is_clustered(mddev)) { - md_cluster_ops->resync_finish(mddev); - cluster_resync_finished = true; - } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -8130,12 +8147,18 @@ void md_do_sync(struct md_thread *thread) } } skip: - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (mddev_is_clustered(mddev) && - test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !cluster_resync_finished) + ret == 0) { + /* set CHANGE_PENDING here since maybe another + * update is needed, so other nodes are informed */ + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); md_cluster_ops->resync_finish(mddev); + } else + set_bit(MD_CHANGE_DEVS, &mddev->flags); spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8226,18 +8249,9 @@ static void md_start_sync(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, del_work); int ret = 0; - if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); - if (ret) { - mddev->sync_thread = NULL; - goto out; - } - } - mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); -out: if (!mddev->sync_thread) { if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) printk(KERN_ERR "%s: could not start resync" @@ -8536,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape); int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { + struct mddev *mddev = rdev->mddev; int rv; if (is_new) s += rdev->new_data_offset; @@ -8545,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); - set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); return 1; } else @@ -8680,6 +8695,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", bdevname(rdev2->bdev,b)); + /* wakeup mddev->thread here, so array could + * perform resync with the new activated disk */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } /* device faulty * We just want to do the minimum to mark the disk diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7f2b9c9f8a0..c7c8cde0ab21 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e3fd725d5c4d..c7de2a53e625 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector < conf->reshape_progress))) { /* Need to update reshape_position in metadata */ mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags)); @@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" @@ -3782,8 +3782,10 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, size); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { mddev->recovery_cp = oldsize; @@ -4593,8 +4595,10 @@ static void raid10_finish_reshape(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->resync_max_sectors = size; - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } else { int d; for (d = conf->geo.raid_disks ; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 26f14970a858..e889e2deb7b3 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * in_teardown check workaround this issue. */ if (!log->in_teardown) { - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags) || diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e48c262ce032..8959e6dd31dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -7572,8 +7572,10 @@ static void raid5_finish_reshape(struct mddev *mddev) if (mddev->delta_disks > 0) { md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } else { int d; spin_lock_irq(&conf->device_lock); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index defeaac0745f..299e76b59fe9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -227,6 +227,22 @@ static inline unsigned long __ffs64(u64 word) }) #endif +#ifndef bit_clear_unless +#define bit_clear_unless(ptr, _clear, _test) \ +({ \ + const typeof(*ptr) clear = (_clear), test = (_test); \ + typeof(*ptr) old, new; \ + \ + do { \ + old = ACCESS_ONCE(*ptr); \ + new = old & ~clear; \ + } while (!(old & test) && \ + cmpxchg(ptr, old, new) != old); \ + \ + !(old & test); \ +}) +#endif + #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region |