diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-bufio.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 13 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 23 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 71 | ||||
-rw-r--r-- | drivers/md/dm-stats.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.h | 1 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 66 | ||||
-rw-r--r-- | drivers/md/md.c | 147 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-array.c | 10 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-block-manager.c | 6 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-block-manager.h | 7 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-common.c | 32 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.c | 8 | ||||
-rw-r--r-- | drivers/md/raid1.c | 162 | ||||
-rw-r--r-- | drivers/md/raid1.h | 15 | ||||
-rw-r--r-- | drivers/md/raid10.c | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 425 | ||||
-rw-r--r-- | drivers/md/raid5.h | 16 |
21 files changed, 760 insertions, 269 deletions
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 173cbb20d104..54bdd923316f 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void) { __u64 mem; + dm_bufio_allocated_kmem_cache = 0; + dm_bufio_allocated_get_free_pages = 0; + dm_bufio_allocated_vmalloc = 0; + dm_bufio_current_allocated = 0; + memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 416b7b752a6e..64780ad73bb0 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, int r = 0; bool updated = updated_this_tick(mq, e); - requeue_and_update_tick(mq, e); - if ((!discarded_oblock && updated) || - !should_promote(mq, e, discarded_oblock, data_dir)) + !should_promote(mq, e, discarded_oblock, data_dir)) { + requeue_and_update_tick(mq, e); result->op = POLICY_MISS; - else if (!can_migrate) + + } else if (!can_migrate) r = -EWOULDBLOCK; - else + + else { + requeue_and_update_tick(mq, e); r = pre_cache_to_cache(mq, e, result); + } return r; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9efcf1059b99..1b1469ebe5cb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) { int r; - r = dm_cache_resize(cache->cmd, cache->cache_size); + r = dm_cache_resize(cache->cmd, new_size); if (r) { DMERR("could not resize cache metadata"); return r; diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 496d5f3646a5..2f91d6d4a2cc 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -20,6 +20,7 @@ struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; + struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; @@ -45,14 +46,13 @@ struct dm_delay_info { static DEFINE_MUTEX(delayed_bios_lock); -static struct workqueue_struct *kdelayd_wq; static struct kmem_cache *delayed_cache; static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; - queue_work(kdelayd_wq, &dc->flush_expired_bios); + queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } static void queue_timeout(struct delay_c *dc, unsigned long expires) @@ -191,6 +191,12 @@ out: goto bad_dev_write; } + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; + } + setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); @@ -203,6 +209,8 @@ out: ti->private = dc; return 0; +bad_queue: + mempool_destroy(dc->delayed_pool); bad_dev_write: if (dc->dev_write) dm_put_device(ti, dc->dev_write); @@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - flush_workqueue(kdelayd_wq); + destroy_workqueue(dc->kdelayd_wq); dm_put_device(ti, dc->dev_read); @@ -350,12 +358,6 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - delayed_cache = KMEM_CACHE(dm_delay_info, 0); if (!delayed_cache) { DMERR("Couldn't create delayed bio cache."); @@ -373,8 +375,6 @@ static int __init dm_delay_init(void) bad_register: kmem_cache_destroy(delayed_cache); bad_memcache: - destroy_workqueue(kdelayd_wq); -bad_queue: return r; } @@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void) { dm_unregister_target(&delay_target); kmem_cache_destroy(delayed_cache); - destroy_workqueue(kdelayd_wq); } /* Module hooks */ diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index aec57d76db5d..944690bafd93 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -66,6 +66,18 @@ struct dm_snapshot { atomic_t pending_exceptions_count; + /* Protected by "lock" */ + sector_t exception_start_sequence; + + /* Protected by kcopyd single-threaded callback */ + sector_t exception_complete_sequence; + + /* + * A list of pending exceptions that completed out of order. + * Protected by kcopyd single-threaded callback. + */ + struct list_head out_of_order_list; + mempool_t *pending_pool; struct dm_exception_table pending; @@ -173,6 +185,14 @@ struct dm_snap_pending_exception { */ int started; + /* There was copying error. */ + int copy_error; + + /* A sequence number, it is used for in-order completion. */ + sector_t exception_sequence; + + struct list_head out_of_order_entry; + /* * For writing a complete chunk, bypassing the copy. */ @@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + s->exception_start_sequence = 0; + s->exception_complete_sequence = 0; + INIT_LIST_HEAD(&s->out_of_order_list); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); @@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success) pending_complete(pe, success); } +static void complete_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + if (unlikely(pe->copy_error)) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); +} + /* * Called when the copy I/O has finished. kcopyd actually runs * this code so don't block. @@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) struct dm_snap_pending_exception *pe = context; struct dm_snapshot *s = pe->snap; - if (read_err || write_err) - pending_complete(pe, 0); + pe->copy_error = read_err || write_err; - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); + if (pe->exception_sequence == s->exception_complete_sequence) { + s->exception_complete_sequence++; + complete_exception(pe); + + while (!list_empty(&s->out_of_order_list)) { + pe = list_entry(s->out_of_order_list.next, + struct dm_snap_pending_exception, out_of_order_entry); + if (pe->exception_sequence != s->exception_complete_sequence) + break; + s->exception_complete_sequence++; + list_del(&pe->out_of_order_entry); + complete_exception(pe); + } + } else { + struct list_head *lh; + struct dm_snap_pending_exception *pe2; + + list_for_each_prev(lh, &s->out_of_order_list) { + pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); + if (pe2->exception_sequence < pe->exception_sequence) + break; + } + list_add(&pe->out_of_order_entry, lh); + } } /* @@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s, return NULL; } + pe->exception_sequence = s->exception_start_sequence++; + dm_insert_exception(&s->pending, &pe->e); return pe; @@ -2192,7 +2249,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 11, 1}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 3d404c1371ed..28a90122a5a8 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, int __init dm_statistics_init(void) { + shared_memory_amount = 0; dm_stat_need_rcu_barrier = 0; return 0; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 465f08ca62b1..3ba6a3859ce3 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + if (!num_targets) { + kfree(t); + return -ENOMEM; + } + if (alloc_targets(t, num_targets)) { kfree(t); return -ENOMEM; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 60bce435f4fa..8a30ad54bd46 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) up_write(&pmd->root_lock); } +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) +{ + down_write(&pmd->root_lock); + pmd->read_only = false; + dm_bm_set_read_write(pmd->bm); + up_write(&pmd->root_lock); +} + int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_block_t threshold, dm_sm_threshold_fn fn, diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 845ebbe589a9..7bcc0e1d6238 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz * that nothing is changing. */ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd); int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_block_t threshold, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2c0cf511ec23..ee29037ffc2e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { - DMERR_LIMIT("dm_thin_insert_block() failed"); + DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", + dm_device_name(pool->pool_md), r); + set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, m->cell); goto out; } @@ -881,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, } } -static int commit(struct pool *pool) -{ - int r; - - r = dm_pool_commit_metadata(pool->pmd); - if (r) - DMERR_LIMIT("%s: commit failed: error = %d", - dm_device_name(pool->pool_md), r); - - return r; -} - /* * A non-zero return indicates read_only or fail_io mode. * Many callers don't care about the return value. */ -static int commit_or_fallback(struct pool *pool) +static int commit(struct pool *pool) { int r; if (get_pool_mode(pool) != PM_WRITE) return -EINVAL; - r = commit(pool); - if (r) + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", + dm_device_name(pool->pool_md), r); set_pool_mode(pool, PM_READ_ONLY); + } return r; } @@ -943,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) * Try to commit to see if that will free up some * more space. */ - (void) commit_or_fallback(pool); + r = commit(pool); + if (r) + return r; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); if (r) @@ -957,7 +952,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) * table reload). */ if (!free_blocks) { - DMWARN("%s: no free space available.", + DMWARN("%s: no free data space available.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); pool->no_free_space = 1; @@ -967,8 +962,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) } r = dm_pool_alloc_data_block(pool->pmd, result); - if (r) + if (r) { + if (r == -ENOSPC && + !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && + !free_blocks) { + DMWARN("%s: no free metadata space available.", + dm_device_name(pool->pool_md)); + set_pool_mode(pool, PM_READ_ONLY); + } return r; + } return 0; } @@ -1349,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool) if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) return; - if (commit_or_fallback(pool)) { + if (commit(pool)) { while ((bio = bio_list_pop(&bios))) bio_io_error(bio); return; @@ -1397,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) case PM_FAIL: DMERR("%s: switching pool to failure mode", dm_device_name(pool->pool_md)); + dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; @@ -1421,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) break; case PM_WRITE: + dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard; pool->process_prepared_mapping = process_prepared_mapping; @@ -1637,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) struct pool_c *pt = ti->private; /* - * We want to make sure that degraded pools are never upgraded. + * We want to make sure that a pool in PM_FAIL mode is never upgraded. */ enum pool_mode old_mode = pool->pf.mode; enum pool_mode new_mode = pt->adjusted_pf.mode; - if (old_mode > new_mode) + /* + * If we were in PM_FAIL mode, rollback of metadata failed. We're + * not going to recover without a thin_repair. So we never let the + * pool move out of the old mode. On the other hand a PM_READ_ONLY + * may have been due to a lack of metadata or data space, and may + * now work (ie. if the underlying devices have been resized). + */ + if (old_mode == PM_FAIL) new_mode = old_mode; pool->ti = ti; @@ -2266,7 +2278,7 @@ static int pool_preresume(struct dm_target *ti) return r; if (need_commit1 || need_commit2) - (void) commit_or_fallback(pool); + (void) commit(pool); return 0; } @@ -2293,7 +2305,7 @@ static void pool_postsuspend(struct dm_target *ti) cancel_delayed_work(&pool->waker); flush_workqueue(pool->wq); - (void) commit_or_fallback(pool); + (void) commit(pool); } static int check_arg_count(unsigned argc, unsigned args_required) @@ -2427,7 +2439,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct if (r) return r; - (void) commit_or_fallback(pool); + (void) commit(pool); r = dm_pool_reserve_metadata_snap(pool->pmd); if (r) @@ -2489,7 +2501,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) DMWARN("Unrecognised thin pool target message received: %s", argv[0]); if (!r) - (void) commit_or_fallback(pool); + (void) commit(pool); return r; } @@ -2544,7 +2556,7 @@ static void pool_status(struct dm_target *ti, status_type_t type, /* Commit to ensure statistics aren't out-of-date */ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) - (void) commit_or_fallback(pool); + (void) commit(pool); r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); if (r) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 8766eabb0014..21f4d7ff0da2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) static struct ctl_table_header *raid_table_header; -static ctl_table raid_table[] = { +static struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, @@ -130,7 +130,7 @@ static ctl_table raid_table[] = { { } }; -static ctl_table raid_dir_table[] = { +static struct ctl_table raid_dir_table[] = { { .procname = "raid", .maxlen = 0, @@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { { } }; -static ctl_table raid_root_table[] = { +static struct ctl_table raid_root_table[] = { { .procname = "dev", .maxlen = 0, @@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) goto retry; } -static inline int mddev_lock(struct mddev * mddev) +static inline int __must_check mddev_lock(struct mddev * mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev * mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + static inline int mddev_is_locked(struct mddev *mddev) { return mutex_is_locked(&mddev->reconfig_mutex); @@ -768,16 +776,10 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; int ret; rw |= REQ_SYNC; @@ -793,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); @@ -2978,7 +2976,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) for_each_mddev(mddev, tmp) { struct md_rdev *rdev2; - mddev_lock(mddev); + mddev_lock_nointr(mddev); rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && @@ -2994,7 +2992,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) break; } } - mddev_lock(my_mddev); + mddev_lock_nointr(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. @@ -3580,6 +3578,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } + blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev_resume(mddev); @@ -5258,7 +5257,7 @@ static void __md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev) { - mddev_lock(mddev); + mddev_lock_nointr(mddev); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -5291,20 +5290,35 @@ EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) { int err = 0; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev) { + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } err = -EBUSY; goto out; } - if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { - /* Someone opened the device since we flushed it - * so page cache could be dirty and it is too late - * to flush. So abort - */ - mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } if (mddev->pers) { __md_stop_writes(mddev); @@ -5315,7 +5329,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_state); - err = 0; + err = 0; } out: mutex_unlock(&mddev->open_mutex); @@ -5331,20 +5345,34 @@ static int do_md_stop(struct mddev * mddev, int mode, { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > !!bdev || - mddev->sysfs_active) { + mddev->sysfs_active || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } - if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { - /* Someone opened the device since we flushed it - * so page cache could be dirty and it is too late - * to flush. So abort - */ - mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } return -EBUSY; } if (mddev->pers) { @@ -6551,7 +6579,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); - mddev_lock(mddev); + mddev_lock_nointr(mddev); } } else { err = -EROFS; @@ -7361,9 +7389,6 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; for_each_mddev(mddev2, tmp) { @@ -7388,7 +7413,7 @@ void md_do_sync(struct md_thread *thread) * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" " until %s has finished (they" @@ -7464,7 +7489,7 @@ void md_do_sync(struct md_thread *thread) last_check = 0; if (j>2) { - printk(KERN_INFO + printk(KERN_INFO "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; @@ -7501,7 +7526,8 @@ void md_do_sync(struct md_thread *thread) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - while (j >= mddev->resync_max && !kthread_should_stop()) { + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. @@ -7509,17 +7535,18 @@ void md_do_sync(struct md_thread *thread) flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -7556,10 +7583,8 @@ void md_do_sync(struct md_thread *thread) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -7582,11 +7607,12 @@ void md_do_sync(struct md_thread *thread) } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); @@ -7640,16 +7666,6 @@ void md_do_sync(struct md_thread *thread) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } EXPORT_SYMBOL_GPL(md_do_sync); @@ -7751,7 +7767,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || + (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -7894,6 +7910,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); + wake_up(&resync_wait); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* success...*/ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index af96e24ec328..1d75b1dc1e2e 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, * The shadow op will often be a noop. Only insert if it really * copied data. */ - if (dm_block_location(*block) != b) + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); r = insert_ablock(info, index, *block, root); + } return r; } diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a7e8bf296388..064a3c271baa 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm) } EXPORT_SYMBOL_GPL(dm_bm_set_read_only); +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + bm->read_only = false; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); + u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) { return crc32c(~(u32) 0, data, len) ^ init_xor; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 9a82083a66b6..13cd58e1fe69 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b); int dm_bm_flush_and_unlock(struct dm_block_manager *bm, struct dm_block *superblock); - /* - * Request data be prefetched into the cache. - */ +/* + * Request data is prefetched into the cache. + */ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); /* @@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); * be returned if you do. */ void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 6058569fe86c..466a60bbd716 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, } static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, - uint32_t (*mutator)(void *context, uint32_t old), + int (*mutator)(void *context, uint32_t old, uint32_t *new), void *context, enum allocation_event *ev) { int r; @@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (old > 2) { r = sm_ll_lookup_big_ref_count(ll, b, &old); - if (r < 0) + if (r < 0) { + dm_tm_unlock(ll->tm, nb); return r; + } } - ref_count = mutator(context, old); + r = mutator(context, old, &ref_count); + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); @@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, return ll->save_ie(ll, index, &ie_disk); } -static uint32_t set_ref_count(void *context, uint32_t old) +static int set_ref_count(void *context, uint32_t old, uint32_t *new) { - return *((uint32_t *) context); + *new = *((uint32_t *) context); + return 0; } int sm_ll_insert(struct ll_disk *ll, dm_block_t b, @@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); } -static uint32_t inc_ref_count(void *context, uint32_t old) +static int inc_ref_count(void *context, uint32_t old, uint32_t *new) { - return old + 1; + *new = old + 1; + return 0; } int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) @@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); } -static uint32_t dec_ref_count(void *context, uint32_t old) +static int dec_ref_count(void *context, uint32_t old, uint32_t *new) { - return old - 1; + if (!old) { + DMERR_LIMIT("unable to decrement a reference count below 0"); + return -EINVAL; + } + + *new = old - 1; + return 0; } int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 1c959684caef..58fc1eef7499 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); int r = sm_metadata_new_block_(sm, b); - if (r) + if (r) { DMERR("unable to allocate new metadata block"); + return r; + } r = sm_metadata_get_nr_free(sm, &count); - if (r) + if (r) { DMERR("couldn't get free block count"); + return r; + } check_threshold(&smm->threshold, count); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index af6681b19776..1e5a540995e9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -66,7 +66,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector); static void lower_barrier(struct r1conf *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) @@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) } #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) +#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; + sector_t start_next_window = r1_bio->start_next_window; + sector_t bi_sector = bio->bi_sector; if (bio->bi_phys_segments) { unsigned long flags; @@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) bio->bi_phys_segments--; done = (bio->bi_phys_segments == 0); spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * make_request() might be waiting for + * bi_phys_segments to decrease + */ + wake_up(&conf->wait_barrier); } else done = 1; @@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf); + allow_barrier(conf, start_next_window, bi_sector); } } @@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 - static void raise_barrier(struct r1conf *conf) { spin_lock_irq(&conf->resync_lock); @@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) /* block any new IO from starting */ conf->barrier++; - /* Now wait for all pending IO to complete */ + /* For these conditions we must wait: + * A: while the array is in frozen state + * B: while barrier >= RESYNC_DEPTH, meaning resync reach + * the max count which allowed. + * C: next_resync + RESYNC_SECTORS > start_next_window, meaning + * next resync will reach to the window which normal bios are + * handling. + */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + !conf->array_frozen && + conf->barrier < RESYNC_DEPTH && + (conf->start_next_window >= + conf->next_resync + RESYNC_SECTORS), conf->resync_lock); spin_unlock_irq(&conf->resync_lock); @@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static void wait_barrier(struct r1conf *conf) +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) { + bool wait = false; + + if (conf->array_frozen || !bio) + wait = true; + else if (conf->barrier && bio_data_dir(bio) == WRITE) { + if (conf->next_resync < RESYNC_WINDOW_SECTORS) + wait = true; + else if ((conf->next_resync - RESYNC_WINDOW_SECTORS + >= bio_end_sector(bio)) || + (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_sector)) + wait = false; + else + wait = true; + } + + return wait; +} + +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +{ + sector_t sector = 0; + spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { + if (need_to_wait_for_sync(conf, bio)) { conf->nr_waiting++; /* Wait for the barrier to drop. * However if there are already pending @@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) * count down. */ wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (conf->nr_pending && + !conf->array_frozen && + (!conf->barrier || + ((conf->start_next_window < + conf->next_resync + RESYNC_SECTORS) && current->bio_list && - !bio_list_empty(current->bio_list)), + !bio_list_empty(current->bio_list))), conf->resync_lock); conf->nr_waiting--; } + + if (bio && bio_data_dir(bio) == WRITE) { + if (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_sector) { + if (conf->start_next_window == MaxSector) + conf->start_next_window = + conf->next_resync + + NEXT_NORMALIO_DISTANCE; + + if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) + <= bio->bi_sector) + conf->next_window_requests++; + else + conf->current_window_requests++; + } + if (bio->bi_sector >= conf->start_next_window) + sector = conf->start_next_window; + } + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); + return sector; } -static void allow_barrier(struct r1conf *conf) +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector) { unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; + if (start_next_window) { + if (start_next_window == conf->start_next_window) { + if (conf->start_next_window + NEXT_NORMALIO_DISTANCE + <= bi_sector) + conf->next_window_requests--; + else + conf->current_window_requests--; + } else + conf->current_window_requests--; + + if (!conf->current_window_requests) { + if (conf->next_window_requests) { + conf->current_window_requests = + conf->next_window_requests; + conf->next_window_requests = 0; + conf->start_next_window += + NEXT_NORMALIO_DISTANCE; + } else + conf->start_next_window = MaxSector; + } + } spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } @@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quite. - * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+extra + * We wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for @@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) * we continue. */ spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; + conf->array_frozen = 1; wait_event_lock_irq_cmd(conf->wait_barrier, conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, @@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; + conf->array_frozen = 0; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); } @@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) int first_clone; int sectors_handled; int max_sectors; + sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) finish_wait(&conf->wait_barrier, &w); } - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); bitmap = mddev->bitmap; @@ -1163,6 +1247,7 @@ read_again: disks = conf->raid_disks * 2; retry_write: + r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1231,14 +1316,24 @@ read_again: if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; + sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf); + allow_barrier(conf, start_next_window, bio->bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); + /* + * We must make sure the multi r1bios of bio have + * the same value of bi_phys_segments + */ + if (bio->bi_phys_segments && old && + old != start_next_window) + /* Wait for the former r1bio(s) to complete */ + wait_event(conf->wait_barrier, + bio->bi_phys_segments == 1); goto retry_write; } @@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_barrier(conf); - allow_barrier(conf); + wait_barrier(conf, NULL); + allow_barrier(conf, 0, 0); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + + conf->next_resync = 0; + conf->start_next_window = MaxSector; } static int raid1_spare_active(struct mddev *mddev) @@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + conf->start_next_window = MaxSector; + conf->current_window_requests = conf->next_window_requests = 0; + err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) atomic_read(&bitmap->behind_writes) == 0); } - raise_barrier(conf); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) @@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) wake_up(&conf->wait_barrier); break; case 1: - raise_barrier(conf); + freeze_array(conf, 0); break; case 0: - lower_barrier(conf); + unfreeze_array(conf); break; } } @@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) mddev->new_chunk_sectors = 0; conf = setup_conf(mddev); if (!IS_ERR(conf)) - conf->barrier = 1; + /* Array must appear to be quiesced */ + conf->array_frozen = 1; return conf; } return ERR_PTR(-EINVAL); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0ff3715fb7eb..9bebca7bff2f 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -41,6 +41,19 @@ struct r1conf { */ sector_t next_resync; + /* When raid1 starts resync, we divide array into four partitions + * |---------|--------------|---------------------|-------------| + * next_resync start_next_window end_window + * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE + * end_window = start_next_window + NEXT_NORMALIO_DISTANCE + * current_window_requests means the count of normalIO between + * start_next_window and end_window. + * next_window_requests means the count of normalIO after end_window. + * */ + sector_t start_next_window; + int current_window_requests; + int next_window_requests; + spinlock_t device_lock; /* list of 'struct r1bio' that need to be processed by raid1d, @@ -65,6 +78,7 @@ struct r1conf { int nr_waiting; int nr_queued; int barrier; + int array_frozen; /* Set to 1 if a full sync is needed, (fresh device added). * Cleared when a sync completes. @@ -111,6 +125,7 @@ struct r1bio { * in this BehindIO request */ sector_t sector; + sector_t start_next_window; int sectors; unsigned long state; struct mddev *mddev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7c3508abb5e1..c504e8389e69 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + allow_barrier(conf); + return sectors_done; + } conf->reshape_safe = mddev->reshape_position; allow_barrier(conf); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7f0e17a27aeb..cc055da02e2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) return &conf->stripe_hashtbl[hash]; } +static inline int stripe_hash_locks_hash(sector_t sect) +{ + return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_lock_irq(conf->hash_locks + hash); + spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_unlock(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); +} + +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + local_irq_disable(); + spin_lock(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); + spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + spin_unlock(&conf->device_lock); + for (i = NR_STRIPE_HASH_LOCKS; i; i--) + spin_unlock(conf->hash_locks + i - 1); + local_irq_enable(); +} + /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and * a bio could span several devices. @@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) } } -static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); @@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } + if (!test_bit(STRIPE_EXPANDING, &sh->state)) + list_add_tail(&sh->lru, temp_inactive_list); } } -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { if (atomic_dec_and_test(&sh->count)) - do_release_stripe(conf, sh); + do_release_stripe(conf, sh, temp_inactive_list); +} + +/* + * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * given time. Adding stripes only takes device lock, while deleting stripes + * only takes hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list, + int hash) +{ + int size; + bool do_wakeup = false; + unsigned long flags; + + if (hash == NR_STRIPE_HASH_LOCKS) { + size = NR_STRIPE_HASH_LOCKS; + hash = NR_STRIPE_HASH_LOCKS - 1; + } else + size = 1; + while (size) { + struct list_head *list = &temp_inactive_list[size - 1]; + + /* + * We don't hold any lock here yet, get_active_stripe() might + * remove stripes from the list + */ + if (!list_empty_careful(list)) { + spin_lock_irqsave(conf->hash_locks + hash, flags); + if (list_empty(conf->inactive_list + hash) && + !list_empty(list)) + atomic_dec(&conf->empty_inactive_list_nr); + list_splice_tail_init(list, conf->inactive_list + hash); + do_wakeup = true; + spin_unlock_irqrestore(conf->hash_locks + hash, flags); + } + size--; + hash--; + } + + if (do_wakeup) { + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } } /* should hold conf->device_lock already */ -static int release_stripe_list(struct r5conf *conf) +static int release_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list) { struct stripe_head *sh; int count = 0; @@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); while (head) { + int hash; + sh = llist_entry(head, struct stripe_head, release_list); head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ @@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf) * again, the count is always > 1. This is true for * STRIPE_ON_UNPLUG_LIST bit too. */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); count++; } @@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; + struct list_head list; + int hash; bool wakeup; - if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + if (unlikely(!conf->mddev->thread) || + test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; wakeup = llist_add(&sh->release_list, &conf->released_stripes); if (wakeup) @@ -336,8 +424,11 @@ slow_path: local_irq_save(flags); /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { - do_release_stripe(conf, sh); + INIT_LIST_HEAD(&list); + hash = sh->hash_lock_index; + do_release_stripe(conf, sh, &list); spin_unlock(&conf->device_lock); + release_inactive_stripe_list(conf, &list, hash); } local_irq_restore(flags); } @@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) /* find an idle stripe, make sure it is unhashed, and return it. */ -static struct stripe_head *get_free_stripe(struct r5conf *conf) +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh = NULL; struct list_head *first; - if (list_empty(&conf->inactive_list)) + if (list_empty(conf->inactive_list + hash)) goto out; - first = conf->inactive_list.next; + first = (conf->inactive_list + hash)->next; sh = list_entry(first, struct stripe_head, lru); list_del_init(first); remove_hash(sh); atomic_inc(&conf->active_stripes); + BUG_ON(hash != sh->hash_lock_index); + if (list_empty(conf->inactive_list + hash)) + atomic_inc(&conf->empty_inactive_list_nr); out: return sh; } @@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { struct r5conf *conf = sh->raid_conf; - int i; + int i, seq; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) (unsigned long long)sh->sector); remove_hash(sh); - +retry: + seq = read_seqcount_begin(&conf->gen_lock); sh->generation = conf->generation - previous; sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; @@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) dev->flags = 0; raid5_build_block(sh, i, previous); } + if (read_seqcount_retry(&conf->gen_lock, seq)) + goto retry; insert_hash(conf, sh); sh->cpu = smp_processor_id(); } @@ -552,57 +649,59 @@ get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; + int hash = stripe_hash_locks_hash(sector); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); - spin_lock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); do { wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0 || noquiesce, - conf->device_lock); + *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) - sh = get_free_stripe(conf); + sh = get_free_stripe(conf, hash); if (noblock && sh == NULL) break; if (!sh) { conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock); + wait_event_lock_irq( + conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash) && + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes * 3 / 4) + || !conf->inactive_blocked), + *(conf->hash_locks + hash)); conf->inactive_blocked = 0; } else init_stripe(sh, sector, previous); } else { + spin_lock(&conf->device_lock); if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state) && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) - && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); + ); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); + BUG_ON(list_empty(&sh->lru)); list_del_init(&sh->lru); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; } } + spin_unlock(&conf->device_lock); } } while (sh == NULL); if (sh) atomic_inc(&sh->count); - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -758,7 +857,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) - bi->bi_rw |= REQ_FLUSH; + bi->bi_rw |= REQ_NOMERGE; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1582,7 +1681,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static int grow_one_stripe(struct r5conf *conf) +static int grow_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); @@ -1598,6 +1697,7 @@ static int grow_one_stripe(struct r5conf *conf) kmem_cache_free(conf->slab_cache, sh); return 0; } + sh->hash_lock_index = hash; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -1610,6 +1710,7 @@ static int grow_stripes(struct r5conf *conf, int num) { struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); + int hash; if (conf->mddev->gendisk) sprintf(conf->cache_name[0], @@ -1627,9 +1728,13 @@ static int grow_stripes(struct r5conf *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - while (num--) - if (!grow_one_stripe(conf)) + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + while (num--) { + if (!grow_one_stripe(conf, hash)) return 1; + conf->max_nr_stripes++; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; + } return 0; } @@ -1687,6 +1792,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) int err; struct kmem_cache *sc; int i; + int hash, cnt; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ @@ -1726,19 +1832,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) * OK, we have enough stripes, start collecting inactive * stripes and copying them over */ + hash = 0; + cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list), - conf->device_lock); - osh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + lock_device_hash_lock(conf, hash); + wait_event_cmd(conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash), + unlock_device_hash_lock(conf, hash), + lock_device_hash_lock(conf, hash)); + osh = get_free_stripe(conf, hash); + unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); for(i=0; i<conf->pool_size; i++) nsh->dev[i].page = osh->dev[i].page; for( ; i<newsize; i++) nsh->dev[i].page = NULL; + nsh->hash_lock_index = hash; kmem_cache_free(conf->slab_cache, osh); + cnt++; + if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { + hash++; + cnt = 0; + } } kmem_cache_destroy(conf->slab_cache); @@ -1797,13 +1913,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) return err; } -static int drop_one_stripe(struct r5conf *conf) +static int drop_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); + sh = get_free_stripe(conf, hash); + spin_unlock_irq(conf->hash_locks + hash); if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); @@ -1815,8 +1931,10 @@ static int drop_one_stripe(struct r5conf *conf) static void shrink_stripes(struct r5conf *conf) { - while (drop_one_stripe(conf)) - ; + int hash; + for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) + while (drop_one_stripe(conf, hash)) + ; if (conf->slab_cache) kmem_cache_destroy(conf->slab_cache); @@ -1921,6 +2039,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), bdn); else retry = 1; + if (set_bad && test_bit(In_sync, &rdev->flags) + && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + retry = 1; if (retry) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { set_bit(R5_ReadError, &sh->dev[i].flags); @@ -3900,7 +4021,8 @@ static void raid5_activate_delayed(struct r5conf *conf) } } -static void activate_bit_delay(struct r5conf *conf) +static void activate_bit_delay(struct r5conf *conf, + struct list_head *temp_inactive_list) { /* device_lock is held */ struct list_head head; @@ -3908,9 +4030,11 @@ static void activate_bit_delay(struct r5conf *conf) list_del_init(&conf->bitmap_list); while (!list_empty(&head)) { struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + int hash; list_del_init(&sh->lru); atomic_inc(&sh->count); - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); } } @@ -3926,7 +4050,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 1; if (conf->quiesce) return 1; - if (list_empty_careful(&conf->inactive_list)) + if (atomic_read(&conf->empty_inactive_list_nr)) return 1; return 0; @@ -4256,6 +4380,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) struct raid5_plug_cb { struct blk_plug_cb cb; struct list_head list; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; }; static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) @@ -4266,6 +4391,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) struct mddev *mddev = cb->cb.data; struct r5conf *conf = mddev->private; int cnt = 0; + int hash; if (cb->list.next && !list_empty(&cb->list)) { spin_lock_irq(&conf->device_lock); @@ -4283,11 +4409,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_RELEASE_LIST could be set here. In that * case, the count is always > 1 here */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); cnt++; } spin_unlock_irq(&conf->device_lock); } + release_inactive_stripe_list(conf, cb->temp_inactive_list, + NR_STRIPE_HASH_LOCKS); if (mddev->queue) trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); @@ -4308,8 +4437,12 @@ static void release_stripe_plug(struct mddev *mddev, cb = container_of(blk_cb, struct raid5_plug_cb, cb); - if (cb->list.next == NULL) + if (cb->list.next == NULL) { + int i; INIT_LIST_HEAD(&cb->list); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(cb->temp_inactive_list + i); + } if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); @@ -4692,14 +4825,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); + atomic_read(&conf->reshape_stripes)==0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return 0; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); @@ -4782,7 +4920,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes) == 0); + atomic_read(&conf->reshape_stripes) == 0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; @@ -4790,13 +4931,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto ret; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } +ret: return reshape_sectors; } @@ -4954,27 +5098,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } static int handle_active_stripes(struct r5conf *conf, int group, - struct r5worker *worker) + struct r5worker *worker, + struct list_head *temp_inactive_list) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; - int i, batch_size = 0; + int i, batch_size = 0, hash; + bool release_inactive = false; while (batch_size < MAX_STRIPE_BATCH && (sh = __get_priority_stripe(conf, group)) != NULL) batch[batch_size++] = sh; - if (batch_size == 0) - return batch_size; + if (batch_size == 0) { + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + if (!list_empty(temp_inactive_list + i)) + break; + if (i == NR_STRIPE_HASH_LOCKS) + return batch_size; + release_inactive = true; + } spin_unlock_irq(&conf->device_lock); + release_inactive_stripe_list(conf, temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + + if (release_inactive) { + spin_lock_irq(&conf->device_lock); + return 0; + } + for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); cond_resched(); spin_lock_irq(&conf->device_lock); - for (i = 0; i < batch_size; i++) - __release_stripe(conf, batch[i]); + for (i = 0; i < batch_size; i++) { + hash = batch[i]->hash_lock_index; + __release_stripe(conf, batch[i], &temp_inactive_list[hash]); + } return batch_size; } @@ -4995,9 +5157,10 @@ static void raid5_do_work(struct work_struct *work) while (1) { int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, worker->temp_inactive_list); - batch_size = handle_active_stripes(conf, group_id, worker); + batch_size = handle_active_stripes(conf, group_id, worker, + worker->temp_inactive_list); worker->working = false; if (!batch_size && !released) break; @@ -5036,7 +5199,7 @@ static void raid5d(struct md_thread *thread) struct bio *bio; int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, conf->temp_inactive_list); if ( !list_empty(&conf->bitmap_list)) { @@ -5046,7 +5209,7 @@ static void raid5d(struct md_thread *thread) bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; - activate_bit_delay(conf); + activate_bit_delay(conf, conf->temp_inactive_list); } raid5_activate_delayed(conf); @@ -5060,7 +5223,8 @@ static void raid5d(struct md_thread *thread) handled++; } - batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, + conf->temp_inactive_list); if (!batch_size && !released) break; handled += batch_size; @@ -5096,22 +5260,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; int err; + int hash; if (size <= 16 || size > 32768) return -EINVAL; + hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; while (size < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) + if (drop_one_stripe(conf, hash)) conf->max_nr_stripes--; else break; + hash--; + if (hash < 0) + hash = NR_STRIPE_HASH_LOCKS - 1; } err = md_allow_write(mddev); if (err) return err; + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; while (size > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) + if (grow_one_stripe(conf, hash)) conf->max_nr_stripes++; else break; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; } return 0; } @@ -5199,15 +5370,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) return 0; } -static int alloc_thread_groups(struct r5conf *conf, int cnt); +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf = mddev->private; unsigned long new; int err; - struct r5worker_group *old_groups; - int old_group_cnt; + struct r5worker_group *new_groups, *old_groups; + int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; @@ -5223,14 +5397,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); old_groups = conf->worker_groups; - old_group_cnt = conf->worker_cnt_per_group; + if (old_groups) + flush_workqueue(raid5_wq); + + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - conf->worker_groups = NULL; - err = alloc_thread_groups(conf, new); - if (err) { - conf->worker_groups = old_groups; - conf->worker_cnt_per_group = old_group_cnt; - } else { if (old_groups) kfree(old_groups[0].workers); kfree(old_groups); @@ -5260,40 +5439,47 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt) +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups) { - int i, j; + int i, j, k; ssize_t size; struct r5worker *workers; - conf->worker_cnt_per_group = cnt; + *worker_cnt_per_group = cnt; if (cnt == 0) { - conf->worker_groups = NULL; + *group_cnt = 0; + *worker_groups = NULL; return 0; } - conf->group_cnt = num_possible_nodes(); + *group_cnt = num_possible_nodes(); size = sizeof(struct r5worker) * cnt; - workers = kzalloc(size * conf->group_cnt, GFP_NOIO); - conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * - conf->group_cnt, GFP_NOIO); - if (!conf->worker_groups || !workers) { + workers = kzalloc(size * *group_cnt, GFP_NOIO); + *worker_groups = kzalloc(sizeof(struct r5worker_group) * + *group_cnt, GFP_NOIO); + if (!*worker_groups || !workers) { kfree(workers); - kfree(conf->worker_groups); - conf->worker_groups = NULL; + kfree(*worker_groups); return -ENOMEM; } - for (i = 0; i < conf->group_cnt; i++) { + for (i = 0; i < *group_cnt; i++) { struct r5worker_group *group; - group = &conf->worker_groups[i]; + group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); group->conf = conf; group->workers = workers + i * cnt; for (j = 0; j < cnt; j++) { - group->workers[j].group = group; - INIT_WORK(&group->workers[j].work, raid5_do_work); + struct r5worker *worker = group->workers + j; + worker->group = group; + INIT_WORK(&worker->work, raid5_do_work); + + for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) + INIT_LIST_HEAD(worker->temp_inactive_list + k); } } @@ -5444,6 +5630,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct md_rdev *rdev; struct disk_info *disk; char pers_name[6]; + int i; + int group_cnt, worker_cnt_per_group; + struct r5worker_group *new_group; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -5478,7 +5667,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf == NULL) goto abort; /* Don't enable multi-threading by default*/ - if (alloc_thread_groups(conf, 0)) + if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, + &new_group)) { + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_group; + } else goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); @@ -5488,7 +5682,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -5514,6 +5707,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; + /* We init hash_locks[0] separately to that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing. + */ + spin_lock_init(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_init(conf->hash_locks + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->inactive_list + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->temp_inactive_list + i); + conf->level = mddev->new_level; if (raid5_alloc_percpu(conf) != 0) goto abort; @@ -5554,7 +5762,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) else conf->max_degraded = 1; conf->algorithm = mddev->new_layout; - conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; @@ -5563,7 +5770,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { + atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); + if (grow_stripes(conf, NR_STRIPES)) { printk(KERN_ERR "md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); @@ -6369,12 +6577,18 @@ static int raid5_start_reshape(struct mddev *mddev) if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->new_chunk_sectors = + conf->chunk_sectors = conf->prev_chunk_sectors; + mddev->new_layout = conf->algorithm = conf->prev_algo; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; smp_wmb(); + conf->generation --; conf->reshape_progress = MaxSector; mddev->reshape_position = MaxSector; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); return -EAGAIN; } @@ -6462,27 +6676,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) break; case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain */ conf->quiesce = 2; - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock); + unlock_all_device_hash_locks_irq(conf), + lock_all_device_hash_locks_irq(conf)); conf->quiesce = 1; - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); break; } } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b42e6b462eda..01ad8ae8f578 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -205,6 +205,7 @@ struct stripe_head { short pd_idx; /* parity disk index */ short qd_idx; /* 'Q' disk index for raid6 */ short ddf_layout;/* use DDF ordering to calculate Q */ + short hash_lock_index; unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ int bm_seq; /* sequence number for bitmap flushes */ @@ -367,9 +368,18 @@ struct disk_info { struct md_rdev *rdev, *replacement; }; +/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. + * This is because we sometimes take all the spinlocks + * and creating that much locking depth can cause + * problems. + */ +#define NR_STRIPE_HASH_LOCKS 8 +#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) + struct r5worker { struct work_struct work; struct r5worker_group *group; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; bool working; }; @@ -382,6 +392,8 @@ struct r5worker_group { struct r5conf { struct hlist_head *stripe_hashtbl; + /* only protect corresponding hash list and inactive_list */ + spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; int level, algorithm; @@ -462,7 +474,8 @@ struct r5conf { * Free stripes pool */ atomic_t active_stripes; - struct list_head inactive_list; + struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; @@ -477,6 +490,7 @@ struct r5conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; int worker_cnt_per_group; |