From 2b74e12e567feb4163e32815bce0be57489e73b9 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 15:59:01 +1100
Subject: md: remove handling of flush_pending in md_submit_flush_data

None of the functions called between setting flush_pending to 1, and
atomic_dec_and_test can change flush_pending, or will anything
running in any other thread (as ->flush_bio is not NULL).  So the
atomic_dec_and_test will always succeed.
So remove the atomic_sec and the atomic_dec_and_test.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 84c46a161927..83b6cb3e7025 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -404,8 +404,6 @@ static void md_submit_flush_data(struct work_struct *ws)
 	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
 	struct bio *bio = mddev->flush_bio;
 
-	atomic_set(&mddev->flush_pending, 1);
-
 	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
@@ -414,10 +412,9 @@ static void md_submit_flush_data(struct work_struct *ws)
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
 	}
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->flush_bio = NULL;
-		wake_up(&mddev->sb_wait);
-	}
+
+	mddev->flush_bio = NULL;
+	wake_up(&mddev->sb_wait);
 }
 
 void md_flush_request(mddev_t *mddev, struct bio *bio)
-- 
cgit v1.2.3


From a7a07e69653acf8540daa1da053cd84bf86e8e66 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 16:04:25 +1100
Subject: md: move code in to submit_flushes.

submit_flushes is called from exactly one place.
Move the code that is before and after that call into
submit_flushes.

This has not functional change, but will make the next patch
smaller and easier to follow.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 83b6cb3e7025..31f8e151d893 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -371,10 +371,14 @@ static void md_end_flush(struct bio *bio, int err)
 	bio_put(bio);
 }
 
+static void md_submit_flush_data(struct work_struct *ws);
+
 static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
@@ -397,6 +401,8 @@ static void submit_flushes(mddev_t *mddev)
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		queue_work(md_wq, &mddev->flush_work);
 }
 
 static void md_submit_flush_data(struct work_struct *ws)
@@ -426,13 +432,7 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
 	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
-	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-
 	submit_flushes(mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
 }
 EXPORT_SYMBOL(md_flush_request);
 
-- 
cgit v1.2.3


From a035fc3e2531703b539f23bec4ca7943cfc69349 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 16:17:51 +1100
Subject: md: fix possible deadlock in handling flush requests.

As recorded in
    https://bugzilla.kernel.org/show_bug.cgi?id=24012

it is possible for a flush request through md to hang.  This is due to
an interaction between the recursion avoidance in
generic_make_request, the insistence in md of only having one flush
active at a time, and the possibility of dm (or md) submitting two
flush requests to a device from the one generic_make_request.

If a generic_make_request call into dm causes two flush requests to be
queued (as happens if the dm table has two targets - they get one
each), these two will be queued inside generic_make_request.

Assume they are for the same md device.
The first is processed and causes 1 or more flush requests to be sent
to lower devices.  These get queued within generic_make_request too.
Then the second flush to the md device gets handled and it blocks
waiting for the first flush to complete.  But it won't complete until
the two lower-device requests complete, and they haven't even been
submitted yet as they are on the generic_make_request queue.

The deadlock can be broken by using a separate thread to submit the
requests to lower devices.  md has such a thread readily available:
md_wq.

So use it to submit these requests.

Reported-by: Giacomo Catenazzi <cate@cateee.net>
Tested-by: Giacomo Catenazzi <cate@cateee.net>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 31f8e151d893..d66aaeddf95d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -373,8 +373,9 @@ static void md_end_flush(struct bio *bio, int err)
 
 static void md_submit_flush_data(struct work_struct *ws);
 
-static void submit_flushes(mddev_t *mddev)
+static void submit_flushes(struct work_struct *ws)
 {
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
 	mdk_rdev_t *rdev;
 
 	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
@@ -432,7 +433,8 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
 	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
-	submit_flushes(mddev);
+	INIT_WORK(&mddev->flush_work, submit_flushes);
+	queue_work(md_wq, &mddev->flush_work);
 }
 EXPORT_SYMBOL(md_flush_request);
 
-- 
cgit v1.2.3


From 1a855a0606653d2d82506281e2c686bacb4b2f45 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 16:36:28 +1100
Subject: md: fix bug with re-adding of partially recovered device.

With v0.90 metadata, a hot-spare does not become a full member of the
array until recovery is complete.  So if we re-add such a device to
the array, we know that all of it is as up-to-date as the event count
would suggest, and so it a bitmap-based recovery is possible.

However with v1.x metadata, the hot-spare immediately becomes a full
member of the array, but it record how much of the device has been
recovered.  If the array is stopped and re-assembled recovery starts
from this point.

When such a device is hot-added to an array we currently lose the 'how
much is recovered' information and incorrectly included it as a full
in-sync member (after bitmap-based fixup).
This is wrong and unsafe and could corrupt data.

So be more careful about setting saved_raid_disk - which is what
guides the re-adding of devices back into an array.
The new code matches the code in slot_store which does a similar
thing, which is encouraging.

This is suitable for any -stable kernel.

Reported-by: "Dailey, Nate" <Nate.Dailey@stratus.com>
Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d66aaeddf95d..b757da175180 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5159,7 +5159,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 				PTR_ERR(rdev));
 			return PTR_ERR(rdev);
 		}
-		/* set save_raid_disk if appropriate */
+		/* set saved_raid_disk if appropriate */
 		if (!mddev->persistent) {
 			if (info->state & (1<<MD_DISK_SYNC)  &&
 			    info->raid_disk < mddev->raid_disks)
@@ -5169,7 +5169,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
-		rdev->saved_raid_disk = rdev->raid_disk;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = rdev->raid_disk;
+		else
+			rdev->saved_raid_disk = -1;
 
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
-- 
cgit v1.2.3


From 589a594be1fb8815b3f18e517be696c48664f728 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 17:02:14 +1100
Subject: md: protect against NULL reference when waiting to start a raid10.

When we fail to start a raid10 for some reason, we call
md_unregister_thread to kill the thread that was created.

Unfortunately md_thread() will then make one call into the handler
(raid10d) even though md_wakeup_thread has not been called.  This is
not safe and as md_unregister_thread is called after mddev->private
has been set to NULL, it will definitely cause a NULL dereference.

So fix this at both ends:
 - md_thread should only call the handler if THREAD_WAKEUP has been
   set.
 - raid10 should call md_unregister_thread before setting things
   to NULL just like all the other raid modules do.

This is applicable to 2.6.35 and later.

Cc: stable@kernel.org
Reported-by: "Citizen" <citizen_lee@thecus.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 5 ++---
 drivers/md/raid10.c | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b757da175180..e71c5fa527f5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6044,9 +6044,8 @@ static int md_thread(void * arg)
 			 || kthread_should_stop(),
 			 thread->timeout);
 
-		clear_bit(THREAD_WAKEUP, &thread->flags);
-
-		thread->run(thread->mddev);
+		if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+			thread->run(thread->mddev);
 	}
 
 	return 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c67aa54694ae..0641674827f0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2397,13 +2397,13 @@ static int run(mddev_t *mddev)
 	return 0;
 
 out_free_conf:
+	md_unregister_thread(mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
-	md_unregister_thread(mddev->thread);
 out:
 	return -EIO;
 }
-- 
cgit v1.2.3