From 1ba64edef6051d2ec79bb2fbd3a0c8f0df00ab55 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block, sx8: kill blk_insert_request()

The only user left for blk_insert_request() is sx8 and it can be
trivially switched to use blk_execute_rq_nowait() - special requests
aren't included in io stat and sx8 doesn't use block layer tagging.
Switch sx8 and kill blk_insert_requeset().

This patch doesn't introduce any functional difference.

Only compile tested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 48 ------------------------------------------------
 drivers/block/sx8.c    | 12 ++++++++----
 include/linux/blkdev.h |  1 -
 3 files changed, 8 insertions(+), 53 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ea70e6c80cd3..435af2378614 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1010,54 +1010,6 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
 	__elv_add_request(q, rq, where);
 }
 
-/**
- * blk_insert_request - insert a special request into a request queue
- * @q:		request queue where request should be inserted
- * @rq:		request to be inserted
- * @at_head:	insert request at head or tail of queue
- * @data:	private data
- *
- * Description:
- *    Many block devices need to execute commands asynchronously, so they don't
- *    block the whole kernel from preemption during request execution.  This is
- *    accomplished normally by inserting aritficial requests tagged as
- *    REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
- *    be scheduled for actual execution by the request queue.
- *
- *    We have the option of inserting the head or the tail of the queue.
- *    Typically we use the tail for new ioctls and so forth.  We use the head
- *    of the queue for things like a QUEUE_FULL message from a device, or a
- *    host that is unable to accept a particular command.
- */
-void blk_insert_request(struct request_queue *q, struct request *rq,
-			int at_head, void *data)
-{
-	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-	unsigned long flags;
-
-	/*
-	 * tell I/O scheduler that this isn't a regular read/write (ie it
-	 * must not attempt merges on this) and that it acts as a soft
-	 * barrier
-	 */
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-
-	rq->special = data;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-
-	/*
-	 * If command is tagged, release the tag
-	 */
-	if (blk_rq_tagged(rq))
-		blk_queue_end_tag(q, rq);
-
-	add_acct_request(q, rq, where);
-	__blk_run_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_insert_request);
-
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index b70f0fca9a42..e7472f567c9d 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	       host->state == HST_DEV_SCAN);
 	spin_unlock_irq(&host->lock);
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 
@@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	BUG_ON(rc < 0);
 	crq->msg_bucket = (u32) rc;
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c7a6d3b5bc7b..8a6b51b13a1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -660,7 +660,6 @@ extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern struct request *blk_make_request(struct request_queue *, struct bio *,
 					gfp_t);
-extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
-- 
cgit v1.2.3


From 34f6055c80285e4efb3f602a9119db75239744dc Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block: add blk_queue_dead()

There are a number of QUEUE_FLAG_DEAD tests.  Add blk_queue_dead()
macro and use it.

This patch doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 6 +++---
 block/blk-exec.c       | 2 +-
 block/blk-sysfs.c      | 4 ++--
 block/blk-throttle.c   | 4 ++--
 block/blk.h            | 2 +-
 include/linux/blkdev.h | 1 +
 6 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 435af2378614..b5ed4f4a8d96 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -608,7 +608,7 @@ EXPORT_SYMBOL(blk_init_allocated_queue_node);
 
 int blk_get_queue(struct request_queue *q)
 {
-	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+	if (likely(!blk_queue_dead(q))) {
 		kobject_get(&q->kobj);
 		return 0;
 	}
@@ -755,7 +755,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	int may_queue;
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+	if (unlikely(blk_queue_dead(q)))
 		return NULL;
 
 	may_queue = elv_may_queue(q, rw_flags);
@@ -875,7 +875,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
-		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		if (unlikely(blk_queue_dead(q)))
 			return NULL;
 
 		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
diff --git a/block/blk-exec.c b/block/blk-exec.c
index a1ebceb332f9..60532852b3ab 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,7 +50,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+	if (unlikely(blk_queue_dead(q))) {
 		rq->errors = -ENXIO;
 		if (rq->end_io)
 			rq->end_io(rq, rq->errors);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e7f9f657f105..f0b2ca8f66d0 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -425,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 	if (!entry->show)
 		return -EIO;
 	mutex_lock(&q->sysfs_lock);
-	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+	if (blk_queue_dead(q)) {
 		mutex_unlock(&q->sysfs_lock);
 		return -ENOENT;
 	}
@@ -447,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	q = container_of(kobj, struct request_queue, kobj);
 	mutex_lock(&q->sysfs_lock);
-	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+	if (blk_queue_dead(q)) {
 		mutex_unlock(&q->sysfs_lock);
 		return -ENOENT;
 	}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4553245d9317..5eed6a76721d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	struct request_queue *q = td->queue;
 
 	/* no throttling for dead queue */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+	if (unlikely(blk_queue_dead(q)))
 		return NULL;
 
 	rcu_read_lock();
@@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 	spin_lock_irq(q->queue_lock);
 
 	/* Make sure @q is still alive */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+	if (unlikely(blk_queue_dead(q))) {
 		kfree(tg);
 		return NULL;
 	}
diff --git a/block/blk.h b/block/blk.h
index 3f6551b3c92d..e38691dbb329 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -85,7 +85,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			q->flush_queue_delayed = 1;
 			return NULL;
 		}
-		if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
+		if (unlikely(blk_queue_dead(q)) ||
 		    !q->elevator->ops->elevator_dispatch_fn(q, 0))
 			return NULL;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8a6b51b13a1c..783f97c14d0a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -481,6 +481,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
+#define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
-- 
cgit v1.2.3


From 481a7d64790cd7ca61a8bbcbd9d017ce58e6fe39 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block: fix drain_all condition in blk_drain_queue()

When trying to drain all requests, blk_drain_queue() checked only
q->rq.count[]; however, this only tracks REQ_ALLOCED requests.  This
patch updates blk_drain_queue() such that it looks at all the counters
and queues so that request_queue is actually empty on completion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b5ed4f4a8d96..c37e9e7c9d07 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -358,7 +358,8 @@ EXPORT_SYMBOL(blk_put_queue);
 void blk_drain_queue(struct request_queue *q, bool drain_all)
 {
 	while (true) {
-		int nr_rqs;
+		bool drain = false;
+		int i;
 
 		spin_lock_irq(q->queue_lock);
 
@@ -368,14 +369,25 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 
 		__blk_run_queue(q);
 
-		if (drain_all)
-			nr_rqs = q->rq.count[0] + q->rq.count[1];
-		else
-			nr_rqs = q->rq.elvpriv;
+		drain |= q->rq.elvpriv;
+
+		/*
+		 * Unfortunately, requests are queued at and tracked from
+		 * multiple places and there's no single counter which can
+		 * be drained.  Check all the queues and counters.
+		 */
+		if (drain_all) {
+			drain |= !list_empty(&q->queue_head);
+			for (i = 0; i < 2; i++) {
+				drain |= q->rq.count[i];
+				drain |= q->in_flight[i];
+				drain |= !list_empty(&q->flush_queue[i]);
+			}
+		}
 
 		spin_unlock_irq(q->queue_lock);
 
-		if (!nr_rqs)
+		if (!drain)
 			break;
 		msleep(10);
 	}
-- 
cgit v1.2.3


From 8ba61435d73f2274e12d4d823fde06735e8f6a54 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block: add missing blk_queue_dead() checks

blk_insert_cloned_request(), blk_execute_rq_nowait() and
blk_flush_plug_list() either didn't check whether the queue was dead
or did it without holding queue_lock.  Update them so that dead state
is checked while holding queue_lock.

AFAICS, this plugs all holes (requeue doesn't matter as the request is
transitioning atomically from in_flight to queued).

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 21 +++++++++++++++++++++
 block/blk-exec.c |  6 ++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index c37e9e7c9d07..30add45a87ef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1731,6 +1731,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 		return -EIO;
 
 	spin_lock_irqsave(q->queue_lock, flags);
+	if (unlikely(blk_queue_dead(q))) {
+		spin_unlock_irqrestore(q->queue_lock, flags);
+		return -ENODEV;
+	}
 
 	/*
 	 * Submitting request must be dequeued before calling this function
@@ -2704,6 +2708,14 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
 {
 	trace_block_unplug(q, depth, !from_schedule);
 
+	/*
+	 * Don't mess with dead queue.
+	 */
+	if (unlikely(blk_queue_dead(q))) {
+		spin_unlock(q->queue_lock);
+		return;
+	}
+
 	/*
 	 * If we are punting this to kblockd, then we can safely drop
 	 * the queue_lock before waking kblockd (which needs to take
@@ -2780,6 +2792,15 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 			depth = 0;
 			spin_lock(q->queue_lock);
 		}
+
+		/*
+		 * Short-circuit if @q is dead
+		 */
+		if (unlikely(blk_queue_dead(q))) {
+			__blk_end_request_all(rq, -ENODEV);
+			continue;
+		}
+
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 60532852b3ab..fb2cbd551621 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,7 +50,11 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
+	WARN_ON(irqs_disabled());
+	spin_lock_irq(q->queue_lock);
+
 	if (unlikely(blk_queue_dead(q))) {
+		spin_unlock_irq(q->queue_lock);
 		rq->errors = -ENXIO;
 		if (rq->end_io)
 			rq->end_io(rq, rq->errors);
@@ -59,8 +63,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq->rq_disk = bd_disk;
 	rq->end_io = done;
-	WARN_ON(irqs_disabled());
-	spin_lock_irq(q->queue_lock);
 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
 	/* the queue is stopped so it won't be run */
-- 
cgit v1.2.3


From a73f730d013ff2788389fd0c46ad3e5510f124e6 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block, cfq: move cfqd->cic_index to q->id

cfq allocates per-queue id using ida and uses it to index cic radix
tree from io_context.  Move it to q->id and allocate on queue init and
free on queue release.  This simplifies cfq a bit and will allow for
further improvements of io context life-cycle management.

This patch doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 24 +++++++++++++++--------
 block/blk-sysfs.c      |  2 ++
 block/blk.h            |  3 +++
 block/cfq-iosched.c    | 52 +++++---------------------------------------------
 include/linux/blkdev.h |  6 ++++++
 5 files changed, 32 insertions(+), 55 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 30add45a87ef..af7301581172 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
+DEFINE_IDA(blk_queue_ida);
+
 /*
  * For the allocated request tables
  */
@@ -474,6 +476,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q)
 		return NULL;
 
+	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+	if (q->id < 0)
+		goto fail_q;
+
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
@@ -481,15 +487,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	q->backing_dev_info.name = "block";
 
 	err = bdi_init(&q->backing_dev_info);
-	if (err) {
-		kmem_cache_free(blk_requestq_cachep, q);
-		return NULL;
-	}
+	if (err)
+		goto fail_id;
 
-	if (blk_throtl_init(q)) {
-		kmem_cache_free(blk_requestq_cachep, q);
-		return NULL;
-	}
+	if (blk_throtl_init(q))
+		goto fail_id;
 
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
@@ -512,6 +514,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	q->queue_lock = &q->__queue_lock;
 
 	return q;
+
+fail_id:
+	ida_simple_remove(&blk_queue_ida, q->id);
+fail_q:
+	kmem_cache_free(blk_requestq_cachep, q);
+	return NULL;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f0b2ca8f66d0..5b4b4ab5e785 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -494,6 +494,8 @@ static void blk_release_queue(struct kobject *kobj)
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
+
+	ida_simple_remove(&blk_queue_ida, q->id);
 	kmem_cache_free(blk_requestq_cachep, q);
 }
 
diff --git a/block/blk.h b/block/blk.h
index e38691dbb329..aae4d88fc523 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -1,6 +1,8 @@
 #ifndef BLK_INTERNAL_H
 #define BLK_INTERNAL_H
 
+#include <linux/idr.h>
+
 /* Amount of time in which a process may batch requests */
 #define BLK_BATCH_TIME	(HZ/50UL)
 
@@ -9,6 +11,7 @@
 
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kobj_type blk_queue_ktype;
+extern struct ida blk_queue_ida;
 
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 16ace89613bc..ec3f5e8ba564 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -65,9 +65,6 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
 static struct completion *ioc_gone;
 static DEFINE_SPINLOCK(ioc_gone_lock);
 
-static DEFINE_SPINLOCK(cic_index_lock);
-static DEFINE_IDA(cic_index_ida);
-
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -290,7 +287,6 @@ struct cfq_data {
 	unsigned int cfq_group_idle;
 	unsigned int cfq_latency;
 
-	unsigned int cic_index;
 	struct list_head cic_list;
 
 	/*
@@ -484,7 +480,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
 
 static inline void *cfqd_dead_key(struct cfq_data *cfqd)
 {
-	return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+	return (void *)(cfqd->queue->id << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
 }
 
 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
@@ -3105,7 +3101,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
 	BUG_ON(rcu_dereference_check(ioc->ioc_data,
 		lockdep_is_held(&ioc->lock)) == cic);
 
-	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
+	radix_tree_delete(&ioc->radix_root, cfqd->queue->id);
 	hlist_del_rcu(&cic->cic_list);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -3133,7 +3129,7 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	}
 
 	do {
-		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
+		cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
 		rcu_read_unlock();
 		if (!cic)
 			break;
@@ -3169,8 +3165,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 		cic->key = cfqd;
 
 		spin_lock_irqsave(&ioc->lock, flags);
-		ret = radix_tree_insert(&ioc->radix_root,
-						cfqd->cic_index, cic);
+		ret = radix_tree_insert(&ioc->radix_root, cfqd->queue->id, cic);
 		if (!ret)
 			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
 		spin_unlock_irqrestore(&ioc->lock, flags);
@@ -3944,10 +3939,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
 	cfq_shutdown_timer_wq(cfqd);
 
-	spin_lock(&cic_index_lock);
-	ida_remove(&cic_index_ida, cfqd->cic_index);
-	spin_unlock(&cic_index_lock);
-
 	/*
 	 * Wait for cfqg->blkg->key accessors to exit their grace periods.
 	 * Do this wait only if there are other unlinked groups out
@@ -3969,24 +3960,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	kfree(cfqd);
 }
 
-static int cfq_alloc_cic_index(void)
-{
-	int index, error;
-
-	do {
-		if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
-			return -ENOMEM;
-
-		spin_lock(&cic_index_lock);
-		error = ida_get_new(&cic_index_ida, &index);
-		spin_unlock(&cic_index_lock);
-		if (error && error != -EAGAIN)
-			return error;
-	} while (error);
-
-	return index;
-}
-
 static void *cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
@@ -3994,23 +3967,9 @@ static void *cfq_init_queue(struct request_queue *q)
 	struct cfq_group *cfqg;
 	struct cfq_rb_root *st;
 
-	i = cfq_alloc_cic_index();
-	if (i < 0)
-		return NULL;
-
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
-	if (!cfqd) {
-		spin_lock(&cic_index_lock);
-		ida_remove(&cic_index_ida, i);
-		spin_unlock(&cic_index_lock);
+	if (!cfqd)
 		return NULL;
-	}
-
-	/*
-	 * Don't need take queue_lock in the routine, since we are
-	 * initializing the ioscheduler, and nobody is using cfqd
-	 */
-	cfqd->cic_index = i;
 
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4294,7 +4253,6 @@ static void __exit cfq_exit(void)
 	 */
 	if (elv_ioc_count_read(cfq_ioc_count))
 		wait_for_completion(&all_gone);
-	ida_destroy(&cic_index_ida);
 	cfq_slab_kill();
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 783f97c14d0a..8c8dbc4738ea 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -310,6 +310,12 @@ struct request_queue {
 	 */
 	unsigned long		queue_flags;
 
+	/*
+	 * ida allocated id for this queue.  Used to index queues from
+	 * ioctx.
+	 */
+	int			id;
+
 	/*
 	 * queue needs bounce pages for pages above this limit
 	 */
-- 
cgit v1.2.3


From 42ec57a8f68311bbbf4ff96a5d33c8a2e90b9d05 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:37 +0100
Subject: block: misc ioc cleanups

* int return from put_io_context() wasn't used by anybody.  Make it
  return void like other put functions and docbook-fy the function
  comment.

* Reorder dummy declarations for !CONFIG_BLOCK case a bit.

* Make alloc_ioc_context() use __GFP_ZERO allocation, take init out of
  if block and drop 0'ing.

* Docbook-fy current_io_context() comment.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c           | 72 +++++++++++++++++++++++------------------------
 include/linux/iocontext.h | 12 ++------
 2 files changed, 39 insertions(+), 45 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 6f9bbd978653..8bebf06bac76 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -27,26 +27,28 @@ static void cfq_dtor(struct io_context *ioc)
 	}
 }
 
-/*
- * IO Context helper functions. put_io_context() returns 1 if there are no
- * more users of this io context, 0 otherwise.
+/**
+ * put_io_context - put a reference of io_context
+ * @ioc: io_context to put
+ *
+ * Decrement reference count of @ioc and release it if the count reaches
+ * zero.
  */
-int put_io_context(struct io_context *ioc)
+void put_io_context(struct io_context *ioc)
 {
 	if (ioc == NULL)
-		return 1;
+		return;
 
-	BUG_ON(atomic_long_read(&ioc->refcount) == 0);
+	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
 
-	if (atomic_long_dec_and_test(&ioc->refcount)) {
-		rcu_read_lock();
-		cfq_dtor(ioc);
-		rcu_read_unlock();
+	if (!atomic_long_dec_and_test(&ioc->refcount))
+		return;
 
-		kmem_cache_free(iocontext_cachep, ioc);
-		return 1;
-	}
-	return 0;
+	rcu_read_lock();
+	cfq_dtor(ioc);
+	rcu_read_unlock();
+
+	kmem_cache_free(iocontext_cachep, ioc);
 }
 EXPORT_SYMBOL(put_io_context);
 
@@ -84,33 +86,31 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 {
 	struct io_context *ioc;
 
-	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
-	if (ioc) {
-		atomic_long_set(&ioc->refcount, 1);
-		atomic_set(&ioc->nr_tasks, 1);
-		spin_lock_init(&ioc->lock);
-		ioc->ioprio_changed = 0;
-		ioc->ioprio = 0;
-		ioc->last_waited = 0; /* doesn't matter... */
-		ioc->nr_batch_requests = 0; /* because this is 0 */
-		INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
-		INIT_HLIST_HEAD(&ioc->cic_list);
-		ioc->ioc_data = NULL;
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-		ioc->cgroup_changed = 0;
-#endif
-	}
+	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
+				    node);
+	if (unlikely(!ioc))
+		return NULL;
+
+	/* initialize */
+	atomic_long_set(&ioc->refcount, 1);
+	atomic_set(&ioc->nr_tasks, 1);
+	spin_lock_init(&ioc->lock);
+	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+	INIT_HLIST_HEAD(&ioc->cic_list);
 
 	return ioc;
 }
 
-/*
- * If the current task has no IO context then create one and initialise it.
- * Otherwise, return its existing IO context.
+/**
+ * current_io_context - get io_context of %current
+ * @gfp_flags: allocation flags, used if allocation is necessary
+ * @node: allocation node, used if allocation is necessary
  *
- * This returned IO context doesn't have a specifically elevated refcount,
- * but since the current task itself holds a reference, the context can be
- * used in general code, so long as it stays within `current` context.
+ * Return io_context of %current.  If it doesn't exist, it is created with
+ * @gfp_flags and @node.  The returned io_context does NOT have its
+ * reference count incremented.  Because io_context is exited only on task
+ * exit, %current can be sure that the returned io_context is valid and
+ * alive as long as it is executing.
  */
 struct io_context *current_io_context(gfp_t gfp_flags, int node)
 {
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 5037a0ad2312..8a6ecb66346f 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -76,20 +76,14 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 
 struct task_struct;
 #ifdef CONFIG_BLOCK
-int put_io_context(struct io_context *ioc);
+void put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
 #else
-static inline void exit_io_context(struct task_struct *task)
-{
-}
-
 struct io_context;
-static inline int put_io_context(struct io_context *ioc)
-{
-	return 1;
-}
+static inline void put_io_context(struct io_context *ioc) { }
+static inline void exit_io_context(struct task_struct *task) { }
 #endif
 
 #endif
-- 
cgit v1.2.3


From 6e736be7f282fff705db7c34a15313281b372a76 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:38 +0100
Subject: block: make ioc get/put interface more conventional and fix race on
 alloction

Ignoring copy_io() during fork, io_context can be allocated from two
places - current_io_context() and set_task_ioprio().  The former is
always called from local task while the latter can be called from
different task.  The synchornization between them are peculiar and
dubious.

* current_io_context() doesn't grab task_lock() and assumes that if it
  saw %NULL ->io_context, it would stay that way until allocation and
  assignment is complete.  It has smp_wmb() between alloc/init and
  assignment.

* set_task_ioprio() grabs task_lock() for assignment and does
  smp_read_barrier_depends() between "ioc = task->io_context" and "if
  (ioc)".  Unfortunately, this doesn't achieve anything - the latter
  is not a dependent load of the former.  ie, if ioc itself were being
  dereferenced "ioc->xxx", it would mean something (not sure what tho)
  but as the code currently stands, the dependent read barrier is
  noop.

As only one of the the two test-assignment sequences is task_lock()
protected, the task_lock() can't do much about race between the two.
Nothing prevents current_io_context() and set_task_ioprio() allocating
its own ioc for the same task and overwriting the other's.

Also, set_task_ioprio() can race with exiting task and create a new
ioc after exit_io_context() is finished.

ioc get/put doesn't have any reason to be complex.  The only hot path
is accessing the existing ioc of %current, which is simple to achieve
given that ->io_context is never destroyed as long as the task is
alive.  All other paths can happily go through task_lock() like all
other task sub structures without impacting anything.

This patch updates ioc get/put so that it becomes more conventional.

* alloc_io_context() is replaced with get_task_io_context().  This is
  the only interface which can acquire access to ioc of another task.
  On return, the caller has an explicit reference to the object which
  should be put using put_io_context() afterwards.

* The functionality of current_io_context() remains the same but when
  creating a new ioc, it shares the code path with
  get_task_io_context() and always goes through task_lock().

* get_io_context() now means incrementing ref on an ioc which the
  caller already has access to (be that an explicit refcnt or implicit
  %current one).

* PF_EXITING inhibits creation of new io_context and once
  exit_io_context() is finished, it's guaranteed that both ioc
  acquisition functions return %NULL.

* All users are updated.  Most are trivial but
  smp_read_barrier_depends() removal from cfq_get_io_context() needs a
  bit of explanation.  I suppose the original intention was to ensure
  ioc->ioprio is visible when set_task_ioprio() allocates new
  io_context and installs it; however, this wouldn't have worked
  because set_task_ioprio() doesn't have wmb between init and install.
  There are other problems with this which will be fixed in another
  patch.

* While at it, use NUMA_NO_NODE instead of -1 for wildcard node
  specification.

-v2: Vivek spotted contamination from debug patch.  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c        |  9 +++--
 block/blk-ioc.c           | 99 +++++++++++++++++++++++++++++++----------------
 block/blk.h               |  1 +
 block/cfq-iosched.c       | 18 ++++-----
 fs/ioprio.c               | 21 ++--------
 include/linux/iocontext.h |  4 +-
 kernel/fork.c             |  8 ++--
 7 files changed, 91 insertions(+), 69 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8f630cec906e..4b001dcd85b0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1645,11 +1645,12 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	struct io_context *ioc;
 
-	task_lock(tsk);
-	ioc = tsk->io_context;
-	if (ioc)
+	/* we don't lose anything even if ioc allocation fails */
+	ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc) {
 		ioc->cgroup_changed = 1;
-	task_unlock(tsk);
+		put_io_context(ioc);
+	}
 }
 
 void blkio_policy_register(struct blkio_policy_type *blkiop)
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 8bebf06bac76..b13ed96776c2 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -16,6 +16,19 @@
  */
 static struct kmem_cache *iocontext_cachep;
 
+/**
+ * get_io_context - increment reference count to io_context
+ * @ioc: io_context to get
+ *
+ * Increment reference count to @ioc.
+ */
+void get_io_context(struct io_context *ioc)
+{
+	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+	atomic_long_inc(&ioc->refcount);
+}
+EXPORT_SYMBOL(get_io_context);
+
 static void cfq_dtor(struct io_context *ioc)
 {
 	if (!hlist_empty(&ioc->cic_list)) {
@@ -71,6 +84,9 @@ void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
 
+	/* PF_EXITING prevents new io_context from being attached to @task */
+	WARN_ON_ONCE(!(current->flags & PF_EXITING));
+
 	task_lock(task);
 	ioc = task->io_context;
 	task->io_context = NULL;
@@ -82,7 +98,9 @@ void exit_io_context(struct task_struct *task)
 	put_io_context(ioc);
 }
 
-struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
+static struct io_context *create_task_io_context(struct task_struct *task,
+						 gfp_t gfp_flags, int node,
+						 bool take_ref)
 {
 	struct io_context *ioc;
 
@@ -98,6 +116,20 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->cic_list);
 
+	/* try to install, somebody might already have beaten us to it */
+	task_lock(task);
+
+	if (!task->io_context && !(task->flags & PF_EXITING)) {
+		task->io_context = ioc;
+	} else {
+		kmem_cache_free(iocontext_cachep, ioc);
+		ioc = task->io_context;
+	}
+
+	if (ioc && take_ref)
+		get_io_context(ioc);
+
+	task_unlock(task);
 	return ioc;
 }
 
@@ -114,46 +146,47 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
  */
 struct io_context *current_io_context(gfp_t gfp_flags, int node)
 {
-	struct task_struct *tsk = current;
-	struct io_context *ret;
-
-	ret = tsk->io_context;
-	if (likely(ret))
-		return ret;
-
-	ret = alloc_io_context(gfp_flags, node);
-	if (ret) {
-		/* make sure set_task_ioprio() sees the settings above */
-		smp_wmb();
-		tsk->io_context = ret;
-	}
+	might_sleep_if(gfp_flags & __GFP_WAIT);
 
-	return ret;
+	if (current->io_context)
+		return current->io_context;
+
+	return create_task_io_context(current, gfp_flags, node, false);
 }
+EXPORT_SYMBOL(current_io_context);
 
-/*
- * If the current task has no IO context then create one and initialise it.
- * If it does have a context, take a ref on it.
+/**
+ * get_task_io_context - get io_context of a task
+ * @task: task of interest
+ * @gfp_flags: allocation flags, used if allocation is necessary
+ * @node: allocation node, used if allocation is necessary
+ *
+ * Return io_context of @task.  If it doesn't exist, it is created with
+ * @gfp_flags and @node.  The returned io_context has its reference count
+ * incremented.
  *
- * This is always called in the context of the task which submitted the I/O.
+ * This function always goes through task_lock() and it's better to use
+ * current_io_context() + get_io_context() for %current.
  */
-struct io_context *get_io_context(gfp_t gfp_flags, int node)
+struct io_context *get_task_io_context(struct task_struct *task,
+				       gfp_t gfp_flags, int node)
 {
-	struct io_context *ioc = NULL;
-
-	/*
-	 * Check for unlikely race with exiting task. ioc ref count is
-	 * zero when ioc is being detached.
-	 */
-	do {
-		ioc = current_io_context(gfp_flags, node);
-		if (unlikely(!ioc))
-			break;
-	} while (!atomic_long_inc_not_zero(&ioc->refcount));
+	struct io_context *ioc;
 
-	return ioc;
+	might_sleep_if(gfp_flags & __GFP_WAIT);
+
+	task_lock(task);
+	ioc = task->io_context;
+	if (likely(ioc)) {
+		get_io_context(ioc);
+		task_unlock(task);
+		return ioc;
+	}
+	task_unlock(task);
+
+	return create_task_io_context(task, gfp_flags, node, true);
 }
-EXPORT_SYMBOL(get_io_context);
+EXPORT_SYMBOL(get_task_io_context);
 
 static int __init blk_ioc_init(void)
 {
diff --git a/block/blk.h b/block/blk.h
index aae4d88fc523..fc3c41b2fd24 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -122,6 +122,7 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
 }
 #endif
 
+void get_io_context(struct io_context *ioc);
 struct io_context *current_io_context(gfp_t gfp_flags, int node);
 
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ec3f5e8ba564..d42d89ccce1b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,6 +14,7 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include "blk.h"
 #include "cfq.h"
 
 /*
@@ -3194,13 +3195,13 @@ static struct cfq_io_context *
 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
-	struct cfq_io_context *cic;
+	struct cfq_io_context *cic = NULL;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	ioc = get_io_context(gfp_mask, cfqd->queue->node);
+	ioc = current_io_context(gfp_mask, cfqd->queue->node);
 	if (!ioc)
-		return NULL;
+		goto err;
 
 	cic = cfq_cic_lookup(cfqd, ioc);
 	if (cic)
@@ -3211,10 +3212,10 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		goto err;
 
 	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
-		goto err_free;
-
+		goto err;
 out:
-	smp_read_barrier_depends();
+	get_io_context(ioc);
+
 	if (unlikely(ioc->ioprio_changed))
 		cfq_ioc_set_ioprio(ioc);
 
@@ -3223,10 +3224,9 @@ out:
 		cfq_ioc_set_cgroup(ioc);
 #endif
 	return cic;
-err_free:
-	cfq_cic_free(cic);
 err:
-	put_io_context(ioc);
+	if (cic)
+		cfq_cic_free(cic);
 	return NULL;
 }
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17b..998ec239d1ea 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,13 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	if (err)
 		return err;
 
-	task_lock(task);
-	do {
-		ioc = task->io_context;
-		/* see wmb() in current_io_context() */
-		smp_read_barrier_depends();
-		if (ioc)
-			break;
-
-		ioc = alloc_io_context(GFP_ATOMIC, -1);
-		if (!ioc) {
-			err = -ENOMEM;
-			break;
-		}
-		task->io_context = ioc;
-	} while (1);
-
-	if (!err) {
+	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc) {
 		ioc->ioprio = ioprio;
 		ioc->ioprio_changed = 1;
+		put_io_context(ioc);
 	}
 
-	task_unlock(task);
 	return err;
 }
 EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 8a6ecb66346f..28bb621ef5a2 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -78,8 +78,8 @@ struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
-struct io_context *get_io_context(gfp_t gfp_flags, int node);
-struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+struct io_context *get_task_io_context(struct task_struct *task,
+				       gfp_t gfp_flags, int node);
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..5bcfc739bb7c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -870,6 +870,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 {
 #ifdef CONFIG_BLOCK
 	struct io_context *ioc = current->io_context;
+	struct io_context *new_ioc;
 
 	if (!ioc)
 		return 0;
@@ -881,11 +882,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 		if (unlikely(!tsk->io_context))
 			return -ENOMEM;
 	} else if (ioprio_valid(ioc->ioprio)) {
-		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
-		if (unlikely(!tsk->io_context))
+		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+		if (unlikely(!new_ioc))
 			return -ENOMEM;
 
-		tsk->io_context->ioprio = ioc->ioprio;
+		new_ioc->ioprio = ioc->ioprio;
+		put_io_context(new_ioc);
 	}
 #endif
 	return 0;
-- 
cgit v1.2.3


From 09ac46c429464c919d04bb737b27edd84d944f02 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:38 +0100
Subject: block: misc updates to blk_get_queue()

* blk_get_queue() is peculiar in that it returns 0 on success and 1 on
  failure instead of 0 / -errno or boolean.  Update it such that it
  returns %true on success and %false on failure.

* Make sure the caller checks for the return value.

* Separate out __blk_get_queue() which doesn't check whether @q is
  dead and put it in blk.h.  This will be used later.

This patch doesn't introduce any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c         | 8 ++++----
 block/blk.h              | 5 +++++
 block/bsg.c              | 4 +---
 block/genhd.c            | 2 +-
 drivers/scsi/scsi_scan.c | 2 +-
 include/linux/blkdev.h   | 2 +-
 6 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index af7301581172..fd4749391e17 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -626,14 +626,14 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 }
 EXPORT_SYMBOL(blk_init_allocated_queue_node);
 
-int blk_get_queue(struct request_queue *q)
+bool blk_get_queue(struct request_queue *q)
 {
 	if (likely(!blk_queue_dead(q))) {
-		kobject_get(&q->kobj);
-		return 0;
+		__blk_get_queue(q);
+		return true;
 	}
 
-	return 1;
+	return false;
 }
 EXPORT_SYMBOL(blk_get_queue);
 
diff --git a/block/blk.h b/block/blk.h
index fc3c41b2fd24..8d421156fefb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,11 @@ extern struct kmem_cache *blk_requestq_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
+static inline void __blk_get_queue(struct request_queue *q)
+{
+	kobject_get(&q->kobj);
+}
+
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
diff --git a/block/bsg.c b/block/bsg.c
index 702f1316bb8f..167d586cece6 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -769,12 +769,10 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 					 struct file *file)
 {
 	struct bsg_device *bd;
-	int ret;
 #ifdef BSG_DEBUG
 	unsigned char buf[32];
 #endif
-	ret = blk_get_queue(rq);
-	if (ret)
+	if (!blk_get_queue(rq))
 		return ERR_PTR(-ENXIO);
 
 	bd = bsg_alloc_device();
diff --git a/block/genhd.c b/block/genhd.c
index 02e9fca80825..c958169d24f0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -615,7 +615,7 @@ void add_disk(struct gendisk *disk)
 	 * Take an extra ref on queue which will be put on disk_release()
 	 * so that it sticks around as long as @disk is there.
 	 */
-	WARN_ON_ONCE(blk_get_queue(disk->queue));
+	WARN_ON_ONCE(!blk_get_queue(disk->queue));
 
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index b3c6d957fbd8..89da43f73c00 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -297,7 +297,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 		kfree(sdev);
 		goto out;
 	}
-	blk_get_queue(sdev->request_queue);
+	WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
 	sdev->request_queue->queuedata = sdev;
 	scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8c8dbc4738ea..d1b6f4ed1f96 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -865,7 +865,7 @@ extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatte
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
-int blk_get_queue(struct request_queue *);
+bool __must_check blk_get_queue(struct request_queue *);
 struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
-- 
cgit v1.2.3


From 283287a52e3c3f7f8f9da747f4b8c5202740d776 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:38 +0100
Subject: block, cfq: misc updates to cfq_io_context

Make the following changes to prepare for ioc/cic management cleanup.

* Add cic->q so that ioc can determine the associated queue without
  querying cfq.  This will eventually replace ->key.

* Factor out cfq_release_cic() from cic_free_func().  This function
  assumes that the caller handled locking.

* Rename __cfq_exit_single_io_context() to cfq_exit_cic() and make it
  take only @cic.

* Restructure cfq_cic_link() for future updates.

This patch doesn't introduce any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c       | 58 ++++++++++++++++++++++++++---------------------
 include/linux/iocontext.h |  1 +
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d42d89ccce1b..a612ca65f371 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2709,21 +2709,26 @@ static void cfq_cic_free(struct cfq_io_context *cic)
 	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
 }
 
-static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
+static void cfq_release_cic(struct cfq_io_context *cic)
 {
-	unsigned long flags;
+	struct io_context *ioc = cic->ioc;
 	unsigned long dead_key = (unsigned long) cic->key;
 
 	BUG_ON(!(dead_key & CIC_DEAD_KEY));
-
-	spin_lock_irqsave(&ioc->lock, flags);
 	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
 	hlist_del_rcu(&cic->cic_list);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-
 	cfq_cic_free(cic);
 }
 
+static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioc->lock, flags);
+	cfq_release_cic(cic);
+	spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
 /*
  * Must be called with rcu_read_lock() held or preemption otherwise disabled.
  * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
@@ -2773,9 +2778,9 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_put_queue(cfqq);
 }
 
-static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
-					 struct cfq_io_context *cic)
+static void cfq_exit_cic(struct cfq_io_context *cic)
 {
+	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct io_context *ioc = cic->ioc;
 
 	list_del_init(&cic->queue_list);
@@ -2823,7 +2828,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
 		 */
 		smp_read_barrier_depends();
 		if (cic->key == cfqd)
-			__cfq_exit_single_io_context(cfqd, cic);
+			cfq_exit_cic(cic);
 
 		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
@@ -3161,28 +3166,29 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 	int ret;
 
 	ret = radix_tree_preload(gfp_mask);
-	if (!ret) {
-		cic->ioc = ioc;
-		cic->key = cfqd;
+	if (ret)
+		goto out;
 
-		spin_lock_irqsave(&ioc->lock, flags);
-		ret = radix_tree_insert(&ioc->radix_root, cfqd->queue->id, cic);
-		if (!ret)
-			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
-		spin_unlock_irqrestore(&ioc->lock, flags);
+	cic->ioc = ioc;
+	cic->key = cfqd;
+	cic->q = cfqd->queue;
+
+	spin_lock_irqsave(&ioc->lock, flags);
+	ret = radix_tree_insert(&ioc->radix_root, cfqd->queue->id, cic);
+	if (!ret)
+		hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
+	spin_unlock_irqrestore(&ioc->lock, flags);
 
-		radix_tree_preload_end();
+	radix_tree_preload_end();
 
-		if (!ret) {
-			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-			list_add(&cic->queue_list, &cfqd->cic_list);
-			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-		}
+	if (!ret) {
+		spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+		list_add(&cic->queue_list, &cfqd->cic_list);
+		spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 	}
-
+out:
 	if (ret)
 		printk(KERN_ERR "cfq: cic link failed!\n");
-
 	return ret;
 }
 
@@ -3922,7 +3928,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
 							struct cfq_io_context,
 							queue_list);
 
-		__cfq_exit_single_io_context(cfqd, cic);
+		cfq_exit_cic(cic);
 	}
 
 	cfq_put_async_queues(cfqd);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 28bb621ef5a2..079aea8fd8a8 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -15,6 +15,7 @@ struct cfq_ttime {
 
 struct cfq_io_context {
 	void *key;
+	struct request_queue *q;
 
 	struct cfq_queue *cfqq[2];
 
-- 
cgit v1.2.3


From dc86900e0a8f665122de6faadd27fb4c6d2b3e4d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:38 +0100
Subject: block, cfq: move ioc ioprio/cgroup changed handling to cic

ioprio/cgroup change was handled by marking the changed state in ioc
and, on the following access to the ioc, performing RCU-protected
iteration through all cic's grabbing the matching queue_lock.

This patch moves the changed state to each cic.  When ioprio or cgroup
changes, the respective bit is set on all cic's of the ioc and when
each of those cic (not ioc) is accessed, change is applied for that
specific ioc-queue pair.

This also fixes the following two race conditions between setting and
clearing of changed states.

* Missing barrier between assign/load of ioprio and ioprio_changed
  allowed applying old ioprio.

* Change requests could happen between application of change and
  clearing of changed variables.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c        |  2 +-
 block/blk-ioc.c           | 45 +++++++++++++++++++++++++++++++++++++++++++++
 block/cfq-iosched.c       | 28 +++++++++-------------------
 fs/ioprio.c               |  3 +--
 include/linux/iocontext.h | 14 +++++++++-----
 5 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b001dcd85b0..dc00835aab6a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1648,7 +1648,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	/* we don't lose anything even if ioc allocation fails */
 	ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
-		ioc->cgroup_changed = 1;
+		ioc_cgroup_changed(ioc);
 		put_io_context(ioc);
 	}
 }
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b13ed96776c2..6f59fbad93d9 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -188,6 +188,51 @@ struct io_context *get_task_io_context(struct task_struct *task,
 }
 EXPORT_SYMBOL(get_task_io_context);
 
+void ioc_set_changed(struct io_context *ioc, int which)
+{
+	struct cfq_io_context *cic;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(cic, n, &ioc->cic_list, cic_list)
+		set_bit(which, &cic->changed);
+}
+
+/**
+ * ioc_ioprio_changed - notify ioprio change
+ * @ioc: io_context of interest
+ * @ioprio: new ioprio
+ *
+ * @ioc's ioprio has changed to @ioprio.  Set %CIC_IOPRIO_CHANGED for all
+ * cic's.  iosched is responsible for checking the bit and applying it on
+ * request issue path.
+ */
+void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioc->lock, flags);
+	ioc->ioprio = ioprio;
+	ioc_set_changed(ioc, CIC_IOPRIO_CHANGED);
+	spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
+/**
+ * ioc_cgroup_changed - notify cgroup change
+ * @ioc: io_context of interest
+ *
+ * @ioc's cgroup has changed.  Set %CIC_CGROUP_CHANGED for all cic's.
+ * iosched is responsible for checking the bit and applying it on request
+ * issue path.
+ */
+void ioc_cgroup_changed(struct io_context *ioc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioc->lock, flags);
+	ioc_set_changed(ioc, CIC_CGROUP_CHANGED);
+	spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
 static int __init blk_ioc_init(void)
 {
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a612ca65f371..51aece2eea7c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2904,7 +2904,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_ioprio(struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
@@ -2933,12 +2933,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 
-static void cfq_ioc_set_ioprio(struct io_context *ioc)
-{
-	call_for_each_cic(ioc, changed_ioprio);
-	ioc->ioprio_changed = 0;
-}
-
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			  pid_t pid, bool is_sync)
 {
@@ -2960,7 +2954,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_cgroup(struct cfq_io_context *cic)
 {
 	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
@@ -2986,12 +2980,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
-
-static void cfq_ioc_set_cgroup(struct io_context *ioc)
-{
-	call_for_each_cic(ioc, changed_cgroup);
-	ioc->cgroup_changed = 0;
-}
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
 static struct cfq_queue *
@@ -3222,13 +3210,15 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 out:
 	get_io_context(ioc);
 
-	if (unlikely(ioc->ioprio_changed))
-		cfq_ioc_set_ioprio(ioc);
-
+	if (unlikely(cic->changed)) {
+		if (test_and_clear_bit(CIC_IOPRIO_CHANGED, &cic->changed))
+			changed_ioprio(cic);
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	if (unlikely(ioc->cgroup_changed))
-		cfq_ioc_set_cgroup(ioc);
+		if (test_and_clear_bit(CIC_CGROUP_CHANGED, &cic->changed))
+			changed_cgroup(cic);
 #endif
+	}
+
 	return cic;
 err:
 	if (cic)
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 998ec239d1ea..0f1b9515213b 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -50,8 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 
 	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
-		ioc->ioprio = ioprio;
-		ioc->ioprio_changed = 1;
+		ioc_ioprio_changed(ioc, ioprio);
 		put_io_context(ioc);
 	}
 
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 079aea8fd8a8..2c2b6da96b3c 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -13,6 +13,11 @@ struct cfq_ttime {
 	unsigned long ttime_mean;
 };
 
+enum {
+	CIC_IOPRIO_CHANGED,
+	CIC_CGROUP_CHANGED,
+};
+
 struct cfq_io_context {
 	void *key;
 	struct request_queue *q;
@@ -26,6 +31,8 @@ struct cfq_io_context {
 	struct list_head queue_list;
 	struct hlist_node cic_list;
 
+	unsigned long changed;
+
 	void (*dtor)(struct io_context *); /* destructor */
 	void (*exit)(struct io_context *); /* called on task exit */
 
@@ -44,11 +51,6 @@ struct io_context {
 	spinlock_t lock;
 
 	unsigned short ioprio;
-	unsigned short ioprio_changed;
-
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-	unsigned short cgroup_changed;
-#endif
 
 	/*
 	 * For request batching
@@ -81,6 +83,8 @@ void put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
+void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
+void ioc_cgroup_changed(struct io_context *ioc);
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
-- 
cgit v1.2.3


From 216284c352a0061f5b20acff2c4e50fb43fea183 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:38 +0100
Subject: block, cfq: fix race condition in cic creation path and tighten
 locking

cfq_get_io_context() would fail if multiple tasks race to insert cic's
for the same association.  This patch restructures
cfq_get_io_context() such that slow path insertion race is handled
properly.

Note that the restructuring also makes cfq_get_io_context() called
under queue_lock and performs both ioc and cfqd insertions while
holding both ioc and queue locks.  This is part of on-going locking
tightening and will be used to simplify synchronization rules.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 135 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 76 insertions(+), 59 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 51aece2eea7c..181a63d36691 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2908,13 +2908,10 @@ static void changed_ioprio(struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
-	unsigned long flags;
 
 	if (unlikely(!cfqd))
 		return;
 
-	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
@@ -2929,8 +2926,6 @@ static void changed_ioprio(struct cfq_io_context *cic)
 	cfqq = cic->cfqq[BLK_RW_SYNC];
 	if (cfqq)
 		cfq_mark_cfqq_prio_changed(cfqq);
-
-	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2958,7 +2953,6 @@ static void changed_cgroup(struct cfq_io_context *cic)
 {
 	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	unsigned long flags;
 	struct request_queue *q;
 
 	if (unlikely(!cfqd))
@@ -2966,8 +2960,6 @@ static void changed_cgroup(struct cfq_io_context *cic)
 
 	q = cfqd->queue;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-
 	if (sync_cfqq) {
 		/*
 		 * Drop reference to sync queue. A new sync queue will be
@@ -2977,8 +2969,6 @@ static void changed_cgroup(struct cfq_io_context *cic)
 		cic_set_cfqq(cic, NULL, 1);
 		cfq_put_queue(sync_cfqq);
 	}
-
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -3142,16 +3132,31 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	return cic;
 }
 
-/*
- * Add cic into ioc, using cfqd as the search key. This enables us to lookup
- * the process specific cfq io context when entered from the block layer.
- * Also adds the cic to a per-cfqd list, used when this queue is removed.
+/**
+ * cfq_create_cic - create and link a cfq_io_context
+ * @cfqd: cfqd of interest
+ * @gfp_mask: allocation mask
+ *
+ * Make sure cfq_io_context linking %current->io_context and @cfqd exists.
+ * If ioc and/or cic doesn't exist, they will be created using @gfp_mask.
  */
-static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
-			struct cfq_io_context *cic, gfp_t gfp_mask)
+static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
-	unsigned long flags;
-	int ret;
+	struct request_queue *q = cfqd->queue;
+	struct cfq_io_context *cic = NULL;
+	struct io_context *ioc;
+	int ret = -ENOMEM;
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	/* allocate stuff */
+	ioc = current_io_context(gfp_mask, q->node);
+	if (!ioc)
+		goto out;
+
+	cic = cfq_alloc_io_context(cfqd, gfp_mask);
+	if (!cic)
+		goto out;
 
 	ret = radix_tree_preload(gfp_mask);
 	if (ret)
@@ -3161,53 +3166,72 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 	cic->key = cfqd;
 	cic->q = cfqd->queue;
 
-	spin_lock_irqsave(&ioc->lock, flags);
-	ret = radix_tree_insert(&ioc->radix_root, cfqd->queue->id, cic);
-	if (!ret)
-		hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-
-	radix_tree_preload_end();
+	/* lock both q and ioc and try to link @cic */
+	spin_lock_irq(q->queue_lock);
+	spin_lock(&ioc->lock);
 
-	if (!ret) {
-		spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+	ret = radix_tree_insert(&ioc->radix_root, q->id, cic);
+	if (likely(!ret)) {
+		hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
 		list_add(&cic->queue_list, &cfqd->cic_list);
-		spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+		cic = NULL;
+	} else if (ret == -EEXIST) {
+		/* someone else already did it */
+		ret = 0;
 	}
+
+	spin_unlock(&ioc->lock);
+	spin_unlock_irq(q->queue_lock);
+
+	radix_tree_preload_end();
 out:
 	if (ret)
 		printk(KERN_ERR "cfq: cic link failed!\n");
+	if (cic)
+		cfq_cic_free(cic);
 	return ret;
 }
 
-/*
- * Setup general io context and cfq io context. There can be several cfq
- * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq.
+/**
+ * cfq_get_io_context - acquire cfq_io_context and bump refcnt on io_context
+ * @cfqd: cfqd to setup cic for
+ * @gfp_mask: allocation mask
+ *
+ * Return cfq_io_context associating @cfqd and %current->io_context and
+ * bump refcnt on io_context.  If ioc or cic doesn't exist, they're created
+ * using @gfp_mask.
+ *
+ * Must be called under queue_lock which may be released and re-acquired.
+ * This function also may sleep depending on @gfp_mask.
  */
 static struct cfq_io_context *
 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
-	struct io_context *ioc = NULL;
+	struct request_queue *q = cfqd->queue;
 	struct cfq_io_context *cic = NULL;
+	struct io_context *ioc;
+	int err;
+
+	lockdep_assert_held(q->queue_lock);
+
+	while (true) {
+		/* fast path */
+		ioc = current->io_context;
+		if (likely(ioc)) {
+			cic = cfq_cic_lookup(cfqd, ioc);
+			if (likely(cic))
+				break;
+		}
 
-	might_sleep_if(gfp_mask & __GFP_WAIT);
-
-	ioc = current_io_context(gfp_mask, cfqd->queue->node);
-	if (!ioc)
-		goto err;
-
-	cic = cfq_cic_lookup(cfqd, ioc);
-	if (cic)
-		goto out;
-
-	cic = cfq_alloc_io_context(cfqd, gfp_mask);
-	if (cic == NULL)
-		goto err;
+		/* slow path - unlock, create missing ones and retry */
+		spin_unlock_irq(q->queue_lock);
+		err = cfq_create_cic(cfqd, gfp_mask);
+		spin_lock_irq(q->queue_lock);
+		if (err)
+			return NULL;
+	}
 
-	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
-		goto err;
-out:
+	/* bump @ioc's refcnt and handle changed notifications */
 	get_io_context(ioc);
 
 	if (unlikely(cic->changed)) {
@@ -3220,10 +3244,6 @@ out:
 	}
 
 	return cic;
-err:
-	if (cic)
-		cfq_cic_free(cic);
-	return NULL;
 }
 
 static void
@@ -3759,14 +3779,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
-	unsigned long flags;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
+	spin_lock_irq(q->queue_lock);
 	cic = cfq_get_io_context(cfqd, gfp_mask);
-
-	spin_lock_irqsave(q->queue_lock, flags);
-
 	if (!cic)
 		goto queue_fail;
 
@@ -3802,12 +3819,12 @@ new_queue:
 	rq->elevator_private[0] = cic;
 	rq->elevator_private[1] = cfqq;
 	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irq(q->queue_lock);
 	return 0;
 
 queue_fail:
 	cfq_schedule_dispatch(cfqd);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irq(q->queue_lock);
 	cfq_log(cfqd, "set_request fail");
 	return 1;
 }
-- 
cgit v1.2.3


From f1a4f4d35ff30a328d5ea28f6cc826b2083111d2 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:39 +0100
Subject: block, cfq: fix cic lookup locking

* cfq_cic_lookup() may be called without queue_lock and multiple tasks
  can execute it simultaneously for the same shared ioc.  Nothing
  prevents them racing each other and trying to drop the same dead cic
  entry multiple times.

* smp_wmb() in cfq_exit_cic() doesn't really do anything and nothing
  prevents cfq_cic_lookup() seeing stale cic->key.  This usually
  doesn't blow up because by the time cic is exited, all requests have
  been drained and new requests are terminated before going through
  elevator.  However, it can still be triggered by plug merge path
  which doesn't grab queue_lock and thus can't check DEAD state
  reliably.

This patch updates lookup locking such that,

* Lookup is always performed under queue_lock.  This doesn't add any
  more locking.  The only issue is cfq_allow_merge() which can be
  called from plug merge path without holding any lock.  For now, this
  is worked around by using cic of the request to merge into, which is
  guaranteed to have the same ioc.  For longer term, I think it would
  be best to separate out plug merge method from regular one.

* Spurious ioc->lock locking around cic lookup hint assignment
  dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 67 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 181a63d36691..e617b088c59b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1682,12 +1682,19 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 		return false;
 
 	/*
-	 * Lookup the cfqq that this bio will be queued with. Allow
-	 * merge only if rq is queued there.
+	 * Lookup the cfqq that this bio will be queued with and allow
+	 * merge only if rq is queued there.  This function can be called
+	 * from plug merge without queue_lock.  In such cases, ioc of @rq
+	 * and %current are guaranteed to be equal.  Avoid lookup which
+	 * requires queue_lock by using @rq's cic.
 	 */
-	cic = cfq_cic_lookup(cfqd, current->io_context);
-	if (!cic)
-		return false;
+	if (current->io_context == RQ_CIC(rq)->ioc) {
+		cic = RQ_CIC(rq);
+	} else {
+		cic = cfq_cic_lookup(cfqd, current->io_context);
+		if (!cic)
+			return false;
+	}
 
 	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 	return cfqq == RQ_CFQQ(rq);
@@ -2784,21 +2791,15 @@ static void cfq_exit_cic(struct cfq_io_context *cic)
 	struct io_context *ioc = cic->ioc;
 
 	list_del_init(&cic->queue_list);
+	cic->key = cfqd_dead_key(cfqd);
 
 	/*
-	 * Make sure dead mark is seen for dead queues
+	 * Both setting lookup hint to and clearing it from @cic are done
+	 * under queue_lock.  If it's not pointing to @cic now, it never
+	 * will.  Hint assignment itself can race safely.
 	 */
-	smp_wmb();
-	cic->key = cfqd_dead_key(cfqd);
-
-	rcu_read_lock();
-	if (rcu_dereference(ioc->ioc_data) == cic) {
-		rcu_read_unlock();
-		spin_lock(&ioc->lock);
+	if (rcu_dereference_raw(ioc->ioc_data) == cic)
 		rcu_assign_pointer(ioc->ioc_data, NULL);
-		spin_unlock(&ioc->lock);
-	} else
-		rcu_read_unlock();
 
 	if (cic->cfqq[BLK_RW_ASYNC]) {
 		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -3092,12 +3093,20 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
 	cfq_cic_free(cic);
 }
 
+/**
+ * cfq_cic_lookup - lookup cfq_io_context
+ * @cfqd: the associated cfq_data
+ * @ioc: the associated io_context
+ *
+ * Look up cfq_io_context associated with @cfqd - @ioc pair.  Must be
+ * called with queue_lock held.
+ */
 static struct cfq_io_context *
 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
 	struct cfq_io_context *cic;
-	unsigned long flags;
 
+	lockdep_assert_held(cfqd->queue->queue_lock);
 	if (unlikely(!ioc))
 		return NULL;
 
@@ -3107,28 +3116,22 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	 * we maintain a last-hit cache, to avoid browsing over the tree
 	 */
 	cic = rcu_dereference(ioc->ioc_data);
-	if (cic && cic->key == cfqd) {
-		rcu_read_unlock();
-		return cic;
-	}
+	if (cic && cic->key == cfqd)
+		goto out;
 
 	do {
 		cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
-		rcu_read_unlock();
 		if (!cic)
 			break;
-		if (unlikely(cic->key != cfqd)) {
-			cfq_drop_dead_cic(cfqd, ioc, cic);
-			rcu_read_lock();
-			continue;
+		if (likely(cic->key == cfqd)) {
+			/* hint assignment itself can race safely */
+			rcu_assign_pointer(ioc->ioc_data, cic);
+			break;
 		}
-
-		spin_lock_irqsave(&ioc->lock, flags);
-		rcu_assign_pointer(ioc->ioc_data, cic);
-		spin_unlock_irqrestore(&ioc->lock, flags);
-		break;
+		cfq_drop_dead_cic(cfqd, ioc, cic);
 	} while (1);
-
+out:
+	rcu_read_unlock();
 	return cic;
 }
 
-- 
cgit v1.2.3


From b2efa05265d62bc29f3a64400fad4b44340eedb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:39 +0100
Subject: block, cfq: unlink cfq_io_context's immediately

cic is association between io_context and request_queue.  A cic is
linked from both ioc and q and should be destroyed when either one
goes away.  As ioc and q both have their own locks, locking becomes a
bit complex - both orders work for removal from one but not from the
other.

Currently, cfq tries to circumvent this locking order issue with RCU.
ioc->lock nests inside queue_lock but the radix tree and cic's are
also protected by RCU allowing either side to walk their lists without
grabbing lock.

This rather unconventional use of RCU quickly devolves into extremely
fragile convolution.  e.g. The following is from cfqd going away too
soon after ioc and q exits raced.

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:
 [   88.503444]
 Pid: 599, comm: hexdump Not tainted 3.1.0-rc10-work+ #158 Bochs Bochs
 RIP: 0010:[<ffffffff81397628>]  [<ffffffff81397628>] cfq_exit_single_io_context+0x58/0xf0
 ...
 Call Trace:
  [<ffffffff81395a4a>] call_for_each_cic+0x5a/0x90
  [<ffffffff81395ab5>] cfq_exit_io_context+0x15/0x20
  [<ffffffff81389130>] exit_io_context+0x100/0x140
  [<ffffffff81098a29>] do_exit+0x579/0x850
  [<ffffffff81098d5b>] do_group_exit+0x5b/0xd0
  [<ffffffff81098de7>] sys_exit_group+0x17/0x20
  [<ffffffff81b02f2b>] system_call_fastpath+0x16/0x1b

The only real hot path here is cic lookup during request
initialization and avoiding extra locking requires very confined use
of RCU.  This patch makes cic removal from both ioc and request_queue
perform double-locking and unlink immediately.

* From q side, the change is almost trivial as ioc->lock nests inside
  queue_lock.  It just needs to grab each ioc->lock as it walks
  cic_list and unlink it.

* From ioc side, it's a bit more difficult because of inversed lock
  order.  ioc needs its lock to walk its cic_list but can't grab the
  matching queue_lock and needs to perform unlock-relock dancing.

  Unlinking is now wholly done from put_io_context() and fast path is
  optimized by using the queue_lock the caller already holds, which is
  by far the most common case.  If the ioc accessed multiple devices,
  it tries with trylock.  In unlikely cases of fast path failure, it
  falls back to full double-locking dance from workqueue.

Double-locking isn't the prettiest thing in the world but it's *far*
simpler and more understandable than RCU trick without adding any
meaningful overhead.

This still leaves a lot of now unnecessary RCU logics.  Future patches
will trim them.

-v2: Vivek pointed out that cic->q was being dereferenced after
     cic->release() was called.  Updated to use local variable @this_q
     instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c        |   2 +-
 block/blk-ioc.c           | 166 ++++++++++++++++++++++++++++++++++++++--------
 block/cfq-iosched.c       |  44 +++---------
 fs/ioprio.c               |   2 +-
 include/linux/blkdev.h    |   3 +
 include/linux/iocontext.h |  12 ++--
 kernel/fork.c             |   2 +-
 7 files changed, 159 insertions(+), 72 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index dc00835aab6a..278869358049 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1649,7 +1649,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
 		ioc_cgroup_changed(ioc);
-		put_io_context(ioc);
+		put_io_context(ioc, NULL);
 	}
 }
 
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 6f59fbad93d9..fb23965595da 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -29,55 +29,164 @@ void get_io_context(struct io_context *ioc)
 }
 EXPORT_SYMBOL(get_io_context);
 
-static void cfq_dtor(struct io_context *ioc)
+/*
+ * Releasing ioc may nest into another put_io_context() leading to nested
+ * fast path release.  As the ioc's can't be the same, this is okay but
+ * makes lockdep whine.  Keep track of nesting and use it as subclass.
+ */
+#ifdef CONFIG_LOCKDEP
+#define ioc_release_depth(q)		((q) ? (q)->ioc_release_depth : 0)
+#define ioc_release_depth_inc(q)	(q)->ioc_release_depth++
+#define ioc_release_depth_dec(q)	(q)->ioc_release_depth--
+#else
+#define ioc_release_depth(q)		0
+#define ioc_release_depth_inc(q)	do { } while (0)
+#define ioc_release_depth_dec(q)	do { } while (0)
+#endif
+
+/*
+ * Slow path for ioc release in put_io_context().  Performs double-lock
+ * dancing to unlink all cic's and then frees ioc.
+ */
+static void ioc_release_fn(struct work_struct *work)
 {
-	if (!hlist_empty(&ioc->cic_list)) {
-		struct cfq_io_context *cic;
+	struct io_context *ioc = container_of(work, struct io_context,
+					      release_work);
+	struct request_queue *last_q = NULL;
+
+	spin_lock_irq(&ioc->lock);
+
+	while (!hlist_empty(&ioc->cic_list)) {
+		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
+							 struct cfq_io_context,
+							 cic_list);
+		struct request_queue *this_q = cic->q;
+
+		if (this_q != last_q) {
+			/*
+			 * Need to switch to @this_q.  Once we release
+			 * @ioc->lock, it can go away along with @cic.
+			 * Hold on to it.
+			 */
+			__blk_get_queue(this_q);
+
+			/*
+			 * blk_put_queue() might sleep thanks to kobject
+			 * idiocy.  Always release both locks, put and
+			 * restart.
+			 */
+			if (last_q) {
+				spin_unlock(last_q->queue_lock);
+				spin_unlock_irq(&ioc->lock);
+				blk_put_queue(last_q);
+			} else {
+				spin_unlock_irq(&ioc->lock);
+			}
+
+			last_q = this_q;
+			spin_lock_irq(this_q->queue_lock);
+			spin_lock(&ioc->lock);
+			continue;
+		}
+		ioc_release_depth_inc(this_q);
+		cic->exit(cic);
+		cic->release(cic);
+		ioc_release_depth_dec(this_q);
+	}
 
-		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
-								cic_list);
-		cic->dtor(ioc);
+	if (last_q) {
+		spin_unlock(last_q->queue_lock);
+		spin_unlock_irq(&ioc->lock);
+		blk_put_queue(last_q);
+	} else {
+		spin_unlock_irq(&ioc->lock);
 	}
+
+	kmem_cache_free(iocontext_cachep, ioc);
 }
 
 /**
  * put_io_context - put a reference of io_context
  * @ioc: io_context to put
+ * @locked_q: request_queue the caller is holding queue_lock of (hint)
  *
  * Decrement reference count of @ioc and release it if the count reaches
- * zero.
+ * zero.  If the caller is holding queue_lock of a queue, it can indicate
+ * that with @locked_q.  This is an optimization hint and the caller is
+ * allowed to pass in %NULL even when it's holding a queue_lock.
  */
-void put_io_context(struct io_context *ioc)
+void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 {
+	struct request_queue *last_q = locked_q;
+	unsigned long flags;
+
 	if (ioc == NULL)
 		return;
 
 	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+	if (locked_q)
+		lockdep_assert_held(locked_q->queue_lock);
 
 	if (!atomic_long_dec_and_test(&ioc->refcount))
 		return;
 
-	rcu_read_lock();
-	cfq_dtor(ioc);
-	rcu_read_unlock();
-
-	kmem_cache_free(iocontext_cachep, ioc);
-}
-EXPORT_SYMBOL(put_io_context);
+	/*
+	 * Destroy @ioc.  This is a bit messy because cic's are chained
+	 * from both ioc and queue, and ioc->lock nests inside queue_lock.
+	 * The inner ioc->lock should be held to walk our cic_list and then
+	 * for each cic the outer matching queue_lock should be grabbed.
+	 * ie. We need to do reverse-order double lock dancing.
+	 *
+	 * Another twist is that we are often called with one of the
+	 * matching queue_locks held as indicated by @locked_q, which
+	 * prevents performing double-lock dance for other queues.
+	 *
+	 * So, we do it in two stages.  The fast path uses the queue_lock
+	 * the caller is holding and, if other queues need to be accessed,
+	 * uses trylock to avoid introducing locking dependency.  This can
+	 * handle most cases, especially if @ioc was performing IO on only
+	 * single device.
+	 *
+	 * If trylock doesn't cut it, we defer to @ioc->release_work which
+	 * can do all the double-locking dancing.
+	 */
+	spin_lock_irqsave_nested(&ioc->lock, flags,
+				 ioc_release_depth(locked_q));
+
+	while (!hlist_empty(&ioc->cic_list)) {
+		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
+							 struct cfq_io_context,
+							 cic_list);
+		struct request_queue *this_q = cic->q;
+
+		if (this_q != last_q) {
+			if (last_q && last_q != locked_q)
+				spin_unlock(last_q->queue_lock);
+			last_q = NULL;
+
+			if (!spin_trylock(this_q->queue_lock))
+				break;
+			last_q = this_q;
+			continue;
+		}
+		ioc_release_depth_inc(this_q);
+		cic->exit(cic);
+		cic->release(cic);
+		ioc_release_depth_dec(this_q);
+	}
 
-static void cfq_exit(struct io_context *ioc)
-{
-	rcu_read_lock();
+	if (last_q && last_q != locked_q)
+		spin_unlock(last_q->queue_lock);
 
-	if (!hlist_empty(&ioc->cic_list)) {
-		struct cfq_io_context *cic;
+	spin_unlock_irqrestore(&ioc->lock, flags);
 
-		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
-								cic_list);
-		cic->exit(ioc);
-	}
-	rcu_read_unlock();
+	/* if no cic's left, we're done; otherwise, kick release_work */
+	if (hlist_empty(&ioc->cic_list))
+		kmem_cache_free(iocontext_cachep, ioc);
+	else
+		schedule_work(&ioc->release_work);
 }
+EXPORT_SYMBOL(put_io_context);
 
 /* Called by the exiting task */
 void exit_io_context(struct task_struct *task)
@@ -92,10 +201,8 @@ void exit_io_context(struct task_struct *task)
 	task->io_context = NULL;
 	task_unlock(task);
 
-	if (atomic_dec_and_test(&ioc->nr_tasks))
-		cfq_exit(ioc);
-
-	put_io_context(ioc);
+	atomic_dec(&ioc->nr_tasks);
+	put_io_context(ioc, NULL);
 }
 
 static struct io_context *create_task_io_context(struct task_struct *task,
@@ -115,6 +222,7 @@ static struct io_context *create_task_io_context(struct task_struct *task,
 	spin_lock_init(&ioc->lock);
 	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->cic_list);
+	INIT_WORK(&ioc->release_work, ioc_release_fn);
 
 	/* try to install, somebody might already have beaten us to it */
 	task_lock(task);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e617b088c59b..6cc606560402 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1778,7 +1778,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqd->active_queue = NULL;
 
 	if (cfqd->active_cic) {
-		put_io_context(cfqd->active_cic->ioc);
+		put_io_context(cfqd->active_cic->ioc, cfqd->queue);
 		cfqd->active_cic = NULL;
 	}
 }
@@ -2812,38 +2812,6 @@ static void cfq_exit_cic(struct cfq_io_context *cic)
 	}
 }
 
-static void cfq_exit_single_io_context(struct io_context *ioc,
-				       struct cfq_io_context *cic)
-{
-	struct cfq_data *cfqd = cic_to_cfqd(cic);
-
-	if (cfqd) {
-		struct request_queue *q = cfqd->queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-
-		/*
-		 * Ensure we get a fresh copy of the ->key to prevent
-		 * race between exiting task and queue
-		 */
-		smp_read_barrier_depends();
-		if (cic->key == cfqd)
-			cfq_exit_cic(cic);
-
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
-}
-
-/*
- * The process that ioc belongs to has exited, we need to clean up
- * and put the internal structures we have that belongs to that process.
- */
-static void cfq_exit_io_context(struct io_context *ioc)
-{
-	call_for_each_cic(ioc, cfq_exit_single_io_context);
-}
-
 static struct cfq_io_context *
 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
@@ -2855,8 +2823,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		cic->ttime.last_end_request = jiffies;
 		INIT_LIST_HEAD(&cic->queue_list);
 		INIT_HLIST_NODE(&cic->cic_list);
-		cic->dtor = cfq_free_io_context;
-		cic->exit = cfq_exit_io_context;
+		cic->exit = cfq_exit_cic;
+		cic->release = cfq_release_cic;
 		elv_ioc_count_inc(cfq_ioc_count);
 	}
 
@@ -3726,7 +3694,7 @@ static void cfq_put_request(struct request *rq)
 		BUG_ON(!cfqq->allocated[rw]);
 		cfqq->allocated[rw]--;
 
-		put_io_context(RQ_CIC(rq)->ioc);
+		put_io_context(RQ_CIC(rq)->ioc, cfqq->cfqd->queue);
 
 		rq->elevator_private[0] = NULL;
 		rq->elevator_private[1] = NULL;
@@ -3937,8 +3905,12 @@ static void cfq_exit_queue(struct elevator_queue *e)
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
 							struct cfq_io_context,
 							queue_list);
+		struct io_context *ioc = cic->ioc;
 
+		spin_lock(&ioc->lock);
 		cfq_exit_cic(cic);
+		cfq_release_cic(cic);
+		spin_unlock(&ioc->lock);
 	}
 
 	cfq_put_async_queues(cfqd);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 0f1b9515213b..f84b380d65e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
 		ioc_ioprio_changed(ioc, ioprio);
-		put_io_context(ioc);
+		put_io_context(ioc, NULL);
 	}
 
 	return err;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1b6f4ed1f96..65c2f8c70089 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -393,6 +393,9 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+#ifdef CONFIG_LOCKDEP
+	int			ioc_release_depth;
+#endif
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 2c2b6da96b3c..01e863128780 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -3,6 +3,7 @@
 
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 
 struct cfq_queue;
 struct cfq_ttime {
@@ -33,8 +34,8 @@ struct cfq_io_context {
 
 	unsigned long changed;
 
-	void (*dtor)(struct io_context *); /* destructor */
-	void (*exit)(struct io_context *); /* called on task exit */
+	void (*exit)(struct cfq_io_context *);
+	void (*release)(struct cfq_io_context *);
 
 	struct rcu_head rcu_head;
 };
@@ -61,6 +62,8 @@ struct io_context {
 	struct radix_tree_root radix_root;
 	struct hlist_head cic_list;
 	void __rcu *ioc_data;
+
+	struct work_struct release_work;
 };
 
 static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -79,7 +82,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 
 struct task_struct;
 #ifdef CONFIG_BLOCK
-void put_io_context(struct io_context *ioc);
+void put_io_context(struct io_context *ioc, struct request_queue *locked_q);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
@@ -87,7 +90,8 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
 void ioc_cgroup_changed(struct io_context *ioc);
 #else
 struct io_context;
-static inline void put_io_context(struct io_context *ioc) { }
+static inline void put_io_context(struct io_context *ioc,
+				  struct request_queue *locked_q) { }
 static inline void exit_io_context(struct task_struct *task) { }
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5bcfc739bb7c..2753449f2038 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -887,7 +887,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 			return -ENOMEM;
 
 		new_ioc->ioprio = ioc->ioprio;
-		put_io_context(new_ioc);
+		put_io_context(new_ioc, NULL);
 	}
 #endif
 	return 0;
-- 
cgit v1.2.3


From b9a1920837bc53430d339380e393a6e4c372939f Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:39 +0100
Subject: block, cfq: remove delayed unlink

Now that all cic's are immediately unlinked from both ioc and queue,
lazy dropping from lookup path and trimming on elevator unregister are
unnecessary.  Kill them and remove now unused elevator_ops->trim().

This also leaves call_for_each_cic() without any user.  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c      | 92 ++++++------------------------------------------
 block/elevator.c         | 16 ---------
 include/linux/elevator.h |  1 -
 3 files changed, 10 insertions(+), 99 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6cc606560402..ff44435fad50 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2669,24 +2669,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	cfq_put_cfqg(cfqg);
 }
 
-/*
- * Call func for each cic attached to this ioc.
- */
-static void
-call_for_each_cic(struct io_context *ioc,
-		  void (*func)(struct io_context *, struct cfq_io_context *))
-{
-	struct cfq_io_context *cic;
-	struct hlist_node *n;
-
-	rcu_read_lock();
-
-	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
-		func(ioc, cic);
-
-	rcu_read_unlock();
-}
-
 static void cfq_cic_free_rcu(struct rcu_head *head)
 {
 	struct cfq_io_context *cic;
@@ -2727,31 +2709,6 @@ static void cfq_release_cic(struct cfq_io_context *cic)
 	cfq_cic_free(cic);
 }
 
-static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioc->lock, flags);
-	cfq_release_cic(cic);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-}
-
-/*
- * Must be called with rcu_read_lock() held or preemption otherwise disabled.
- * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
- * and ->trim() which is called with the task lock held
- */
-static void cfq_free_io_context(struct io_context *ioc)
-{
-	/*
-	 * ioc->refcount is zero here, or we are called from elv_unregister(),
-	 * so no more cic's are allowed to be linked into this ioc.  So it
-	 * should be ok to iterate over the known list, we will see all cic's
-	 * since no new ones are added.
-	 */
-	call_for_each_cic(ioc, cic_free_func);
-}
-
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
 {
 	struct cfq_queue *__cfqq, *next;
@@ -3037,30 +2994,6 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	return cfqq;
 }
 
-/*
- * We drop cfq io contexts lazily, so we may find a dead one.
- */
-static void
-cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
-		  struct cfq_io_context *cic)
-{
-	unsigned long flags;
-
-	WARN_ON(!list_empty(&cic->queue_list));
-	BUG_ON(cic->key != cfqd_dead_key(cfqd));
-
-	spin_lock_irqsave(&ioc->lock, flags);
-
-	BUG_ON(rcu_dereference_check(ioc->ioc_data,
-		lockdep_is_held(&ioc->lock)) == cic);
-
-	radix_tree_delete(&ioc->radix_root, cfqd->queue->id);
-	hlist_del_rcu(&cic->cic_list);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-
-	cfq_cic_free(cic);
-}
-
 /**
  * cfq_cic_lookup - lookup cfq_io_context
  * @cfqd: the associated cfq_data
@@ -3078,26 +3011,22 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	if (unlikely(!ioc))
 		return NULL;
 
-	rcu_read_lock();
-
 	/*
-	 * we maintain a last-hit cache, to avoid browsing over the tree
+	 * cic's are indexed from @ioc using radix tree and hint pointer,
+	 * both of which are protected with RCU.  All removals are done
+	 * holding both q and ioc locks, and we're holding q lock - if we
+	 * find a cic which points to us, it's guaranteed to be valid.
 	 */
+	rcu_read_lock();
 	cic = rcu_dereference(ioc->ioc_data);
 	if (cic && cic->key == cfqd)
 		goto out;
 
-	do {
-		cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
-		if (!cic)
-			break;
-		if (likely(cic->key == cfqd)) {
-			/* hint assignment itself can race safely */
-			rcu_assign_pointer(ioc->ioc_data, cic);
-			break;
-		}
-		cfq_drop_dead_cic(cfqd, ioc, cic);
-	} while (1);
+	cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
+	if (cic && cic->key == cfqd)
+		rcu_assign_pointer(ioc->ioc_data, cic);	/* allowed to race */
+	else
+		cic = NULL;
 out:
 	rcu_read_unlock();
 	return cic;
@@ -4182,7 +4111,6 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
-		.trim =				cfq_free_io_context,
 	},
 	.elevator_attrs =	cfq_attrs,
 	.elevator_name =	"cfq",
diff --git a/block/elevator.c b/block/elevator.c
index 66343d6917d0..6a343e8f8319 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -913,22 +913,6 @@ EXPORT_SYMBOL_GPL(elv_register);
 
 void elv_unregister(struct elevator_type *e)
 {
-	struct task_struct *g, *p;
-
-	/*
-	 * Iterate every thread in the process to remove the io contexts.
-	 */
-	if (e->ops.trim) {
-		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
-			task_lock(p);
-			if (p->io_context)
-				e->ops.trim(p->io_context);
-			task_unlock(p);
-		} while_each_thread(g, p);
-		read_unlock(&tasklist_lock);
-	}
-
 	spin_lock(&elv_list_lock);
 	list_del_init(&e->list);
 	spin_unlock(&elv_list_lock);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1d0f7a2ff73b..581dd1bd3d3e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -63,7 +63,6 @@ struct elevator_ops
 
 	elevator_init_fn *elevator_init_fn;
 	elevator_exit_fn *elevator_exit_fn;
-	void (*trim)(struct io_context *);
 };
 
 #define ELV_NAME_MAX	(16)
-- 
cgit v1.2.3


From b50b636bce6293fa858cc7ff6c3ffe4920d90006 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:39 +0100
Subject: block, cfq: kill ioc_gone

Now that cic's are immediately unlinked under both locks, there's no
need to count and drain cic's before module unload.  RCU callback
completion is waited with rcu_barrier().

While at it, remove residual RCU operations on cic_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c      | 43 +++++--------------------------------------
 include/linux/elevator.h | 17 -----------------
 2 files changed, 5 insertions(+), 55 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ff44435fad50..ae7791a8ded9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -62,10 +62,6 @@ static const int cfq_hist_divisor = 4;
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
 
-static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
-static struct completion *ioc_gone;
-static DEFINE_SPINLOCK(ioc_gone_lock);
-
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -2671,26 +2667,8 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 
 static void cfq_cic_free_rcu(struct rcu_head *head)
 {
-	struct cfq_io_context *cic;
-
-	cic = container_of(head, struct cfq_io_context, rcu_head);
-
-	kmem_cache_free(cfq_ioc_pool, cic);
-	elv_ioc_count_dec(cfq_ioc_count);
-
-	if (ioc_gone) {
-		/*
-		 * CFQ scheduler is exiting, grab exit lock and check
-		 * the pending io context count. If it hits zero,
-		 * complete ioc_gone and set it back to NULL
-		 */
-		spin_lock(&ioc_gone_lock);
-		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
-			complete(ioc_gone);
-			ioc_gone = NULL;
-		}
-		spin_unlock(&ioc_gone_lock);
-	}
+	kmem_cache_free(cfq_ioc_pool,
+			container_of(head, struct cfq_io_context, rcu_head));
 }
 
 static void cfq_cic_free(struct cfq_io_context *cic)
@@ -2705,7 +2683,7 @@ static void cfq_release_cic(struct cfq_io_context *cic)
 
 	BUG_ON(!(dead_key & CIC_DEAD_KEY));
 	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
-	hlist_del_rcu(&cic->cic_list);
+	hlist_del(&cic->cic_list);
 	cfq_cic_free(cic);
 }
 
@@ -2782,7 +2760,6 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		INIT_HLIST_NODE(&cic->cic_list);
 		cic->exit = cfq_exit_cic;
 		cic->release = cfq_release_cic;
-		elv_ioc_count_inc(cfq_ioc_count);
 	}
 
 	return cic;
@@ -3072,7 +3049,7 @@ static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 
 	ret = radix_tree_insert(&ioc->radix_root, q->id, cic);
 	if (likely(!ret)) {
-		hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
+		hlist_add_head(&cic->cic_list, &ioc->cic_list);
 		list_add(&cic->queue_list, &cfqd->cic_list);
 		cic = NULL;
 	} else if (ret == -EEXIST) {
@@ -4156,19 +4133,9 @@ static int __init cfq_init(void)
 
 static void __exit cfq_exit(void)
 {
-	DECLARE_COMPLETION_ONSTACK(all_gone);
 	blkio_policy_unregister(&blkio_policy_cfq);
 	elv_unregister(&iosched_cfq);
-	ioc_gone = &all_gone;
-	/* ioc_gone's update must be visible before reading ioc_count */
-	smp_wmb();
-
-	/*
-	 * this also protects us from entering cfq_slab_kill() with
-	 * pending RCU callbacks
-	 */
-	if (elv_ioc_count_read(cfq_ioc_count))
-		wait_for_completion(&all_gone);
+	rcu_barrier();	/* make sure all cic RCU frees are complete */
 	cfq_slab_kill();
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 581dd1bd3d3e..02604c89ddde 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -196,22 +196,5 @@ enum {
 	INIT_LIST_HEAD(&(rq)->csd.list);	\
 	} while (0)
 
-/*
- * io context count accounting
- */
-#define elv_ioc_count_mod(name, __val) this_cpu_add(name, __val)
-#define elv_ioc_count_inc(name)	this_cpu_inc(name)
-#define elv_ioc_count_dec(name)	this_cpu_dec(name)
-
-#define elv_ioc_count_read(name)				\
-({								\
-	unsigned long __val = 0;				\
-	int __cpu;						\
-	smp_wmb();						\
-	for_each_possible_cpu(__cpu)				\
-		__val += per_cpu(name, __cpu);			\
-	__val;							\
-})
-
 #endif /* CONFIG_BLOCK */
 #endif
-- 
cgit v1.2.3


From 1238033c79e92e5c315af12e45396f1a78c73dec Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:40 +0100
Subject: block, cfq: kill cic->key

Now that lazy paths are removed, cfqd_dead_key() is meaningless and
cic->q can be used whereever cic->key is used.  Kill cic->key.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c       | 26 +++++---------------------
 include/linux/iocontext.h |  1 -
 2 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ae7791a8ded9..3b07ce168780 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -472,22 +472,9 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
 	cic->cfqq[is_sync] = cfqq;
 }
 
-#define CIC_DEAD_KEY	1ul
-#define CIC_DEAD_INDEX_SHIFT	1
-
-static inline void *cfqd_dead_key(struct cfq_data *cfqd)
-{
-	return (void *)(cfqd->queue->id << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
-}
-
 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
 {
-	struct cfq_data *cfqd = cic->key;
-
-	if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
-		return NULL;
-
-	return cfqd;
+	return cic->q->elevator->elevator_data;
 }
 
 /*
@@ -2679,10 +2666,8 @@ static void cfq_cic_free(struct cfq_io_context *cic)
 static void cfq_release_cic(struct cfq_io_context *cic)
 {
 	struct io_context *ioc = cic->ioc;
-	unsigned long dead_key = (unsigned long) cic->key;
 
-	BUG_ON(!(dead_key & CIC_DEAD_KEY));
-	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
+	radix_tree_delete(&ioc->radix_root, cic->q->id);
 	hlist_del(&cic->cic_list);
 	cfq_cic_free(cic);
 }
@@ -2726,7 +2711,6 @@ static void cfq_exit_cic(struct cfq_io_context *cic)
 	struct io_context *ioc = cic->ioc;
 
 	list_del_init(&cic->queue_list);
-	cic->key = cfqd_dead_key(cfqd);
 
 	/*
 	 * Both setting lookup hint to and clearing it from @cic are done
@@ -2982,6 +2966,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 static struct cfq_io_context *
 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
+	struct request_queue *q = cfqd->queue;
 	struct cfq_io_context *cic;
 
 	lockdep_assert_held(cfqd->queue->queue_lock);
@@ -2996,11 +2981,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	 */
 	rcu_read_lock();
 	cic = rcu_dereference(ioc->ioc_data);
-	if (cic && cic->key == cfqd)
+	if (cic && cic->q == q)
 		goto out;
 
 	cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
-	if (cic && cic->key == cfqd)
+	if (cic && cic->q == q)
 		rcu_assign_pointer(ioc->ioc_data, cic);	/* allowed to race */
 	else
 		cic = NULL;
@@ -3040,7 +3025,6 @@ static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 		goto out;
 
 	cic->ioc = ioc;
-	cic->key = cfqd;
 	cic->q = cfqd->queue;
 
 	/* lock both q and ioc and try to link @cic */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 01e863128780..b2b75a54f252 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -20,7 +20,6 @@ enum {
 };
 
 struct cfq_io_context {
-	void *key;
 	struct request_queue *q;
 
 	struct cfq_queue *cfqq[2];
-- 
cgit v1.2.3


From f2dbd76a0a994bc1d5a3d0e7c844cc373832e86c Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:40 +0100
Subject: block, cfq: replace current_io_context() with create_io_context()

When called under queue_lock, current_io_context() triggers lockdep
warning if it hits allocation path.  This is because io_context
installation is protected by task_lock which is not IRQ safe, so it
triggers irq-unsafe-lock -> irq -> irq-safe-lock -> irq-unsafe-lock
deadlock warning.

Given the restriction, accessor + creator rolled into one doesn't work
too well.  Drop current_io_context() and let the users access
task->io_context directly inside queue_lock combined with explicit
creation using create_io_context().

Future ioc updates will further consolidate ioc access and the create
interface will be unexported.

While at it, relocate ioc internal interface declarations in blk.h and
add section comments before and after.

This patch does not introduce functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c    | 25 ++++++++++++++++-----
 block/blk-ioc.c     | 62 +++++++++++++++--------------------------------------
 block/blk.h         | 36 ++++++++++++++++++++++++++++---
 block/cfq-iosched.c |  2 +-
 4 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fd4749391e17..6804fdf27eff 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -771,9 +771,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
-	struct io_context *ioc = NULL;
+	struct io_context *ioc;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
+	bool retried = false;
 	int may_queue;
+retry:
+	ioc = current->io_context;
 
 	if (unlikely(blk_queue_dead(q)))
 		return NULL;
@@ -784,7 +787,20 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 
 	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
 		if (rl->count[is_sync]+1 >= q->nr_requests) {
-			ioc = current_io_context(GFP_ATOMIC, q->node);
+			/*
+			 * We want ioc to record batching state.  If it's
+			 * not already there, creating a new one requires
+			 * dropping queue_lock, which in turn requires
+			 * retesting conditions to avoid queue hang.
+			 */
+			if (!ioc && !retried) {
+				spin_unlock_irq(q->queue_lock);
+				create_io_context(current, gfp_mask, q->node);
+				spin_lock_irq(q->queue_lock);
+				retried = true;
+				goto retry;
+			}
+
 			/*
 			 * The queue will fill after this allocation, so set
 			 * it as full, and mark this process as "batching".
@@ -892,7 +908,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 	rq = get_request(q, rw_flags, bio, GFP_NOIO);
 	while (!rq) {
 		DEFINE_WAIT(wait);
-		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
 		if (unlikely(blk_queue_dead(q)))
@@ -912,8 +927,8 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		 * up to a big batch of them for a small period time.
 		 * See ioc_batching, ioc_set_batching
 		 */
-		ioc = current_io_context(GFP_NOIO, q->node);
-		ioc_set_batching(q, ioc);
+		create_io_context(current, GFP_NOIO, q->node);
+		ioc_set_batching(q, current->io_context);
 
 		spin_lock_irq(q->queue_lock);
 		finish_wait(&rl->wait[is_sync], &wait);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index fb23965595da..e23c797b4685 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -205,16 +205,15 @@ void exit_io_context(struct task_struct *task)
 	put_io_context(ioc, NULL);
 }
 
-static struct io_context *create_task_io_context(struct task_struct *task,
-						 gfp_t gfp_flags, int node,
-						 bool take_ref)
+void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
+				int node)
 {
 	struct io_context *ioc;
 
 	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
 				    node);
 	if (unlikely(!ioc))
-		return NULL;
+		return;
 
 	/* initialize */
 	atomic_long_set(&ioc->refcount, 1);
@@ -226,42 +225,13 @@ static struct io_context *create_task_io_context(struct task_struct *task,
 
 	/* try to install, somebody might already have beaten us to it */
 	task_lock(task);
-
-	if (!task->io_context && !(task->flags & PF_EXITING)) {
+	if (!task->io_context && !(task->flags & PF_EXITING))
 		task->io_context = ioc;
-	} else {
+	else
 		kmem_cache_free(iocontext_cachep, ioc);
-		ioc = task->io_context;
-	}
-
-	if (ioc && take_ref)
-		get_io_context(ioc);
-
 	task_unlock(task);
-	return ioc;
 }
-
-/**
- * current_io_context - get io_context of %current
- * @gfp_flags: allocation flags, used if allocation is necessary
- * @node: allocation node, used if allocation is necessary
- *
- * Return io_context of %current.  If it doesn't exist, it is created with
- * @gfp_flags and @node.  The returned io_context does NOT have its
- * reference count incremented.  Because io_context is exited only on task
- * exit, %current can be sure that the returned io_context is valid and
- * alive as long as it is executing.
- */
-struct io_context *current_io_context(gfp_t gfp_flags, int node)
-{
-	might_sleep_if(gfp_flags & __GFP_WAIT);
-
-	if (current->io_context)
-		return current->io_context;
-
-	return create_task_io_context(current, gfp_flags, node, false);
-}
-EXPORT_SYMBOL(current_io_context);
+EXPORT_SYMBOL(create_io_context_slowpath);
 
 /**
  * get_task_io_context - get io_context of a task
@@ -274,7 +244,7 @@ EXPORT_SYMBOL(current_io_context);
  * incremented.
  *
  * This function always goes through task_lock() and it's better to use
- * current_io_context() + get_io_context() for %current.
+ * %current->io_context + get_io_context() for %current.
  */
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node)
@@ -283,16 +253,18 @@ struct io_context *get_task_io_context(struct task_struct *task,
 
 	might_sleep_if(gfp_flags & __GFP_WAIT);
 
-	task_lock(task);
-	ioc = task->io_context;
-	if (likely(ioc)) {
-		get_io_context(ioc);
+	do {
+		task_lock(task);
+		ioc = task->io_context;
+		if (likely(ioc)) {
+			get_io_context(ioc);
+			task_unlock(task);
+			return ioc;
+		}
 		task_unlock(task);
-		return ioc;
-	}
-	task_unlock(task);
+	} while (create_io_context(task, gfp_flags, node));
 
-	return create_task_io_context(task, gfp_flags, node, true);
+	return NULL;
 }
 EXPORT_SYMBOL(get_task_io_context);
 
diff --git a/block/blk.h b/block/blk.h
index 8d421156fefb..5bca2668e1bf 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -127,9 +127,6 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
 }
 #endif
 
-void get_io_context(struct io_context *ioc);
-struct io_context *current_io_context(gfp_t gfp_flags, int node);
-
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		     struct bio *bio);
 int ll_front_merge_fn(struct request_queue *q, struct request *req, 
@@ -198,6 +195,39 @@ static inline int blk_do_io_stat(struct request *rq)
 	        (rq->cmd_flags & REQ_DISCARD));
 }
 
+/*
+ * Internal io_context interface
+ */
+void get_io_context(struct io_context *ioc);
+
+void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
+				int node);
+
+/**
+ * create_io_context - try to create task->io_context
+ * @task: target task
+ * @gfp_mask: allocation mask
+ * @node: allocation node
+ *
+ * If @task->io_context is %NULL, allocate a new io_context and install it.
+ * Returns the current @task->io_context which may be %NULL if allocation
+ * failed.
+ *
+ * Note that this function can't be called with IRQ disabled because
+ * task_lock which protects @task->io_context is IRQ-unsafe.
+ */
+static inline struct io_context *create_io_context(struct task_struct *task,
+						   gfp_t gfp_mask, int node)
+{
+	WARN_ON_ONCE(irqs_disabled());
+	if (unlikely(!task->io_context))
+		create_io_context_slowpath(task, gfp_mask, node);
+	return task->io_context;
+}
+
+/*
+ * Internal throttling interface
+ */
 #ifdef CONFIG_BLK_DEV_THROTTLING
 extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern void blk_throtl_drain(struct request_queue *q);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3b07ce168780..5f7e4d161404 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3012,7 +3012,7 @@ static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	/* allocate stuff */
-	ioc = current_io_context(gfp_mask, q->node);
+	ioc = create_io_context(current, gfp_mask, q->node);
 	if (!ioc)
 		goto out;
 
-- 
cgit v1.2.3


From f8fc877d3c1f10457d0d73d8540a0c51a1fa718a Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:40 +0100
Subject: block: reorder elevator switch sequence

Elevator switch sequence first attached the new elevator, then tried
registering it (sysfs) and if that failed attached back the old
elevator.  However, sysfs registration doesn't require the elevator to
be attached, so there is no reason to do the "detach, attach new,
register, maybe re-attach old" sequence.  It can just do "register,
detach, attach".

* elevator_init_queue() is updated to set ->elevator_data directly and
  return 0 / -errno.  This allows elevator_exit() on an unattached
  elevator.

* __elv_unregister_queue() which was necessary to unregister
  unattached q is removed in favor of __elv_register_queue() which can
  register unattached q.

* elevator_attach() becomes a single assignment and obscures more then
  it helps.  Dropped.

This will help cleaning up io_context handling across elevator switch.

This patch doesn't introduce visible behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/elevator.c | 91 +++++++++++++++++++++++---------------------------------
 1 file changed, 37 insertions(+), 54 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 6a343e8f8319..a16c2d1713e5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -168,17 +168,13 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static void *elevator_init_queue(struct request_queue *q,
-				 struct elevator_queue *eq)
+static int elevator_init_queue(struct request_queue *q,
+			       struct elevator_queue *eq)
 {
-	return eq->ops->elevator_init_fn(q);
-}
-
-static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
-			   void *data)
-{
-	q->elevator = eq;
-	eq->elevator_data = data;
+	eq->elevator_data = eq->ops->elevator_init_fn(q);
+	if (eq->elevator_data)
+		return 0;
+	return -ENOMEM;
 }
 
 static char chosen_elevator[ELV_NAME_MAX];
@@ -241,7 +237,7 @@ int elevator_init(struct request_queue *q, char *name)
 {
 	struct elevator_type *e = NULL;
 	struct elevator_queue *eq;
-	void *data;
+	int err;
 
 	if (unlikely(q->elevator))
 		return 0;
@@ -278,13 +274,13 @@ int elevator_init(struct request_queue *q, char *name)
 	if (!eq)
 		return -ENOMEM;
 
-	data = elevator_init_queue(q, eq);
-	if (!data) {
+	err = elevator_init_queue(q, eq);
+	if (err) {
 		kobject_put(&eq->kobj);
-		return -ENOMEM;
+		return err;
 	}
 
-	elevator_attach(q, eq, data);
+	q->elevator = eq;
 	return 0;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -856,9 +852,8 @@ static struct kobj_type elv_ktype = {
 	.release	= elevator_release,
 };
 
-int elv_register_queue(struct request_queue *q)
+int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
 {
-	struct elevator_queue *e = q->elevator;
 	int error;
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -876,19 +871,22 @@ int elv_register_queue(struct request_queue *q)
 	}
 	return error;
 }
-EXPORT_SYMBOL(elv_register_queue);
 
-static void __elv_unregister_queue(struct elevator_queue *e)
+int elv_register_queue(struct request_queue *q)
 {
-	kobject_uevent(&e->kobj, KOBJ_REMOVE);
-	kobject_del(&e->kobj);
-	e->registered = 0;
+	return __elv_register_queue(q, q->elevator);
 }
+EXPORT_SYMBOL(elv_register_queue);
 
 void elv_unregister_queue(struct request_queue *q)
 {
-	if (q)
-		__elv_unregister_queue(q->elevator);
+	if (q) {
+		struct elevator_queue *e = q->elevator;
+
+		kobject_uevent(&e->kobj, KOBJ_REMOVE);
+		kobject_del(&e->kobj);
+		e->registered = 0;
+	}
 }
 EXPORT_SYMBOL(elv_unregister_queue);
 
@@ -928,50 +926,36 @@ EXPORT_SYMBOL_GPL(elv_unregister);
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old_elevator, *e;
-	void *data;
 	int err;
 
-	/*
-	 * Allocate new elevator
-	 */
+	/* allocate new elevator */
 	e = elevator_alloc(q, new_e);
 	if (!e)
 		return -ENOMEM;
 
-	data = elevator_init_queue(q, e);
-	if (!data) {
+	err = elevator_init_queue(q, e);
+	if (err) {
 		kobject_put(&e->kobj);
-		return -ENOMEM;
+		return err;
 	}
 
-	/*
-	 * Turn on BYPASS and drain all requests w/ elevator private data
-	 */
+	/* turn on BYPASS and drain all requests w/ elevator private data */
 	elv_quiesce_start(q);
 
-	/*
-	 * Remember old elevator.
-	 */
-	old_elevator = q->elevator;
-
-	/*
-	 * attach and start new elevator
-	 */
-	spin_lock_irq(q->queue_lock);
-	elevator_attach(q, e, data);
-	spin_unlock_irq(q->queue_lock);
-
-	if (old_elevator->registered) {
-		__elv_unregister_queue(old_elevator);
-
-		err = elv_register_queue(q);
+	/* unregister old queue, register new one and kill old elevator */
+	if (q->elevator->registered) {
+		elv_unregister_queue(q);
+		err = __elv_register_queue(q, e);
 		if (err)
 			goto fail_register;
 	}
 
-	/*
-	 * finally exit old elevator and turn off BYPASS.
-	 */
+	/* done, replace the old one with new one and turn off BYPASS */
+	spin_lock_irq(q->queue_lock);
+	old_elevator = q->elevator;
+	q->elevator = e;
+	spin_unlock_irq(q->queue_lock);
+
 	elevator_exit(old_elevator);
 	elv_quiesce_end(q);
 
@@ -985,7 +969,6 @@ fail_register:
 	 * one again (along with re-adding the sysfs dir)
 	 */
 	elevator_exit(e);
-	q->elevator = old_elevator;
 	elv_register_queue(q);
 	elv_quiesce_end(q);
 
-- 
cgit v1.2.3


From 22f746e235a5cbee2a6ca9887b1be2aa7d31fe71 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:41 +0100
Subject: block: remove elevator_queue->ops

elevator_queue->ops points to the same ops struct ->elevator_type.ops
is pointing to.  The only effect of caching it in elevator_queue is
shorter notation - it doesn't save any indirect derefence.

Relocate elevator_type->list which used only during module init/exit
to the end of the structure, rename elevator_queue->elevator_type to
->type, and replace elevator_queue->ops with elevator_queue->type.ops.

This doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk.h              | 10 +++----
 block/elevator.c         | 74 +++++++++++++++++++++++-------------------------
 include/linux/elevator.h |  5 ++--
 3 files changed, 43 insertions(+), 46 deletions(-)

diff --git a/block/blk.h b/block/blk.h
index 5bca2668e1bf..4943770e0792 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -94,7 +94,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			return NULL;
 		}
 		if (unlikely(blk_queue_dead(q)) ||
-		    !q->elevator->ops->elevator_dispatch_fn(q, 0))
+		    !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
 			return NULL;
 	}
 }
@@ -103,16 +103,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_activate_req_fn)
-		e->ops->elevator_activate_req_fn(q, rq);
+	if (e->type->ops.elevator_activate_req_fn)
+		e->type->ops.elevator_activate_req_fn(q, rq);
 }
 
 static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_deactivate_req_fn)
-		e->ops->elevator_deactivate_req_fn(q, rq);
+	if (e->type->ops.elevator_deactivate_req_fn)
+		e->type->ops.elevator_deactivate_req_fn(q, rq);
 }
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
diff --git a/block/elevator.c b/block/elevator.c
index a16c2d1713e5..31ffe76aed3d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,8 +61,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_allow_merge_fn)
-		return e->ops->elevator_allow_merge_fn(q, rq, bio);
+	if (e->type->ops.elevator_allow_merge_fn)
+		return e->type->ops.elevator_allow_merge_fn(q, rq, bio);
 
 	return 1;
 }
@@ -171,7 +171,7 @@ static struct elevator_type *elevator_get(const char *name)
 static int elevator_init_queue(struct request_queue *q,
 			       struct elevator_queue *eq)
 {
-	eq->elevator_data = eq->ops->elevator_init_fn(q);
+	eq->elevator_data = eq->type->ops.elevator_init_fn(q);
 	if (eq->elevator_data)
 		return 0;
 	return -ENOMEM;
@@ -203,8 +203,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
 	if (unlikely(!eq))
 		goto err;
 
-	eq->ops = &e->ops;
-	eq->elevator_type = e;
+	eq->type = e;
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 
@@ -228,7 +227,7 @@ static void elevator_release(struct kobject *kobj)
 	struct elevator_queue *e;
 
 	e = container_of(kobj, struct elevator_queue, kobj);
-	elevator_put(e->elevator_type);
+	elevator_put(e->type);
 	kfree(e->hash);
 	kfree(e);
 }
@@ -288,9 +287,8 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->ops->elevator_exit_fn)
-		e->ops->elevator_exit_fn(e);
-	e->ops = NULL;
+	if (e->type->ops.elevator_exit_fn)
+		e->type->ops.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
 	kobject_put(&e->kobj);
@@ -500,8 +498,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->ops->elevator_merge_fn)
-		return e->ops->elevator_merge_fn(q, req, bio);
+	if (e->type->ops.elevator_merge_fn)
+		return e->type->ops.elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
 }
@@ -544,8 +542,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_merged_fn)
-		e->ops->elevator_merged_fn(q, rq, type);
+	if (e->type->ops.elevator_merged_fn)
+		e->type->ops.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
 		elv_rqhash_reposition(q, rq);
@@ -559,8 +557,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	struct elevator_queue *e = q->elevator;
 	const int next_sorted = next->cmd_flags & REQ_SORTED;
 
-	if (next_sorted && e->ops->elevator_merge_req_fn)
-		e->ops->elevator_merge_req_fn(q, rq, next);
+	if (next_sorted && e->type->ops.elevator_merge_req_fn)
+		e->type->ops.elevator_merge_req_fn(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
 
@@ -577,8 +575,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_bio_merged_fn)
-		e->ops->elevator_bio_merged_fn(q, rq, bio);
+	if (e->type->ops.elevator_bio_merged_fn)
+		e->type->ops.elevator_bio_merged_fn(q, rq, bio);
 }
 
 void elv_requeue_request(struct request_queue *q, struct request *rq)
@@ -604,12 +602,12 @@ void elv_drain_elevator(struct request_queue *q)
 
 	lockdep_assert_held(q->queue_lock);
 
-	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
+	while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
 		;
 	if (q->nr_sorted && printed++ < 10) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
 		       "(nr_sorted=%u), please report this\n",
-		       q->elevator->elevator_type->elevator_name, q->nr_sorted);
+		       q->elevator->type->elevator_name, q->nr_sorted);
 	}
 }
 
@@ -698,7 +696,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		 * rq cannot be accessed after calling
 		 * elevator_add_req_fn.
 		 */
-		q->elevator->ops->elevator_add_req_fn(q, rq);
+		q->elevator->type->ops.elevator_add_req_fn(q, rq);
 		break;
 
 	case ELEVATOR_INSERT_FLUSH:
@@ -727,8 +725,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_latter_req_fn)
-		return e->ops->elevator_latter_req_fn(q, rq);
+	if (e->type->ops.elevator_latter_req_fn)
+		return e->type->ops.elevator_latter_req_fn(q, rq);
 	return NULL;
 }
 
@@ -736,8 +734,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_former_req_fn)
-		return e->ops->elevator_former_req_fn(q, rq);
+	if (e->type->ops.elevator_former_req_fn)
+		return e->type->ops.elevator_former_req_fn(q, rq);
 	return NULL;
 }
 
@@ -745,8 +743,8 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_set_req_fn)
-		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
+	if (e->type->ops.elevator_set_req_fn)
+		return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
 
 	rq->elevator_private[0] = NULL;
 	return 0;
@@ -756,16 +754,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_put_req_fn)
-		e->ops->elevator_put_req_fn(rq);
+	if (e->type->ops.elevator_put_req_fn)
+		e->type->ops.elevator_put_req_fn(rq);
 }
 
 int elv_may_queue(struct request_queue *q, int rw)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->ops->elevator_may_queue_fn)
-		return e->ops->elevator_may_queue_fn(q, rw);
+	if (e->type->ops.elevator_may_queue_fn)
+		return e->type->ops.elevator_may_queue_fn(q, rw);
 
 	return ELV_MQUEUE_MAY;
 }
@@ -800,8 +798,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
 		if ((rq->cmd_flags & REQ_SORTED) &&
-		    e->ops->elevator_completed_req_fn)
-			e->ops->elevator_completed_req_fn(q, rq);
+		    e->type->ops.elevator_completed_req_fn)
+			e->type->ops.elevator_completed_req_fn(q, rq);
 	}
 }
 
@@ -819,7 +817,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 
 	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
-	error = e->ops ? entry->show(e, page) : -ENOENT;
+	error = e->type ? entry->show(e, page) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
 	return error;
 }
@@ -837,7 +835,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
-	error = e->ops ? entry->store(e, page, length) : -ENOENT;
+	error = e->type ? entry->store(e, page, length) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
 	return error;
 }
@@ -858,7 +856,7 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
 	if (!error) {
-		struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
+		struct elv_fs_entry *attr = e->type->elevator_attrs;
 		if (attr) {
 			while (attr->attr.name) {
 				if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -959,7 +957,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	elevator_exit(old_elevator);
 	elv_quiesce_end(q);
 
-	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
+	blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);
 
 	return 0;
 
@@ -993,7 +991,7 @@ int elevator_change(struct request_queue *q, const char *name)
 		return -EINVAL;
 	}
 
-	if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
+	if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
 		elevator_put(e);
 		return 0;
 	}
@@ -1028,7 +1026,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 	if (!q->elevator || !blk_queue_stackable(q))
 		return sprintf(name, "none\n");
 
-	elv = e->elevator_type;
+	elv = e->type;
 
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 02604c89ddde..04958ef53e62 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -78,11 +78,11 @@ struct elv_fs_entry {
  */
 struct elevator_type
 {
-	struct list_head list;
 	struct elevator_ops ops;
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
+	struct list_head list;
 };
 
 /*
@@ -90,10 +90,9 @@ struct elevator_type
  */
 struct elevator_queue
 {
-	struct elevator_ops *ops;
+	struct elevator_type *type;
 	void *elevator_data;
 	struct kobject kobj;
-	struct elevator_type *elevator_type;
 	struct mutex sysfs_lock;
 	struct hlist_head *hash;
 	unsigned int registered:1;
-- 
cgit v1.2.3


From c58698073218f2c8f2fc5982fa3938c2d3803b9f Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:41 +0100
Subject: block, cfq: reorganize cfq_io_context into generic and cfq specific
 parts

Currently io_context and cfq logics are mixed without clear boundary.
Most of io_context is independent from cfq but cfq_io_context handling
logic is dispersed between generic ioc code and cfq.

cfq_io_context represents association between an io_context and a
request_queue, which is a concept useful outside of cfq, but it also
contains fields which are useful only to cfq.

This patch takes out generic part and put it into io_cq (io
context-queue) and the rest into cfq_io_cq (cic moniker remains the
same) which contains io_cq.  The following changes are made together.

* cfq_ttime and cfq_io_cq now live in cfq-iosched.c.

* All related fields, functions and constants are renamed accordingly.

* ioc->ioc_data is now "struct io_cq *" instead of "void *" and
  renamed to icq_hint.

This prepares for io_context API cleanup.  Documentation is currently
sparse.  It will be added later.

Changes in this patch are mechanical and don't cause functional
change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c           |  58 ++++++-----
 block/cfq-iosched.c       | 248 +++++++++++++++++++++++++---------------------
 include/linux/iocontext.h |  43 +++-----
 3 files changed, 175 insertions(+), 174 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index e23c797b4685..dc5e69d335a0 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -46,7 +46,7 @@ EXPORT_SYMBOL(get_io_context);
 
 /*
  * Slow path for ioc release in put_io_context().  Performs double-lock
- * dancing to unlink all cic's and then frees ioc.
+ * dancing to unlink all icq's and then frees ioc.
  */
 static void ioc_release_fn(struct work_struct *work)
 {
@@ -56,11 +56,10 @@ static void ioc_release_fn(struct work_struct *work)
 
 	spin_lock_irq(&ioc->lock);
 
-	while (!hlist_empty(&ioc->cic_list)) {
-		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
-							 struct cfq_io_context,
-							 cic_list);
-		struct request_queue *this_q = cic->q;
+	while (!hlist_empty(&ioc->icq_list)) {
+		struct io_cq *icq = hlist_entry(ioc->icq_list.first,
+						struct io_cq, ioc_node);
+		struct request_queue *this_q = icq->q;
 
 		if (this_q != last_q) {
 			/*
@@ -89,8 +88,8 @@ static void ioc_release_fn(struct work_struct *work)
 			continue;
 		}
 		ioc_release_depth_inc(this_q);
-		cic->exit(cic);
-		cic->release(cic);
+		icq->exit(icq);
+		icq->release(icq);
 		ioc_release_depth_dec(this_q);
 	}
 
@@ -131,10 +130,10 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 		return;
 
 	/*
-	 * Destroy @ioc.  This is a bit messy because cic's are chained
+	 * Destroy @ioc.  This is a bit messy because icq's are chained
 	 * from both ioc and queue, and ioc->lock nests inside queue_lock.
-	 * The inner ioc->lock should be held to walk our cic_list and then
-	 * for each cic the outer matching queue_lock should be grabbed.
+	 * The inner ioc->lock should be held to walk our icq_list and then
+	 * for each icq the outer matching queue_lock should be grabbed.
 	 * ie. We need to do reverse-order double lock dancing.
 	 *
 	 * Another twist is that we are often called with one of the
@@ -153,11 +152,10 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 	spin_lock_irqsave_nested(&ioc->lock, flags,
 				 ioc_release_depth(locked_q));
 
-	while (!hlist_empty(&ioc->cic_list)) {
-		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
-							 struct cfq_io_context,
-							 cic_list);
-		struct request_queue *this_q = cic->q;
+	while (!hlist_empty(&ioc->icq_list)) {
+		struct io_cq *icq = hlist_entry(ioc->icq_list.first,
+						struct io_cq, ioc_node);
+		struct request_queue *this_q = icq->q;
 
 		if (this_q != last_q) {
 			if (last_q && last_q != locked_q)
@@ -170,8 +168,8 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 			continue;
 		}
 		ioc_release_depth_inc(this_q);
-		cic->exit(cic);
-		cic->release(cic);
+		icq->exit(icq);
+		icq->release(icq);
 		ioc_release_depth_dec(this_q);
 	}
 
@@ -180,8 +178,8 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
-	/* if no cic's left, we're done; otherwise, kick release_work */
-	if (hlist_empty(&ioc->cic_list))
+	/* if no icq is left, we're done; otherwise, kick release_work */
+	if (hlist_empty(&ioc->icq_list))
 		kmem_cache_free(iocontext_cachep, ioc);
 	else
 		schedule_work(&ioc->release_work);
@@ -219,8 +217,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 	atomic_long_set(&ioc->refcount, 1);
 	atomic_set(&ioc->nr_tasks, 1);
 	spin_lock_init(&ioc->lock);
-	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
-	INIT_HLIST_HEAD(&ioc->cic_list);
+	INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
+	INIT_HLIST_HEAD(&ioc->icq_list);
 	INIT_WORK(&ioc->release_work, ioc_release_fn);
 
 	/* try to install, somebody might already have beaten us to it */
@@ -270,11 +268,11 @@ EXPORT_SYMBOL(get_task_io_context);
 
 void ioc_set_changed(struct io_context *ioc, int which)
 {
-	struct cfq_io_context *cic;
+	struct io_cq *icq;
 	struct hlist_node *n;
 
-	hlist_for_each_entry(cic, n, &ioc->cic_list, cic_list)
-		set_bit(which, &cic->changed);
+	hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
+		set_bit(which, &icq->changed);
 }
 
 /**
@@ -282,8 +280,8 @@ void ioc_set_changed(struct io_context *ioc, int which)
  * @ioc: io_context of interest
  * @ioprio: new ioprio
  *
- * @ioc's ioprio has changed to @ioprio.  Set %CIC_IOPRIO_CHANGED for all
- * cic's.  iosched is responsible for checking the bit and applying it on
+ * @ioc's ioprio has changed to @ioprio.  Set %ICQ_IOPRIO_CHANGED for all
+ * icq's.  iosched is responsible for checking the bit and applying it on
  * request issue path.
  */
 void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
@@ -292,7 +290,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
 
 	spin_lock_irqsave(&ioc->lock, flags);
 	ioc->ioprio = ioprio;
-	ioc_set_changed(ioc, CIC_IOPRIO_CHANGED);
+	ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
 
@@ -300,7 +298,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
  * ioc_cgroup_changed - notify cgroup change
  * @ioc: io_context of interest
  *
- * @ioc's cgroup has changed.  Set %CIC_CGROUP_CHANGED for all cic's.
+ * @ioc's cgroup has changed.  Set %ICQ_CGROUP_CHANGED for all icq's.
  * iosched is responsible for checking the bit and applying it on request
  * issue path.
  */
@@ -309,7 +307,7 @@ void ioc_cgroup_changed(struct io_context *ioc)
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioc->lock, flags);
-	ioc_set_changed(ioc, CIC_CGROUP_CHANGED);
+	ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5f7e4d161404..d2f16fcdec7f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,13 +54,12 @@ static const int cfq_hist_divisor = 4;
 #define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
 #define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
 
-#define RQ_CIC(rq)		\
-	((struct cfq_io_context *) (rq)->elevator_private[0])
+#define RQ_CIC(rq)		icq_to_cic((rq)->elevator_private[0])
 #define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private[1])
 #define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private[2])
 
 static struct kmem_cache *cfq_pool;
-static struct kmem_cache *cfq_ioc_pool;
+static struct kmem_cache *cfq_icq_pool;
 
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -69,6 +68,14 @@ static struct kmem_cache *cfq_ioc_pool;
 #define sample_valid(samples)	((samples) > 80)
 #define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
 
+struct cfq_ttime {
+	unsigned long last_end_request;
+
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+};
+
 /*
  * Most of our rbtree usage is for sorting with min extraction, so
  * if we cache the leftmost node we don't have to walk down the tree
@@ -210,6 +217,12 @@ struct cfq_group {
 	struct cfq_ttime ttime;
 };
 
+struct cfq_io_cq {
+	struct io_cq		icq;		/* must be the first member */
+	struct cfq_queue	*cfqq[2];
+	struct cfq_ttime	ttime;
+};
+
 /*
  * Per block device queue structure
  */
@@ -261,7 +274,7 @@ struct cfq_data {
 	struct work_struct unplug_work;
 
 	struct cfq_queue *active_queue;
-	struct cfq_io_context *active_cic;
+	struct cfq_io_cq *active_cic;
 
 	/*
 	 * async queue for each priority case
@@ -284,7 +297,7 @@ struct cfq_data {
 	unsigned int cfq_group_idle;
 	unsigned int cfq_latency;
 
-	struct list_head cic_list;
+	struct list_head icq_list;
 
 	/*
 	 * Fallback dummy cfqq for extreme OOM conditions
@@ -457,24 +470,28 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
 				       struct io_context *, gfp_t);
-static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
-						struct io_context *);
+static struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *, struct io_context *);
 
-static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
-					    bool is_sync)
+static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
+{
+	/* cic->icq is the first member, %NULL will convert to %NULL */
+	return container_of(icq, struct cfq_io_cq, icq);
+}
+
+static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 {
 	return cic->cfqq[is_sync];
 }
 
-static inline void cic_set_cfqq(struct cfq_io_context *cic,
-				struct cfq_queue *cfqq, bool is_sync)
+static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
+				bool is_sync)
 {
 	cic->cfqq[is_sync] = cfqq;
 }
 
-static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 {
-	return cic->q->elevator->elevator_data;
+	return cic->icq.q->elevator->elevator_data;
 }
 
 /*
@@ -1541,7 +1558,7 @@ static struct request *
 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 {
 	struct task_struct *tsk = current;
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 	struct cfq_queue *cfqq;
 
 	cic = cfq_cic_lookup(cfqd, tsk->io_context);
@@ -1655,7 +1672,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 			   struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 	struct cfq_queue *cfqq;
 
 	/*
@@ -1671,7 +1688,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 	 * and %current are guaranteed to be equal.  Avoid lookup which
 	 * requires queue_lock by using @rq's cic.
 	 */
-	if (current->io_context == RQ_CIC(rq)->ioc) {
+	if (current->io_context == RQ_CIC(rq)->icq.ioc) {
 		cic = RQ_CIC(rq);
 	} else {
 		cic = cfq_cic_lookup(cfqd, current->io_context);
@@ -1761,7 +1778,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqd->active_queue = NULL;
 
 	if (cfqd->active_cic) {
-		put_io_context(cfqd->active_cic->ioc, cfqd->queue);
+		put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue);
 		cfqd->active_cic = NULL;
 	}
 }
@@ -1981,7 +1998,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 	unsigned long sl, group_idle = 0;
 
 	/*
@@ -2016,7 +2033,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
-	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
+	if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
 		return;
 
 	/*
@@ -2567,9 +2584,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_dispatch_insert(cfqd->queue, rq);
 
 	if (!cfqd->active_cic) {
-		struct cfq_io_context *cic = RQ_CIC(rq);
+		struct cfq_io_cq *cic = RQ_CIC(rq);
 
-		atomic_long_inc(&cic->ioc->refcount);
+		atomic_long_inc(&cic->icq.ioc->refcount);
 		cfqd->active_cic = cic;
 	}
 
@@ -2652,24 +2669,24 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	cfq_put_cfqg(cfqg);
 }
 
-static void cfq_cic_free_rcu(struct rcu_head *head)
+static void cfq_icq_free_rcu(struct rcu_head *head)
 {
-	kmem_cache_free(cfq_ioc_pool,
-			container_of(head, struct cfq_io_context, rcu_head));
+	kmem_cache_free(cfq_icq_pool,
+			icq_to_cic(container_of(head, struct io_cq, rcu_head)));
 }
 
-static void cfq_cic_free(struct cfq_io_context *cic)
+static void cfq_icq_free(struct io_cq *icq)
 {
-	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
+	call_rcu(&icq->rcu_head, cfq_icq_free_rcu);
 }
 
-static void cfq_release_cic(struct cfq_io_context *cic)
+static void cfq_release_icq(struct io_cq *icq)
 {
-	struct io_context *ioc = cic->ioc;
+	struct io_context *ioc = icq->ioc;
 
-	radix_tree_delete(&ioc->radix_root, cic->q->id);
-	hlist_del(&cic->cic_list);
-	cfq_cic_free(cic);
+	radix_tree_delete(&ioc->icq_tree, icq->q->id);
+	hlist_del(&icq->ioc_node);
+	cfq_icq_free(icq);
 }
 
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2705,20 +2722,21 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_put_queue(cfqq);
 }
 
-static void cfq_exit_cic(struct cfq_io_context *cic)
+static void cfq_exit_icq(struct io_cq *icq)
 {
+	struct cfq_io_cq *cic = icq_to_cic(icq);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	struct io_context *ioc = cic->ioc;
+	struct io_context *ioc = icq->ioc;
 
-	list_del_init(&cic->queue_list);
+	list_del_init(&icq->q_node);
 
 	/*
-	 * Both setting lookup hint to and clearing it from @cic are done
-	 * under queue_lock.  If it's not pointing to @cic now, it never
+	 * Both setting lookup hint to and clearing it from @icq are done
+	 * under queue_lock.  If it's not pointing to @icq now, it never
 	 * will.  Hint assignment itself can race safely.
 	 */
-	if (rcu_dereference_raw(ioc->ioc_data) == cic)
-		rcu_assign_pointer(ioc->ioc_data, NULL);
+	if (rcu_dereference_raw(ioc->icq_hint) == icq)
+		rcu_assign_pointer(ioc->icq_hint, NULL);
 
 	if (cic->cfqq[BLK_RW_ASYNC]) {
 		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2731,19 +2749,18 @@ static void cfq_exit_cic(struct cfq_io_context *cic)
 	}
 }
 
-static struct cfq_io_context *
-cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
+static struct cfq_io_cq *cfq_alloc_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 
-	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
+	cic = kmem_cache_alloc_node(cfq_icq_pool, gfp_mask | __GFP_ZERO,
 							cfqd->queue->node);
 	if (cic) {
 		cic->ttime.last_end_request = jiffies;
-		INIT_LIST_HEAD(&cic->queue_list);
-		INIT_HLIST_NODE(&cic->cic_list);
-		cic->exit = cfq_exit_cic;
-		cic->release = cfq_release_cic;
+		INIT_LIST_HEAD(&cic->icq.q_node);
+		INIT_HLIST_NODE(&cic->icq.ioc_node);
+		cic->icq.exit = cfq_exit_icq;
+		cic->icq.release = cfq_release_icq;
 	}
 
 	return cic;
@@ -2791,7 +2808,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static void changed_ioprio(struct cfq_io_context *cic)
+static void changed_ioprio(struct cfq_io_cq *cic)
 {
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
@@ -2802,7 +2819,7 @@ static void changed_ioprio(struct cfq_io_context *cic)
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
+		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
 						GFP_ATOMIC);
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
@@ -2836,7 +2853,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct cfq_io_context *cic)
+static void changed_cgroup(struct cfq_io_cq *cic)
 {
 	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
@@ -2864,7 +2881,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
 		     struct io_context *ioc, gfp_t gfp_mask)
 {
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 	struct cfq_group *cfqg;
 
 retry:
@@ -2956,56 +2973,57 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 }
 
 /**
- * cfq_cic_lookup - lookup cfq_io_context
+ * cfq_cic_lookup - lookup cfq_io_cq
  * @cfqd: the associated cfq_data
  * @ioc: the associated io_context
  *
- * Look up cfq_io_context associated with @cfqd - @ioc pair.  Must be
- * called with queue_lock held.
+ * Look up cfq_io_cq associated with @cfqd - @ioc pair.  Must be called
+ * with queue_lock held.
  */
-static struct cfq_io_context *
+static struct cfq_io_cq *
 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
 	struct request_queue *q = cfqd->queue;
-	struct cfq_io_context *cic;
+	struct io_cq *icq;
 
 	lockdep_assert_held(cfqd->queue->queue_lock);
 	if (unlikely(!ioc))
 		return NULL;
 
 	/*
-	 * cic's are indexed from @ioc using radix tree and hint pointer,
+	 * icq's are indexed from @ioc using radix tree and hint pointer,
 	 * both of which are protected with RCU.  All removals are done
 	 * holding both q and ioc locks, and we're holding q lock - if we
-	 * find a cic which points to us, it's guaranteed to be valid.
+	 * find a icq which points to us, it's guaranteed to be valid.
 	 */
 	rcu_read_lock();
-	cic = rcu_dereference(ioc->ioc_data);
-	if (cic && cic->q == q)
+	icq = rcu_dereference(ioc->icq_hint);
+	if (icq && icq->q == q)
 		goto out;
 
-	cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
-	if (cic && cic->q == q)
-		rcu_assign_pointer(ioc->ioc_data, cic);	/* allowed to race */
+	icq = radix_tree_lookup(&ioc->icq_tree, cfqd->queue->id);
+	if (icq && icq->q == q)
+		rcu_assign_pointer(ioc->icq_hint, icq);	/* allowed to race */
 	else
-		cic = NULL;
+		icq = NULL;
 out:
 	rcu_read_unlock();
-	return cic;
+	return icq_to_cic(icq);
 }
 
 /**
- * cfq_create_cic - create and link a cfq_io_context
+ * cfq_create_cic - create and link a cfq_io_cq
  * @cfqd: cfqd of interest
  * @gfp_mask: allocation mask
  *
- * Make sure cfq_io_context linking %current->io_context and @cfqd exists.
- * If ioc and/or cic doesn't exist, they will be created using @gfp_mask.
+ * Make sure cfq_io_cq linking %current->io_context and @cfqd exists.  If
+ * ioc and/or cic doesn't exist, they will be created using @gfp_mask.
  */
 static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct request_queue *q = cfqd->queue;
-	struct cfq_io_context *cic = NULL;
+	struct io_cq *icq = NULL;
+	struct cfq_io_cq *cic;
 	struct io_context *ioc;
 	int ret = -ENOMEM;
 
@@ -3016,26 +3034,27 @@ static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (!ioc)
 		goto out;
 
-	cic = cfq_alloc_io_context(cfqd, gfp_mask);
+	cic = cfq_alloc_cic(cfqd, gfp_mask);
 	if (!cic)
 		goto out;
+	icq = &cic->icq;
 
 	ret = radix_tree_preload(gfp_mask);
 	if (ret)
 		goto out;
 
-	cic->ioc = ioc;
-	cic->q = cfqd->queue;
+	icq->ioc = ioc;
+	icq->q = cfqd->queue;
 
-	/* lock both q and ioc and try to link @cic */
+	/* lock both q and ioc and try to link @icq */
 	spin_lock_irq(q->queue_lock);
 	spin_lock(&ioc->lock);
 
-	ret = radix_tree_insert(&ioc->radix_root, q->id, cic);
+	ret = radix_tree_insert(&ioc->icq_tree, q->id, icq);
 	if (likely(!ret)) {
-		hlist_add_head(&cic->cic_list, &ioc->cic_list);
-		list_add(&cic->queue_list, &cfqd->cic_list);
-		cic = NULL;
+		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
+		list_add(&icq->q_node, &cfqd->icq_list);
+		icq = NULL;
 	} else if (ret == -EEXIST) {
 		/* someone else already did it */
 		ret = 0;
@@ -3047,29 +3066,28 @@ static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 	radix_tree_preload_end();
 out:
 	if (ret)
-		printk(KERN_ERR "cfq: cic link failed!\n");
-	if (cic)
-		cfq_cic_free(cic);
+		printk(KERN_ERR "cfq: icq link failed!\n");
+	if (icq)
+		cfq_icq_free(icq);
 	return ret;
 }
 
 /**
- * cfq_get_io_context - acquire cfq_io_context and bump refcnt on io_context
+ * cfq_get_cic - acquire cfq_io_cq and bump refcnt on io_context
  * @cfqd: cfqd to setup cic for
  * @gfp_mask: allocation mask
  *
- * Return cfq_io_context associating @cfqd and %current->io_context and
+ * Return cfq_io_cq associating @cfqd and %current->io_context and
  * bump refcnt on io_context.  If ioc or cic doesn't exist, they're created
  * using @gfp_mask.
  *
  * Must be called under queue_lock which may be released and re-acquired.
  * This function also may sleep depending on @gfp_mask.
  */
-static struct cfq_io_context *
-cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
+static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct request_queue *q = cfqd->queue;
-	struct cfq_io_context *cic = NULL;
+	struct cfq_io_cq *cic = NULL;
 	struct io_context *ioc;
 	int err;
 
@@ -3095,11 +3113,11 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	/* bump @ioc's refcnt and handle changed notifications */
 	get_io_context(ioc);
 
-	if (unlikely(cic->changed)) {
-		if (test_and_clear_bit(CIC_IOPRIO_CHANGED, &cic->changed))
+	if (unlikely(cic->icq.changed)) {
+		if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
 			changed_ioprio(cic);
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-		if (test_and_clear_bit(CIC_CGROUP_CHANGED, &cic->changed))
+		if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
 			changed_cgroup(cic);
 #endif
 	}
@@ -3120,7 +3138,7 @@ __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
 
 static void
 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-	struct cfq_io_context *cic)
+			struct cfq_io_cq *cic)
 {
 	if (cfq_cfqq_sync(cfqq)) {
 		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
@@ -3158,7 +3176,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
  */
 static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		       struct cfq_io_context *cic)
+		       struct cfq_io_cq *cic)
 {
 	int old_idle, enable_idle;
 
@@ -3175,8 +3193,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
 		enable_idle = 0;
-	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
+	else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
+		 !cfqd->cfq_slice_idle ||
+		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime.ttime_samples)) {
 		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
@@ -3308,7 +3327,7 @@ static void
 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		struct request *rq)
 {
-	struct cfq_io_context *cic = RQ_CIC(rq);
+	struct cfq_io_cq *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
 	if (rq->cmd_flags & REQ_PRIO)
@@ -3361,7 +3380,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_log_cfqq(cfqd, cfqq, "insert_request");
-	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
+	cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
 
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
@@ -3411,7 +3430,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
 
 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	struct cfq_io_context *cic = cfqd->active_cic;
+	struct cfq_io_cq *cic = cfqd->active_cic;
 
 	/* If the queue already has requests, don't wait */
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3548,7 +3567,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 	struct cfq_queue *cfqq;
 
 	/*
@@ -3563,7 +3582,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 
 	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
 	if (cfqq) {
-		cfq_init_prio_data(cfqq, cic->ioc);
+		cfq_init_prio_data(cfqq, cic->icq.ioc);
 
 		return __cfq_may_queue(cfqq);
 	}
@@ -3584,7 +3603,7 @@ static void cfq_put_request(struct request *rq)
 		BUG_ON(!cfqq->allocated[rw]);
 		cfqq->allocated[rw]--;
 
-		put_io_context(RQ_CIC(rq)->ioc, cfqq->cfqd->queue);
+		put_io_context(RQ_CIC(rq)->icq.ioc, cfqq->cfqd->queue);
 
 		rq->elevator_private[0] = NULL;
 		rq->elevator_private[1] = NULL;
@@ -3598,7 +3617,7 @@ static void cfq_put_request(struct request *rq)
 }
 
 static struct cfq_queue *
-cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
+cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
 		struct cfq_queue *cfqq)
 {
 	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
@@ -3613,7 +3632,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
  * was the last process referring to said cfqq.
  */
 static struct cfq_queue *
-split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
+split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
 {
 	if (cfqq_process_refs(cfqq) == 1) {
 		cfqq->pid = current->pid;
@@ -3636,7 +3655,7 @@ static int
 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_io_context *cic;
+	struct cfq_io_cq *cic;
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
@@ -3644,14 +3663,14 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	spin_lock_irq(q->queue_lock);
-	cic = cfq_get_io_context(cfqd, gfp_mask);
+	cic = cfq_get_cic(cfqd, gfp_mask);
 	if (!cic)
 		goto queue_fail;
 
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
@@ -3677,7 +3696,7 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
-	rq->elevator_private[0] = cic;
+	rq->elevator_private[0] = &cic->icq;
 	rq->elevator_private[1] = cfqq;
 	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
 	spin_unlock_irq(q->queue_lock);
@@ -3791,15 +3810,14 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	if (cfqd->active_queue)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
-	while (!list_empty(&cfqd->cic_list)) {
-		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
-							struct cfq_io_context,
-							queue_list);
-		struct io_context *ioc = cic->ioc;
+	while (!list_empty(&cfqd->icq_list)) {
+		struct io_cq *icq = list_entry(cfqd->icq_list.next,
+					       struct io_cq, q_node);
+		struct io_context *ioc = icq->ioc;
 
 		spin_lock(&ioc->lock);
-		cfq_exit_cic(cic);
-		cfq_release_cic(cic);
+		cfq_exit_icq(icq);
+		cfq_release_icq(icq);
 		spin_unlock(&ioc->lock);
 	}
 
@@ -3904,7 +3922,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->oom_cfqq.ref++;
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
 
-	INIT_LIST_HEAD(&cfqd->cic_list);
+	INIT_LIST_HEAD(&cfqd->icq_list);
 
 	cfqd->queue = q;
 
@@ -3942,8 +3960,8 @@ static void cfq_slab_kill(void)
 	 */
 	if (cfq_pool)
 		kmem_cache_destroy(cfq_pool);
-	if (cfq_ioc_pool)
-		kmem_cache_destroy(cfq_ioc_pool);
+	if (cfq_icq_pool)
+		kmem_cache_destroy(cfq_icq_pool);
 }
 
 static int __init cfq_slab_setup(void)
@@ -3952,8 +3970,8 @@ static int __init cfq_slab_setup(void)
 	if (!cfq_pool)
 		goto fail;
 
-	cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
-	if (!cfq_ioc_pool)
+	cfq_icq_pool = KMEM_CACHE(cfq_io_cq, 0);
+	if (!cfq_icq_pool)
 		goto fail;
 
 	return 0;
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index b2b75a54f252..d15ca6591f96 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -5,38 +5,23 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-struct cfq_queue;
-struct cfq_ttime {
-	unsigned long last_end_request;
-
-	unsigned long ttime_total;
-	unsigned long ttime_samples;
-	unsigned long ttime_mean;
-};
-
 enum {
-	CIC_IOPRIO_CHANGED,
-	CIC_CGROUP_CHANGED,
+	ICQ_IOPRIO_CHANGED,
+	ICQ_CGROUP_CHANGED,
 };
 
-struct cfq_io_context {
-	struct request_queue *q;
-
-	struct cfq_queue *cfqq[2];
-
-	struct io_context *ioc;
-
-	struct cfq_ttime ttime;
-
-	struct list_head queue_list;
-	struct hlist_node cic_list;
+struct io_cq {
+	struct request_queue	*q;
+	struct io_context	*ioc;
 
-	unsigned long changed;
+	struct list_head	q_node;
+	struct hlist_node	ioc_node;
 
-	void (*exit)(struct cfq_io_context *);
-	void (*release)(struct cfq_io_context *);
+	unsigned long		changed;
+	struct rcu_head		rcu_head;
 
-	struct rcu_head rcu_head;
+	void (*exit)(struct io_cq *);
+	void (*release)(struct io_cq *);
 };
 
 /*
@@ -58,9 +43,9 @@ struct io_context {
 	int nr_batch_requests;     /* Number of requests left in the batch */
 	unsigned long last_waited; /* Time last woken after wait for request */
 
-	struct radix_tree_root radix_root;
-	struct hlist_head cic_list;
-	void __rcu *ioc_data;
+	struct radix_tree_root	icq_tree;
+	struct io_cq __rcu	*icq_hint;
+	struct hlist_head	icq_list;
 
 	struct work_struct release_work;
 };
-- 
cgit v1.2.3


From a612fddf0d8090f2877305c9168b6c1a34fb5d90 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:41 +0100
Subject: block, cfq: move cfqd->icq_list to request_queue and add
 request->elv.icq

Most of icq management is about to be moved out of cfq into blk-ioc.
This patch prepares for it.

* Move cfqd->icq_list to request_queue->icq_list

* Make request explicitly point to icq instead of through elevator
  private data.  ->elevator_private[3] is replaced with sub struct elv
  which contains icq pointer and priv[2].  cfq is updated accordingly.

* Meaningless clearing of ->elevator_private[0] removed from
  elv_set_request().  At that point in code, the field was guaranteed
  to be %NULL anyway.

This patch doesn't introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  1 +
 block/cfq-iosched.c    | 28 +++++++++++-----------------
 block/elevator.c       |  2 --
 include/linux/blkdev.h | 10 ++++++++--
 4 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 6804fdf27eff..3c26c7f48703 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -497,6 +497,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
+	INIT_LIST_HEAD(&q->icq_list);
 	INIT_LIST_HEAD(&q->flush_queue[0]);
 	INIT_LIST_HEAD(&q->flush_queue[1]);
 	INIT_LIST_HEAD(&q->flush_data_in_flight);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d2f16fcdec7f..9bc5ecc1b336 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
 #define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
 #define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
 
-#define RQ_CIC(rq)		icq_to_cic((rq)->elevator_private[0])
-#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private[1])
-#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private[2])
+#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
+#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
+#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_icq_pool;
@@ -297,8 +297,6 @@ struct cfq_data {
 	unsigned int cfq_group_idle;
 	unsigned int cfq_latency;
 
-	struct list_head icq_list;
-
 	/*
 	 * Fallback dummy cfqq for extreme OOM conditions
 	 */
@@ -3053,7 +3051,7 @@ static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 	ret = radix_tree_insert(&ioc->icq_tree, q->id, icq);
 	if (likely(!ret)) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
-		list_add(&icq->q_node, &cfqd->icq_list);
+		list_add(&icq->q_node, &q->icq_list);
 		icq = NULL;
 	} else if (ret == -EEXIST) {
 		/* someone else already did it */
@@ -3605,12 +3603,10 @@ static void cfq_put_request(struct request *rq)
 
 		put_io_context(RQ_CIC(rq)->icq.ioc, cfqq->cfqd->queue);
 
-		rq->elevator_private[0] = NULL;
-		rq->elevator_private[1] = NULL;
-
 		/* Put down rq reference on cfqg */
 		cfq_put_cfqg(RQ_CFQG(rq));
-		rq->elevator_private[2] = NULL;
+		rq->elv.priv[0] = NULL;
+		rq->elv.priv[1] = NULL;
 
 		cfq_put_queue(cfqq);
 	}
@@ -3696,9 +3692,9 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
-	rq->elevator_private[0] = &cic->icq;
-	rq->elevator_private[1] = cfqq;
-	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
+	rq->elv.icq = &cic->icq;
+	rq->elv.priv[0] = cfqq;
+	rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
 
@@ -3810,8 +3806,8 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	if (cfqd->active_queue)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
-	while (!list_empty(&cfqd->icq_list)) {
-		struct io_cq *icq = list_entry(cfqd->icq_list.next,
+	while (!list_empty(&q->icq_list)) {
+		struct io_cq *icq = list_entry(q->icq_list.next,
 					       struct io_cq, q_node);
 		struct io_context *ioc = icq->ioc;
 
@@ -3922,8 +3918,6 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->oom_cfqq.ref++;
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
 
-	INIT_LIST_HEAD(&cfqd->icq_list);
-
 	cfqd->queue = q;
 
 	init_timer(&cfqd->idle_slice_timer);
diff --git a/block/elevator.c b/block/elevator.c
index 31ffe76aed3d..c5c6214829cb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -745,8 +745,6 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 
 	if (e->type->ops.elevator_set_req_fn)
 		return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
-
-	rq->elevator_private[0] = NULL;
 	return 0;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 65c2f8c70089..8bca04873f53 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -111,10 +111,14 @@ struct request {
 	 * Three pointers are available for the IO schedulers, if they need
 	 * more they have to dynamically allocate it.  Flush requests are
 	 * never put on the IO scheduler. So let the flush fields share
-	 * space with the three elevator_private pointers.
+	 * space with the elevator data.
 	 */
 	union {
-		void *elevator_private[3];
+		struct {
+			struct io_cq		*icq;
+			void			*priv[2];
+		} elv;
+
 		struct {
 			unsigned int		seq;
 			struct list_head	list;
@@ -357,6 +361,8 @@ struct request_queue {
 	struct timer_list	timeout;
 	struct list_head	timeout_list;
 
+	struct list_head	icq_list;
+
 	struct queue_limits	limits;
 
 	/*
-- 
cgit v1.2.3


From 47fdd4ca96bf4b28ac4d05d7a6e382df31d3d758 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:42 +0100
Subject: block, cfq: move io_cq lookup to blk-ioc.c

Now that all io_cq related data structures are in block core layer,
io_cq lookup can be moved from cfq-iosched.c to blk-ioc.c.

Lookup logic from cfq_cic_lookup() is moved to ioc_lookup_icq() with
parameter return type changes (cfqd -> request_queue, cfq_io_cq ->
io_cq) and cfq_cic_lookup() becomes thin wrapper around
cfq_cic_lookup().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c     | 36 ++++++++++++++++++++++++++++++++++++
 block/blk.h         |  1 +
 block/cfq-iosched.c | 48 ++++++++----------------------------------------
 3 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index dc5e69d335a0..87ecc98b8ade 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -266,6 +266,42 @@ struct io_context *get_task_io_context(struct task_struct *task,
 }
 EXPORT_SYMBOL(get_task_io_context);
 
+/**
+ * ioc_lookup_icq - lookup io_cq from ioc
+ * @ioc: the associated io_context
+ * @q: the associated request_queue
+ *
+ * Look up io_cq associated with @ioc - @q pair from @ioc.  Must be called
+ * with @q->queue_lock held.
+ */
+struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
+{
+	struct io_cq *icq;
+
+	lockdep_assert_held(q->queue_lock);
+
+	/*
+	 * icq's are indexed from @ioc using radix tree and hint pointer,
+	 * both of which are protected with RCU.  All removals are done
+	 * holding both q and ioc locks, and we're holding q lock - if we
+	 * find a icq which points to us, it's guaranteed to be valid.
+	 */
+	rcu_read_lock();
+	icq = rcu_dereference(ioc->icq_hint);
+	if (icq && icq->q == q)
+		goto out;
+
+	icq = radix_tree_lookup(&ioc->icq_tree, q->id);
+	if (icq && icq->q == q)
+		rcu_assign_pointer(ioc->icq_hint, icq);	/* allowed to race */
+	else
+		icq = NULL;
+out:
+	rcu_read_unlock();
+	return icq;
+}
+EXPORT_SYMBOL(ioc_lookup_icq);
+
 void ioc_set_changed(struct io_context *ioc, int which)
 {
 	struct io_cq *icq;
diff --git a/block/blk.h b/block/blk.h
index 4943770e0792..3c510a4b5054 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -199,6 +199,7 @@ static inline int blk_do_io_stat(struct request *rq)
  * Internal io_context interface
  */
 void get_io_context(struct io_context *ioc);
+struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
 
 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
 				int node);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9bc5ecc1b336..048fa699adf9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -468,7 +468,6 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
 				       struct io_context *, gfp_t);
-static struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *, struct io_context *);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -476,6 +475,14 @@ static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 	return container_of(icq, struct cfq_io_cq, icq);
 }
 
+static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
+					       struct io_context *ioc)
+{
+	if (ioc)
+		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
+	return NULL;
+}
+
 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 {
 	return cic->cfqq[is_sync];
@@ -2970,45 +2977,6 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	return cfqq;
 }
 
-/**
- * cfq_cic_lookup - lookup cfq_io_cq
- * @cfqd: the associated cfq_data
- * @ioc: the associated io_context
- *
- * Look up cfq_io_cq associated with @cfqd - @ioc pair.  Must be called
- * with queue_lock held.
- */
-static struct cfq_io_cq *
-cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
-{
-	struct request_queue *q = cfqd->queue;
-	struct io_cq *icq;
-
-	lockdep_assert_held(cfqd->queue->queue_lock);
-	if (unlikely(!ioc))
-		return NULL;
-
-	/*
-	 * icq's are indexed from @ioc using radix tree and hint pointer,
-	 * both of which are protected with RCU.  All removals are done
-	 * holding both q and ioc locks, and we're holding q lock - if we
-	 * find a icq which points to us, it's guaranteed to be valid.
-	 */
-	rcu_read_lock();
-	icq = rcu_dereference(ioc->icq_hint);
-	if (icq && icq->q == q)
-		goto out;
-
-	icq = radix_tree_lookup(&ioc->icq_tree, cfqd->queue->id);
-	if (icq && icq->q == q)
-		rcu_assign_pointer(ioc->icq_hint, icq);	/* allowed to race */
-	else
-		icq = NULL;
-out:
-	rcu_read_unlock();
-	return icq_to_cic(icq);
-}
-
 /**
  * cfq_create_cic - create and link a cfq_io_cq
  * @cfqd: cfqd of interest
-- 
cgit v1.2.3


From 3d3c2379feb177a5fd55bb0ed76776dc9d4f3243 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:42 +0100
Subject: block, cfq: move icq cache management to block core

Let elevators set ->icq_size and ->icq_align in elevator_type and
elv_register() and elv_unregister() respectively create and destroy
kmem_cache for icq.

* elv_register() now can return failure.  All callers updated.

* icq caches are automatically named "ELVNAME_io_cq".

* cfq_slab_setup/kill() are collapsed into cfq_init/exit().

* While at it, minor indentation change for iosched_cfq.elevator_name
  for consistency.

This will help moving icq management to block core.  This doesn't
introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c      | 48 +++++++++++++++---------------------------------
 block/deadline-iosched.c |  4 +---
 block/elevator.c         | 37 +++++++++++++++++++++++++++++++++++--
 block/noop-iosched.c     |  4 +---
 include/linux/elevator.h | 11 ++++++++++-
 5 files changed, 62 insertions(+), 42 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 048fa699adf9..06e59abcb57f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3914,34 +3914,6 @@ static void *cfq_init_queue(struct request_queue *q)
 	return cfqd;
 }
 
-static void cfq_slab_kill(void)
-{
-	/*
-	 * Caller already ensured that pending RCU callbacks are completed,
-	 * so we should have no busy allocations at this point.
-	 */
-	if (cfq_pool)
-		kmem_cache_destroy(cfq_pool);
-	if (cfq_icq_pool)
-		kmem_cache_destroy(cfq_icq_pool);
-}
-
-static int __init cfq_slab_setup(void)
-{
-	cfq_pool = KMEM_CACHE(cfq_queue, 0);
-	if (!cfq_pool)
-		goto fail;
-
-	cfq_icq_pool = KMEM_CACHE(cfq_io_cq, 0);
-	if (!cfq_icq_pool)
-		goto fail;
-
-	return 0;
-fail:
-	cfq_slab_kill();
-	return -ENOMEM;
-}
-
 /*
  * sysfs parts below -->
  */
@@ -4053,8 +4025,10 @@ static struct elevator_type iosched_cfq = {
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
 	},
+	.icq_size	=	sizeof(struct cfq_io_cq),
+	.icq_align	=	__alignof__(struct cfq_io_cq),
 	.elevator_attrs =	cfq_attrs,
-	.elevator_name =	"cfq",
+	.elevator_name	=	"cfq",
 	.elevator_owner =	THIS_MODULE,
 };
 
@@ -4072,6 +4046,8 @@ static struct blkio_policy_type blkio_policy_cfq;
 
 static int __init cfq_init(void)
 {
+	int ret;
+
 	/*
 	 * could be 0 on HZ < 1000 setups
 	 */
@@ -4086,10 +4062,17 @@ static int __init cfq_init(void)
 #else
 		cfq_group_idle = 0;
 #endif
-	if (cfq_slab_setup())
+	cfq_pool = KMEM_CACHE(cfq_queue, 0);
+	if (!cfq_pool)
 		return -ENOMEM;
 
-	elv_register(&iosched_cfq);
+	ret = elv_register(&iosched_cfq);
+	if (ret) {
+		kmem_cache_destroy(cfq_pool);
+		return ret;
+	}
+	cfq_icq_pool = iosched_cfq.icq_cache;
+
 	blkio_policy_register(&blkio_policy_cfq);
 
 	return 0;
@@ -4099,8 +4082,7 @@ static void __exit cfq_exit(void)
 {
 	blkio_policy_unregister(&blkio_policy_cfq);
 	elv_unregister(&iosched_cfq);
-	rcu_barrier();	/* make sure all cic RCU frees are complete */
-	cfq_slab_kill();
+	kmem_cache_destroy(cfq_pool);
 }
 
 module_init(cfq_init);
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index c644137d9cd6..7bf12d793fcd 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -448,9 +448,7 @@ static struct elevator_type iosched_deadline = {
 
 static int __init deadline_init(void)
 {
-	elv_register(&iosched_deadline);
-
-	return 0;
+	return elv_register(&iosched_deadline);
 }
 
 static void __exit deadline_exit(void)
diff --git a/block/elevator.c b/block/elevator.c
index c5c6214829cb..cca049fb45c8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -886,15 +886,36 @@ void elv_unregister_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(elv_unregister_queue);
 
-void elv_register(struct elevator_type *e)
+int elv_register(struct elevator_type *e)
 {
 	char *def = "";
 
+	/* create icq_cache if requested */
+	if (e->icq_size) {
+		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
+		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
+			return -EINVAL;
+
+		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
+			 "%s_io_cq", e->elevator_name);
+		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
+						 e->icq_align, 0, NULL);
+		if (!e->icq_cache)
+			return -ENOMEM;
+	}
+
+	/* register, don't allow duplicate names */
 	spin_lock(&elv_list_lock);
-	BUG_ON(elevator_find(e->elevator_name));
+	if (elevator_find(e->elevator_name)) {
+		spin_unlock(&elv_list_lock);
+		if (e->icq_cache)
+			kmem_cache_destroy(e->icq_cache);
+		return -EBUSY;
+	}
 	list_add_tail(&e->list, &elv_list);
 	spin_unlock(&elv_list_lock);
 
+	/* print pretty message */
 	if (!strcmp(e->elevator_name, chosen_elevator) ||
 			(!*chosen_elevator &&
 			 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
@@ -902,14 +923,26 @@ void elv_register(struct elevator_type *e)
 
 	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
 								def);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(elv_register);
 
 void elv_unregister(struct elevator_type *e)
 {
+	/* unregister */
 	spin_lock(&elv_list_lock);
 	list_del_init(&e->list);
 	spin_unlock(&elv_list_lock);
+
+	/*
+	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
+	 * sure all RCU operations are complete before proceeding.
+	 */
+	if (e->icq_cache) {
+		rcu_barrier();
+		kmem_cache_destroy(e->icq_cache);
+		e->icq_cache = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 06389e9ef96d..413a0b1d788c 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -94,9 +94,7 @@ static struct elevator_type elevator_noop = {
 
 static int __init noop_init(void)
 {
-	elv_register(&elevator_noop);
-
-	return 0;
+	return elv_register(&elevator_noop);
 }
 
 static void __exit noop_exit(void)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 04958ef53e62..d3d3e28cbfd4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -78,10 +78,19 @@ struct elv_fs_entry {
  */
 struct elevator_type
 {
+	/* managed by elevator core */
+	struct kmem_cache *icq_cache;
+
+	/* fields provided by elevator implementation */
 	struct elevator_ops ops;
+	size_t icq_size;
+	size_t icq_align;
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
+
+	/* managed by elevator core */
+	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
 	struct list_head list;
 };
 
@@ -127,7 +136,7 @@ extern void elv_drain_elevator(struct request_queue *);
 /*
  * io scheduler registration
  */
-extern void elv_register(struct elevator_type *);
+extern int elv_register(struct elevator_type *);
 extern void elv_unregister(struct elevator_type *);
 
 /*
-- 
cgit v1.2.3


From 7e5a8794492e43e9eebb68a98a23be055888ccd0 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:42 +0100
Subject: block, cfq: move io_cq exit/release to blk-ioc.c

With kmem_cache managed by blk-ioc, io_cq exit/release can be moved to
blk-ioc too.  The odd ->io_cq->exit/release() callbacks are replaced
with elevator_ops->elevator_exit_icq_fn() with unlinking from both ioc
and q, and freeing automatically handled by blk-ioc.  The elevator
operation only need to perform exit operation specific to the elevator
- in cfq's case, exiting the cfqq's.

Also, clearing of io_cq's on q detach is moved to block core and
automatically performed on elevator switch and q release.

Because the q io_cq points to might be freed before RCU callback for
the io_cq runs, blk-ioc code should remember to which cache the io_cq
needs to be freed when the io_cq is released.  New field
io_cq->__rcu_icq_cache is added for this purpose.  As both the new
field and rcu_head are used only after io_cq is released and the
q/ioc_node fields aren't, they are put into unions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c           | 76 ++++++++++++++++++++++++++++++++++++++++++-----
 block/blk-sysfs.c         |  6 +++-
 block/blk.h               |  1 +
 block/cfq-iosched.c       | 47 ++---------------------------
 block/elevator.c          |  3 +-
 include/linux/elevator.h  |  5 ++++
 include/linux/iocontext.h | 20 +++++++++----
 7 files changed, 97 insertions(+), 61 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 87ecc98b8ade..0910a5584d38 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -44,6 +44,51 @@ EXPORT_SYMBOL(get_io_context);
 #define ioc_release_depth_dec(q)	do { } while (0)
 #endif
 
+static void icq_free_icq_rcu(struct rcu_head *head)
+{
+	struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
+
+	kmem_cache_free(icq->__rcu_icq_cache, icq);
+}
+
+/*
+ * Exit and free an icq.  Called with both ioc and q locked.
+ */
+static void ioc_exit_icq(struct io_cq *icq)
+{
+	struct io_context *ioc = icq->ioc;
+	struct request_queue *q = icq->q;
+	struct elevator_type *et = q->elevator->type;
+
+	lockdep_assert_held(&ioc->lock);
+	lockdep_assert_held(q->queue_lock);
+
+	radix_tree_delete(&ioc->icq_tree, icq->q->id);
+	hlist_del_init(&icq->ioc_node);
+	list_del_init(&icq->q_node);
+
+	/*
+	 * Both setting lookup hint to and clearing it from @icq are done
+	 * under queue_lock.  If it's not pointing to @icq now, it never
+	 * will.  Hint assignment itself can race safely.
+	 */
+	if (rcu_dereference_raw(ioc->icq_hint) == icq)
+		rcu_assign_pointer(ioc->icq_hint, NULL);
+
+	if (et->ops.elevator_exit_icq_fn) {
+		ioc_release_depth_inc(q);
+		et->ops.elevator_exit_icq_fn(icq);
+		ioc_release_depth_dec(q);
+	}
+
+	/*
+	 * @icq->q might have gone away by the time RCU callback runs
+	 * making it impossible to determine icq_cache.  Record it in @icq.
+	 */
+	icq->__rcu_icq_cache = et->icq_cache;
+	call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
+}
+
 /*
  * Slow path for ioc release in put_io_context().  Performs double-lock
  * dancing to unlink all icq's and then frees ioc.
@@ -87,10 +132,7 @@ static void ioc_release_fn(struct work_struct *work)
 			spin_lock(&ioc->lock);
 			continue;
 		}
-		ioc_release_depth_inc(this_q);
-		icq->exit(icq);
-		icq->release(icq);
-		ioc_release_depth_dec(this_q);
+		ioc_exit_icq(icq);
 	}
 
 	if (last_q) {
@@ -167,10 +209,7 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 			last_q = this_q;
 			continue;
 		}
-		ioc_release_depth_inc(this_q);
-		icq->exit(icq);
-		icq->release(icq);
-		ioc_release_depth_dec(this_q);
+		ioc_exit_icq(icq);
 	}
 
 	if (last_q && last_q != locked_q)
@@ -203,6 +242,27 @@ void exit_io_context(struct task_struct *task)
 	put_io_context(ioc, NULL);
 }
 
+/**
+ * ioc_clear_queue - break any ioc association with the specified queue
+ * @q: request_queue being cleared
+ *
+ * Walk @q->icq_list and exit all io_cq's.  Must be called with @q locked.
+ */
+void ioc_clear_queue(struct request_queue *q)
+{
+	lockdep_assert_held(q->queue_lock);
+
+	while (!list_empty(&q->icq_list)) {
+		struct io_cq *icq = list_entry(q->icq_list.next,
+					       struct io_cq, q_node);
+		struct io_context *ioc = icq->ioc;
+
+		spin_lock(&ioc->lock);
+		ioc_exit_icq(icq);
+		spin_unlock(&ioc->lock);
+	}
+}
+
 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 				int node)
 {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5b4b4ab5e785..cf150011d808 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -479,8 +479,12 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_sync_queue(q);
 
-	if (q->elevator)
+	if (q->elevator) {
+		spin_lock_irq(q->queue_lock);
+		ioc_clear_queue(q);
+		spin_unlock_irq(q->queue_lock);
 		elevator_exit(q->elevator);
+	}
 
 	blk_throtl_exit(q);
 
diff --git a/block/blk.h b/block/blk.h
index 3c510a4b5054..ed4d9bf2ab16 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -200,6 +200,7 @@ static inline int blk_do_io_stat(struct request *rq)
  */
 void get_io_context(struct io_context *ioc);
 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
+void ioc_clear_queue(struct request_queue *q);
 
 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
 				int node);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 06e59abcb57f..f6d315551496 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2674,26 +2674,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	cfq_put_cfqg(cfqg);
 }
 
-static void cfq_icq_free_rcu(struct rcu_head *head)
-{
-	kmem_cache_free(cfq_icq_pool,
-			icq_to_cic(container_of(head, struct io_cq, rcu_head)));
-}
-
-static void cfq_icq_free(struct io_cq *icq)
-{
-	call_rcu(&icq->rcu_head, cfq_icq_free_rcu);
-}
-
-static void cfq_release_icq(struct io_cq *icq)
-{
-	struct io_context *ioc = icq->ioc;
-
-	radix_tree_delete(&ioc->icq_tree, icq->q->id);
-	hlist_del(&icq->ioc_node);
-	cfq_icq_free(icq);
-}
-
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
 {
 	struct cfq_queue *__cfqq, *next;
@@ -2731,17 +2711,6 @@ static void cfq_exit_icq(struct io_cq *icq)
 {
 	struct cfq_io_cq *cic = icq_to_cic(icq);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	struct io_context *ioc = icq->ioc;
-
-	list_del_init(&icq->q_node);
-
-	/*
-	 * Both setting lookup hint to and clearing it from @icq are done
-	 * under queue_lock.  If it's not pointing to @icq now, it never
-	 * will.  Hint assignment itself can race safely.
-	 */
-	if (rcu_dereference_raw(ioc->icq_hint) == icq)
-		rcu_assign_pointer(ioc->icq_hint, NULL);
 
 	if (cic->cfqq[BLK_RW_ASYNC]) {
 		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2764,8 +2733,6 @@ static struct cfq_io_cq *cfq_alloc_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 		cic->ttime.last_end_request = jiffies;
 		INIT_LIST_HEAD(&cic->icq.q_node);
 		INIT_HLIST_NODE(&cic->icq.ioc_node);
-		cic->icq.exit = cfq_exit_icq;
-		cic->icq.release = cfq_release_icq;
 	}
 
 	return cic;
@@ -3034,7 +3001,7 @@ out:
 	if (ret)
 		printk(KERN_ERR "cfq: icq link failed!\n");
 	if (icq)
-		cfq_icq_free(icq);
+		kmem_cache_free(cfq_icq_pool, icq);
 	return ret;
 }
 
@@ -3774,17 +3741,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
 	if (cfqd->active_queue)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
-	while (!list_empty(&q->icq_list)) {
-		struct io_cq *icq = list_entry(q->icq_list.next,
-					       struct io_cq, q_node);
-		struct io_context *ioc = icq->ioc;
-
-		spin_lock(&ioc->lock);
-		cfq_exit_icq(icq);
-		cfq_release_icq(icq);
-		spin_unlock(&ioc->lock);
-	}
-
 	cfq_put_async_queues(cfqd);
 	cfq_release_cfq_groups(cfqd);
 
@@ -4019,6 +3975,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_completed_req_fn =	cfq_completed_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
+		.elevator_exit_icq_fn =		cfq_exit_icq,
 		.elevator_set_req_fn =		cfq_set_request,
 		.elevator_put_req_fn =		cfq_put_request,
 		.elevator_may_queue_fn =	cfq_may_queue,
diff --git a/block/elevator.c b/block/elevator.c
index cca049fb45c8..91e18f8af9be 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -979,8 +979,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 			goto fail_register;
 	}
 
-	/* done, replace the old one with new one and turn off BYPASS */
+	/* done, clear io_cq's, switch elevators and turn off BYPASS */
 	spin_lock_irq(q->queue_lock);
+	ioc_clear_queue(q);
 	old_elevator = q->elevator;
 	q->elevator = e;
 	spin_unlock_irq(q->queue_lock);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index d3d3e28cbfd4..06e4dd568717 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -5,6 +5,8 @@
 
 #ifdef CONFIG_BLOCK
 
+struct io_cq;
+
 typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
 				 struct bio *);
 
@@ -24,6 +26,7 @@ typedef struct request *(elevator_request_list_fn) (struct request_queue *, stru
 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
 
+typedef void (elevator_exit_icq_fn) (struct io_cq *);
 typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
 typedef void (elevator_put_req_fn) (struct request *);
 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
@@ -56,6 +59,8 @@ struct elevator_ops
 	elevator_request_list_fn *elevator_former_req_fn;
 	elevator_request_list_fn *elevator_latter_req_fn;
 
+	elevator_exit_icq_fn *elevator_exit_icq_fn;
+
 	elevator_set_req_fn *elevator_set_req_fn;
 	elevator_put_req_fn *elevator_put_req_fn;
 
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index d15ca6591f96..ac390a34c0e7 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -14,14 +14,22 @@ struct io_cq {
 	struct request_queue	*q;
 	struct io_context	*ioc;
 
-	struct list_head	q_node;
-	struct hlist_node	ioc_node;
+	/*
+	 * q_node and ioc_node link io_cq through icq_list of q and ioc
+	 * respectively.  Both fields are unused once ioc_exit_icq() is
+	 * called and shared with __rcu_icq_cache and __rcu_head which are
+	 * used for RCU free of io_cq.
+	 */
+	union {
+		struct list_head	q_node;
+		struct kmem_cache	*__rcu_icq_cache;
+	};
+	union {
+		struct hlist_node	ioc_node;
+		struct rcu_head		__rcu_head;
+	};
 
 	unsigned long		changed;
-	struct rcu_head		rcu_head;
-
-	void (*exit)(struct io_cq *);
-	void (*release)(struct io_cq *);
 };
 
 /*
-- 
cgit v1.2.3


From 9b84cacd013996f244d85b3d873287c2a8f88658 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:42 +0100
Subject: block, cfq: restructure io_cq creation path for io_context interface
 cleanup

Add elevator_ops->elevator_init_icq_fn() and restructure
cfq_create_cic() and rename it to ioc_create_icq().

The new function expects its caller to pass in io_context, uses
elevator_type->icq_cache, handles generic init, calls the new elevator
operation for elevator specific initialization, and returns pointer to
created or looked up icq.  This leaves cfq_icq_pool variable without
any user.  Removed.

This prepares for io_context interface cleanup and doesn't introduce
any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c      | 94 +++++++++++++++++++++---------------------------
 include/linux/elevator.h |  2 ++
 2 files changed, 43 insertions(+), 53 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f6d315551496..11f49d036845 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -59,7 +59,6 @@ static const int cfq_hist_divisor = 4;
 #define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
 
 static struct kmem_cache *cfq_pool;
-static struct kmem_cache *cfq_icq_pool;
 
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -2707,6 +2706,13 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_put_queue(cfqq);
 }
 
+static void cfq_init_icq(struct io_cq *icq)
+{
+	struct cfq_io_cq *cic = icq_to_cic(icq);
+
+	cic->ttime.last_end_request = jiffies;
+}
+
 static void cfq_exit_icq(struct io_cq *icq)
 {
 	struct cfq_io_cq *cic = icq_to_cic(icq);
@@ -2723,21 +2729,6 @@ static void cfq_exit_icq(struct io_cq *icq)
 	}
 }
 
-static struct cfq_io_cq *cfq_alloc_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
-	struct cfq_io_cq *cic;
-
-	cic = kmem_cache_alloc_node(cfq_icq_pool, gfp_mask | __GFP_ZERO,
-							cfqd->queue->node);
-	if (cic) {
-		cic->ttime.last_end_request = jiffies;
-		INIT_LIST_HEAD(&cic->icq.q_node);
-		INIT_HLIST_NODE(&cic->icq.ioc_node);
-	}
-
-	return cic;
-}
-
 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 {
 	struct task_struct *tsk = current;
@@ -2945,64 +2936,62 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 }
 
 /**
- * cfq_create_cic - create and link a cfq_io_cq
- * @cfqd: cfqd of interest
+ * ioc_create_icq - create and link io_cq
+ * @q: request_queue of interest
  * @gfp_mask: allocation mask
  *
- * Make sure cfq_io_cq linking %current->io_context and @cfqd exists.  If
- * ioc and/or cic doesn't exist, they will be created using @gfp_mask.
+ * Make sure io_cq linking %current->io_context and @q exists.  If either
+ * io_context and/or icq don't exist, they will be created using @gfp_mask.
+ *
+ * The caller is responsible for ensuring @ioc won't go away and @q is
+ * alive and will stay alive until this function returns.
  */
-static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
+static struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
 {
-	struct request_queue *q = cfqd->queue;
-	struct io_cq *icq = NULL;
-	struct cfq_io_cq *cic;
+	struct elevator_type *et = q->elevator->type;
 	struct io_context *ioc;
-	int ret = -ENOMEM;
-
-	might_sleep_if(gfp_mask & __GFP_WAIT);
+	struct io_cq *icq;
 
 	/* allocate stuff */
 	ioc = create_io_context(current, gfp_mask, q->node);
 	if (!ioc)
-		goto out;
+		return NULL;
 
-	cic = cfq_alloc_cic(cfqd, gfp_mask);
-	if (!cic)
-		goto out;
-	icq = &cic->icq;
+	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
+				    q->node);
+	if (!icq)
+		return NULL;
 
-	ret = radix_tree_preload(gfp_mask);
-	if (ret)
-		goto out;
+	if (radix_tree_preload(gfp_mask) < 0) {
+		kmem_cache_free(et->icq_cache, icq);
+		return NULL;
+	}
 
 	icq->ioc = ioc;
-	icq->q = cfqd->queue;
+	icq->q = q;
+	INIT_LIST_HEAD(&icq->q_node);
+	INIT_HLIST_NODE(&icq->ioc_node);
 
 	/* lock both q and ioc and try to link @icq */
 	spin_lock_irq(q->queue_lock);
 	spin_lock(&ioc->lock);
 
-	ret = radix_tree_insert(&ioc->icq_tree, q->id, icq);
-	if (likely(!ret)) {
+	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		icq = NULL;
-	} else if (ret == -EEXIST) {
-		/* someone else already did it */
-		ret = 0;
+		if (et->ops.elevator_init_icq_fn)
+			et->ops.elevator_init_icq_fn(icq);
+	} else {
+		kmem_cache_free(et->icq_cache, icq);
+		icq = ioc_lookup_icq(ioc, q);
+		if (!icq)
+			printk(KERN_ERR "cfq: icq link failed!\n");
 	}
 
 	spin_unlock(&ioc->lock);
 	spin_unlock_irq(q->queue_lock);
-
 	radix_tree_preload_end();
-out:
-	if (ret)
-		printk(KERN_ERR "cfq: icq link failed!\n");
-	if (icq)
-		kmem_cache_free(cfq_icq_pool, icq);
-	return ret;
+	return icq;
 }
 
 /**
@@ -3022,7 +3011,6 @@ static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 	struct request_queue *q = cfqd->queue;
 	struct cfq_io_cq *cic = NULL;
 	struct io_context *ioc;
-	int err;
 
 	lockdep_assert_held(q->queue_lock);
 
@@ -3037,9 +3025,9 @@ static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 
 		/* slow path - unlock, create missing ones and retry */
 		spin_unlock_irq(q->queue_lock);
-		err = cfq_create_cic(cfqd, gfp_mask);
+		cic = icq_to_cic(ioc_create_icq(q, gfp_mask));
 		spin_lock_irq(q->queue_lock);
-		if (err)
+		if (!cic)
 			return NULL;
 	}
 
@@ -3975,6 +3963,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_completed_req_fn =	cfq_completed_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
+		.elevator_init_icq_fn =		cfq_init_icq,
 		.elevator_exit_icq_fn =		cfq_exit_icq,
 		.elevator_set_req_fn =		cfq_set_request,
 		.elevator_put_req_fn =		cfq_put_request,
@@ -4028,7 +4017,6 @@ static int __init cfq_init(void)
 		kmem_cache_destroy(cfq_pool);
 		return ret;
 	}
-	cfq_icq_pool = iosched_cfq.icq_cache;
 
 	blkio_policy_register(&blkio_policy_cfq);
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 06e4dd568717..c8f1e67a8ebe 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -26,6 +26,7 @@ typedef struct request *(elevator_request_list_fn) (struct request_queue *, stru
 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
 
+typedef void (elevator_init_icq_fn) (struct io_cq *);
 typedef void (elevator_exit_icq_fn) (struct io_cq *);
 typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
 typedef void (elevator_put_req_fn) (struct request *);
@@ -59,6 +60,7 @@ struct elevator_ops
 	elevator_request_list_fn *elevator_former_req_fn;
 	elevator_request_list_fn *elevator_latter_req_fn;
 
+	elevator_init_icq_fn *elevator_init_icq_fn;
 	elevator_exit_icq_fn *elevator_exit_icq_fn;
 
 	elevator_set_req_fn *elevator_set_req_fn;
-- 
cgit v1.2.3


From f1f8cc94651738b418ba54c039df536303b91704 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 14 Dec 2011 00:33:42 +0100
Subject: block, cfq: move icq creation and rq->elv.icq association to block
 core

Now block layer knows everything necessary to create and associate
icq's with requests.  Move ioc_create_icq() to blk-ioc.c and update
get_request() such that, if elevator_type->icq_size is set, requests
are automatically associated with their matching icq's before
elv_set_request().  io_context reference is also managed by block core
on request alloc/free.

* Only ioprio/cgroup changed handling remains from cfq_get_cic().
  Collapsed into cfq_set_request().

* This removes queue kicking on icq allocation failure (for now).  As
  icq allocation failure is rare and the only effect of queue kicking
  achieved was possibily accelerating queue processing, this change
  shouldn't be noticeable.

  There is a larger underlying problem.  Unlike request allocation,
  icq allocation is not guaranteed to succeed eventually after
  retries.  The number of icq is unbound and thus mempool can't be the
  solution either.  This effectively adds allocation dependency on
  memory free path and thus possibility of deadlock.

  This usually wouldn't happen because icq allocation is not a hot
  path and, even when the condition triggers, it's highly unlikely
  that none of the writeback workers already has icq.

  However, this is still possible especially if elevator is being
  switched under high memory pressure, so we better get it fixed.
  Probably the only solution is just bypassing elevator and appending
  to dispatch queue on any elevator allocation failure.

* Comment added to explain how icq's are managed and synchronized.

This completes cleanup of io_context interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          |  46 +++++++++++++---
 block/blk-ioc.c           |  60 ++++++++++++++++++++-
 block/blk.h               |   1 +
 block/cfq-iosched.c       | 135 ++++------------------------------------------
 include/linux/elevator.h  |   8 +--
 include/linux/iocontext.h |  59 ++++++++++++++++++++
 6 files changed, 173 insertions(+), 136 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3c26c7f48703..8fbdac7010bb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -640,13 +640,18 @@ EXPORT_SYMBOL(blk_get_queue);
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
-	if (rq->cmd_flags & REQ_ELVPRIV)
+	if (rq->cmd_flags & REQ_ELVPRIV) {
 		elv_put_request(q, rq);
+		if (rq->elv.icq)
+			put_io_context(rq->elv.icq->ioc, q);
+	}
+
 	mempool_free(rq, q->rq.rq_pool);
 }
 
 static struct request *
-blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
+blk_alloc_request(struct request_queue *q, struct io_cq *icq,
+		  unsigned int flags, gfp_t gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -657,10 +662,15 @@ blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
 
 	rq->cmd_flags = flags | REQ_ALLOCED;
 
-	if ((flags & REQ_ELVPRIV) &&
-	    unlikely(elv_set_request(q, rq, gfp_mask))) {
-		mempool_free(rq, q->rq.rq_pool);
-		return NULL;
+	if (flags & REQ_ELVPRIV) {
+		rq->elv.icq = icq;
+		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
+			mempool_free(rq, q->rq.rq_pool);
+			return NULL;
+		}
+		/* @rq->elv.icq holds on to io_context until @rq is freed */
+		if (icq)
+			get_io_context(icq->ioc);
 	}
 
 	return rq;
@@ -772,11 +782,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
+	struct elevator_type *et;
 	struct io_context *ioc;
+	struct io_cq *icq = NULL;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	bool retried = false;
 	int may_queue;
 retry:
+	et = q->elevator->type;
 	ioc = current->io_context;
 
 	if (unlikely(blk_queue_dead(q)))
@@ -837,17 +850,36 @@ retry:
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
 
+	/*
+	 * Decide whether the new request will be managed by elevator.  If
+	 * so, mark @rw_flags and increment elvpriv.  Non-zero elvpriv will
+	 * prevent the current elevator from being destroyed until the new
+	 * request is freed.  This guarantees icq's won't be destroyed and
+	 * makes creating new ones safe.
+	 *
+	 * Also, lookup icq while holding queue_lock.  If it doesn't exist,
+	 * it will be created after releasing queue_lock.
+	 */
 	if (blk_rq_should_init_elevator(bio) &&
 	    !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
 		rw_flags |= REQ_ELVPRIV;
 		rl->elvpriv++;
+		if (et->icq_cache && ioc)
+			icq = ioc_lookup_icq(ioc, q);
 	}
 
 	if (blk_queue_io_stat(q))
 		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
-	rq = blk_alloc_request(q, rw_flags, gfp_mask);
+	/* create icq if missing */
+	if (unlikely(et->icq_cache && !icq))
+		icq = ioc_create_icq(q, gfp_mask);
+
+	/* rqs are guaranteed to have icq on elv_set_request() if requested */
+	if (likely(!et->icq_cache || icq))
+		rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+
 	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 0910a5584d38..c04d16b02225 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -289,7 +289,6 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 		kmem_cache_free(iocontext_cachep, ioc);
 	task_unlock(task);
 }
-EXPORT_SYMBOL(create_io_context_slowpath);
 
 /**
  * get_task_io_context - get io_context of a task
@@ -362,6 +361,65 @@ out:
 }
 EXPORT_SYMBOL(ioc_lookup_icq);
 
+/**
+ * ioc_create_icq - create and link io_cq
+ * @q: request_queue of interest
+ * @gfp_mask: allocation mask
+ *
+ * Make sure io_cq linking %current->io_context and @q exists.  If either
+ * io_context and/or icq don't exist, they will be created using @gfp_mask.
+ *
+ * The caller is responsible for ensuring @ioc won't go away and @q is
+ * alive and will stay alive until this function returns.
+ */
+struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
+{
+	struct elevator_type *et = q->elevator->type;
+	struct io_context *ioc;
+	struct io_cq *icq;
+
+	/* allocate stuff */
+	ioc = create_io_context(current, gfp_mask, q->node);
+	if (!ioc)
+		return NULL;
+
+	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
+				    q->node);
+	if (!icq)
+		return NULL;
+
+	if (radix_tree_preload(gfp_mask) < 0) {
+		kmem_cache_free(et->icq_cache, icq);
+		return NULL;
+	}
+
+	icq->ioc = ioc;
+	icq->q = q;
+	INIT_LIST_HEAD(&icq->q_node);
+	INIT_HLIST_NODE(&icq->ioc_node);
+
+	/* lock both q and ioc and try to link @icq */
+	spin_lock_irq(q->queue_lock);
+	spin_lock(&ioc->lock);
+
+	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
+		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
+		list_add(&icq->q_node, &q->icq_list);
+		if (et->ops.elevator_init_icq_fn)
+			et->ops.elevator_init_icq_fn(icq);
+	} else {
+		kmem_cache_free(et->icq_cache, icq);
+		icq = ioc_lookup_icq(ioc, q);
+		if (!icq)
+			printk(KERN_ERR "cfq: icq link failed!\n");
+	}
+
+	spin_unlock(&ioc->lock);
+	spin_unlock_irq(q->queue_lock);
+	radix_tree_preload_end();
+	return icq;
+}
+
 void ioc_set_changed(struct io_context *ioc, int which)
 {
 	struct io_cq *icq;
diff --git a/block/blk.h b/block/blk.h
index ed4d9bf2ab16..7efd772336de 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -200,6 +200,7 @@ static inline int blk_do_io_stat(struct request *rq)
  */
 void get_io_context(struct io_context *ioc);
 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
+struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
 void ioc_clear_queue(struct request_queue *q);
 
 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 11f49d036845..f3b44c394e6d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2935,117 +2935,6 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	return cfqq;
 }
 
-/**
- * ioc_create_icq - create and link io_cq
- * @q: request_queue of interest
- * @gfp_mask: allocation mask
- *
- * Make sure io_cq linking %current->io_context and @q exists.  If either
- * io_context and/or icq don't exist, they will be created using @gfp_mask.
- *
- * The caller is responsible for ensuring @ioc won't go away and @q is
- * alive and will stay alive until this function returns.
- */
-static struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
-{
-	struct elevator_type *et = q->elevator->type;
-	struct io_context *ioc;
-	struct io_cq *icq;
-
-	/* allocate stuff */
-	ioc = create_io_context(current, gfp_mask, q->node);
-	if (!ioc)
-		return NULL;
-
-	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
-				    q->node);
-	if (!icq)
-		return NULL;
-
-	if (radix_tree_preload(gfp_mask) < 0) {
-		kmem_cache_free(et->icq_cache, icq);
-		return NULL;
-	}
-
-	icq->ioc = ioc;
-	icq->q = q;
-	INIT_LIST_HEAD(&icq->q_node);
-	INIT_HLIST_NODE(&icq->ioc_node);
-
-	/* lock both q and ioc and try to link @icq */
-	spin_lock_irq(q->queue_lock);
-	spin_lock(&ioc->lock);
-
-	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
-		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
-		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.elevator_init_icq_fn)
-			et->ops.elevator_init_icq_fn(icq);
-	} else {
-		kmem_cache_free(et->icq_cache, icq);
-		icq = ioc_lookup_icq(ioc, q);
-		if (!icq)
-			printk(KERN_ERR "cfq: icq link failed!\n");
-	}
-
-	spin_unlock(&ioc->lock);
-	spin_unlock_irq(q->queue_lock);
-	radix_tree_preload_end();
-	return icq;
-}
-
-/**
- * cfq_get_cic - acquire cfq_io_cq and bump refcnt on io_context
- * @cfqd: cfqd to setup cic for
- * @gfp_mask: allocation mask
- *
- * Return cfq_io_cq associating @cfqd and %current->io_context and
- * bump refcnt on io_context.  If ioc or cic doesn't exist, they're created
- * using @gfp_mask.
- *
- * Must be called under queue_lock which may be released and re-acquired.
- * This function also may sleep depending on @gfp_mask.
- */
-static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
-	struct request_queue *q = cfqd->queue;
-	struct cfq_io_cq *cic = NULL;
-	struct io_context *ioc;
-
-	lockdep_assert_held(q->queue_lock);
-
-	while (true) {
-		/* fast path */
-		ioc = current->io_context;
-		if (likely(ioc)) {
-			cic = cfq_cic_lookup(cfqd, ioc);
-			if (likely(cic))
-				break;
-		}
-
-		/* slow path - unlock, create missing ones and retry */
-		spin_unlock_irq(q->queue_lock);
-		cic = icq_to_cic(ioc_create_icq(q, gfp_mask));
-		spin_lock_irq(q->queue_lock);
-		if (!cic)
-			return NULL;
-	}
-
-	/* bump @ioc's refcnt and handle changed notifications */
-	get_io_context(ioc);
-
-	if (unlikely(cic->icq.changed)) {
-		if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
-			changed_ioprio(cic);
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-		if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
-			changed_cgroup(cic);
-#endif
-	}
-
-	return cic;
-}
-
 static void
 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
 {
@@ -3524,8 +3413,6 @@ static void cfq_put_request(struct request *rq)
 		BUG_ON(!cfqq->allocated[rw]);
 		cfqq->allocated[rw]--;
 
-		put_io_context(RQ_CIC(rq)->icq.ioc, cfqq->cfqd->queue);
-
 		/* Put down rq reference on cfqg */
 		cfq_put_cfqg(RQ_CFQG(rq));
 		rq->elv.priv[0] = NULL;
@@ -3574,7 +3461,7 @@ static int
 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_io_cq *cic;
+	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
@@ -3582,9 +3469,16 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	spin_lock_irq(q->queue_lock);
-	cic = cfq_get_cic(cfqd, gfp_mask);
-	if (!cic)
-		goto queue_fail;
+
+	/* handle changed notifications */
+	if (unlikely(cic->icq.changed)) {
+		if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
+			changed_ioprio(cic);
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+		if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
+			changed_cgroup(cic);
+#endif
+	}
 
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
@@ -3615,17 +3509,10 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
-	rq->elv.icq = &cic->icq;
 	rq->elv.priv[0] = cfqq;
 	rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
-
-queue_fail:
-	cfq_schedule_dispatch(cfqd);
-	spin_unlock_irq(q->queue_lock);
-	cfq_log(cfqd, "set_request fail");
-	return 1;
 }
 
 static void cfq_kick_queue(struct work_struct *work)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index c8f1e67a8ebe..c24f3d7fbf1e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -60,8 +60,8 @@ struct elevator_ops
 	elevator_request_list_fn *elevator_former_req_fn;
 	elevator_request_list_fn *elevator_latter_req_fn;
 
-	elevator_init_icq_fn *elevator_init_icq_fn;
-	elevator_exit_icq_fn *elevator_exit_icq_fn;
+	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
+	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */
 
 	elevator_set_req_fn *elevator_set_req_fn;
 	elevator_put_req_fn *elevator_put_req_fn;
@@ -90,8 +90,8 @@ struct elevator_type
 
 	/* fields provided by elevator implementation */
 	struct elevator_ops ops;
-	size_t icq_size;
-	size_t icq_align;
+	size_t icq_size;	/* see iocontext.h */
+	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index ac390a34c0e7..7e1371c4bccf 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -10,6 +10,65 @@ enum {
 	ICQ_CGROUP_CHANGED,
 };
 
+/*
+ * An io_cq (icq) is association between an io_context (ioc) and a
+ * request_queue (q).  This is used by elevators which need to track
+ * information per ioc - q pair.
+ *
+ * Elevator can request use of icq by setting elevator_type->icq_size and
+ * ->icq_align.  Both size and align must be larger than that of struct
+ * io_cq and elevator can use the tail area for private information.  The
+ * recommended way to do this is defining a struct which contains io_cq as
+ * the first member followed by private members and using its size and
+ * align.  For example,
+ *
+ *	struct snail_io_cq {
+ *		struct io_cq	icq;
+ *		int		poke_snail;
+ *		int		feed_snail;
+ *	};
+ *
+ *	struct elevator_type snail_elv_type {
+ *		.ops =		{ ... },
+ *		.icq_size =	sizeof(struct snail_io_cq),
+ *		.icq_align =	__alignof__(struct snail_io_cq),
+ *		...
+ *	};
+ *
+ * If icq_size is set, block core will manage icq's.  All requests will
+ * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
+ * is called and be holding a reference to the associated io_context.
+ *
+ * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
+ * called and, on destruction, ->elevator_exit_icq_fn().  Both functions
+ * are called with both the associated io_context and queue locks held.
+ *
+ * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
+ * queue lock but the returned icq is valid only until the queue lock is
+ * released.  Elevators can not and should not try to create or destroy
+ * icq's.
+ *
+ * As icq's are linked from both ioc and q, the locking rules are a bit
+ * complex.
+ *
+ * - ioc lock nests inside q lock.
+ *
+ * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
+ *   q->icq_list and icq->q_node by q lock.
+ *
+ * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
+ *   itself is protected by q lock.  However, both the indexes and icq
+ *   itself are also RCU managed and lookup can be performed holding only
+ *   the q lock.
+ *
+ * - icq's are not reference counted.  They are destroyed when either the
+ *   ioc or q goes away.  Each request with icq set holds an extra
+ *   reference to ioc to ensure it stays until the request is completed.
+ *
+ * - Linking and unlinking icq's are performed while holding both ioc and q
+ *   locks.  Due to the lock ordering, q exit is simple but ioc exit
+ *   requires reverse-order double lock dance.
+ */
 struct io_cq {
 	struct request_queue	*q;
 	struct io_context	*ioc;
-- 
cgit v1.2.3


From 4a0b75c7d02c2bd46ed227d4ba5941ba8a0aba5d Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 16 Dec 2011 14:00:22 +0100
Subject: block, cfq: fix empty queue crash caused by request merge

All requests of a queue could be merged to other requests of other queue.
Such queue will not have request in it, but it's in service tree. This
will cause kernel oops.
I encounter a BUG_ON() in cfq_dispatch_request() with next patch, but the
issue should exist without the patch.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f3b44c394e6d..163263ddd381 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1656,6 +1656,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 		    struct request *next)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+
 	/*
 	 * reposition in fifo if next is older than rq
 	 */
@@ -1670,6 +1672,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	cfq_remove_request(next);
 	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(next), rq_is_sync(next));
+
+	cfqq = RQ_CFQQ(next);
+	/*
+	 * all requests of this queue are merged to other queues, delete it
+	 * from the service tree. If it's the active_queue,
+	 * cfq_dispatch_requests() will choose to expire it or do idle
+	 */
+	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
+	    cfqq != cfqd->active_queue)
+		cfq_del_cfqq_rr(cfqd, cfqq);
 }
 
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
-- 
cgit v1.2.3


From 274193224cdabd687d804a26e0150bb20f2dd52c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 16 Dec 2011 14:00:31 +0100
Subject: block: recursive merge requests

In my workload, thread 1 accesses a, a+2, ..., thread 2 accesses a+1,
a+3,.... When the requests are flushed to queue, a and a+1 are merged
to (a, a+1), a+2 and a+3 too to (a+2, a+3), but (a, a+1) and (a+2, a+3)
aren't merged.
With recursive merge below, the workload throughput gets improved 20%
and context switch drops 60%.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/elevator.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 91e18f8af9be..99838f460b44 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -515,6 +515,7 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
 				     struct request *rq)
 {
 	struct request *__rq;
+	bool ret;
 
 	if (blk_queue_nomerges(q))
 		return false;
@@ -528,14 +529,21 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
 	if (blk_queue_noxmerges(q))
 		return false;
 
+	ret = false;
 	/*
 	 * See if our hash lookup can find a potential backmerge.
 	 */
-	__rq = elv_rqhash_find(q, blk_rq_pos(rq));
-	if (__rq && blk_attempt_req_merge(q, __rq, rq))
-		return true;
+	while (1) {
+		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
+		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
+			break;
 
-	return false;
+		/* The merged request could be merged with others, try again */
+		ret = true;
+		rq = __rq;
+	}
+
+	return ret;
 }
 
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
-- 
cgit v1.2.3


From 64c42998f14d5894ea3138625897d620b30c8e4e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 19 Dec 2011 10:36:44 +0100
Subject: block: ioc_cgroup_changed() needs to be exported

With the ioc changed, ioc_cgroup_changed() can be used by modular
code. So ensure that it is exported.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index c04d16b02225..ce9b35a9468e 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -464,6 +464,7 @@ void ioc_cgroup_changed(struct io_context *ioc)
 	ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
+EXPORT_SYMBOL(ioc_cgroup_changed);
 
 static int __init blk_ioc_init(void)
 {
-- 
cgit v1.2.3


From fd63836811d6e5b5f5f608abf865bc9e91762c8c Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sun, 25 Dec 2011 14:29:14 +0100
Subject: block: an exiting task should be allowed to create io_context

While fixing io_context creation / task exit race condition,
6e736be7f2 "block: make ioc get/put interface more conventional and
fix race on alloction" also prevented an exiting (%PF_EXITING) task
from creating its own io_context.  This is incorrect as exit path may
issue IOs, e.g. from exit_files(), and if those IOs are the first ones
issued by the task, io_context needs to be created to process the IOs.

Combined with the existing problem of io_context / io_cq creation
failure having the possibility of stalling IO, this problem results in
deterministic full IO lockup with certain workloads.

Fix it by allowing io_context creation regardless of %PF_EXITING for
%current.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index ce9b35a9468e..33fae7df16a0 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -281,9 +281,16 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 	INIT_HLIST_HEAD(&ioc->icq_list);
 	INIT_WORK(&ioc->release_work, ioc_release_fn);
 
-	/* try to install, somebody might already have beaten us to it */
+	/*
+	 * Try to install.  ioc shouldn't be installed if someone else
+	 * already did or @task, which isn't %current, is exiting.  Note
+	 * that we need to allow ioc creation on exiting %current as exit
+	 * path may issue IOs from e.g. exit_files().  The exit path is
+	 * responsible for not issuing IO after exit_io_context().
+	 */
 	task_lock(task);
-	if (!task->io_context && !(task->flags & PF_EXITING))
+	if (!task->io_context &&
+	    (task == current || !(task->flags & PF_EXITING)))
 		task->io_context = ioc;
 	else
 		kmem_cache_free(iocontext_cachep, ioc);
-- 
cgit v1.2.3


From c98b2cc29af8e84e7364b53e9bb4cc7cfaf62555 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 27 Dec 2011 18:52:16 +0100
Subject: block: remove WARN_ON_ONCE() in exit_io_context()

6e736be7 "block: make ioc get/put interface more conventional and fix
race on alloction" added WARN_ON_ONCE() in exit_io_context() which
triggers if !PF_EXITING.  All tasks hitting exit_io_context() from
task exit should have PF_EXITING set but task struct tearing down
after fork failure calls into the function without PF_EXITING,
triggering the condition.

  WARNING: at block/blk-ioc.c:234 exit_io_context+0x40/0x92()
  Pid: 17090, comm: trinity Not tainted 3.2.0-rc6-next-20111222-sasha-dirty #77
  Call Trace:
   [<ffffffff810b69a3>] warn_slowpath_common+0x8f/0xb2
   [<ffffffff810b6a77>] warn_slowpath_null+0x18/0x1a
   [<ffffffff8181a7a2>] exit_io_context+0x40/0x92
   [<ffffffff810b58c9>] copy_process+0x126f/0x1453
   [<ffffffff810b5c1b>] do_fork+0x120/0x3e9
   [<ffffffff8106242f>] sys_clone+0x26/0x28
   [<ffffffff82425803>] stub_clone+0x13/0x20
  ---[ end trace a2e4eb670b375238 ]---

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 33fae7df16a0..27a06e00eaec 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -230,9 +230,6 @@ void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
 
-	/* PF_EXITING prevents new io_context from being attached to @task */
-	WARN_ON_ONCE(!(current->flags & PF_EXITING));
-
 	task_lock(task);
 	ioc = task->io_context;
 	task->io_context = NULL;
-- 
cgit v1.2.3


From b1bd055d397e09f99dcef9b138ed104ff1812fcb Mon Sep 17 00:00:00 2001
From: Martin K. Petersen
Date: Wed, 11 Jan 2012 16:27:11 +0100
Subject: block: Introduce blk_set_stacking_limits function

Stacking driver queue limits are typically bounded exclusively by the
capabilities of the low level devices, not by the stacking driver
itself.

This patch introduces blk_set_stacking_limits() which has more liberal
metrics than the default queue limits function. This allows us to
inherit topology parameters from bottom devices without manually
tweaking the default limits in each driver prior to calling the stacking
function.

Since there is now a clear distinction between stacking and low-level
devices, blk_set_default_limits() has been modified to carry the more
conservative values that we used to manually set in
blk_queue_make_request().

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c   | 32 ++++++++++++++++++++++++--------
 drivers/md/dm-table.c  |  6 +++---
 drivers/md/md.c        |  1 +
 include/linux/blkdev.h |  1 +
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index fa1eb0449a05..d3234fc494ad 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -104,9 +104,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
  * @lim:  the queue_limits structure to reset
  *
  * Description:
- *   Returns a queue_limit struct to its default state.  Can be used by
- *   stacking drivers like DM that stage table swaps and reuse an
- *   existing device queue.
+ *   Returns a queue_limit struct to its default state.
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
@@ -114,13 +112,12 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_integrity_segments = 0;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
-	lim->max_sectors = BLK_DEF_MAX_SECTORS;
-	lim->max_hw_sectors = INT_MAX;
+	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
 	lim->max_discard_sectors = 0;
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
-	lim->discard_zeroes_data = 1;
+	lim->discard_zeroes_data = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -130,6 +127,27 @@ void blk_set_default_limits(struct queue_limits *lim)
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
+/**
+ * blk_set_stacking_limits - set default limits for stacking devices
+ * @lim:  the queue_limits structure to reset
+ *
+ * Description:
+ *   Returns a queue_limit struct to its default state. Should be used
+ *   by stacking drivers like DM that have no internal limits.
+ */
+void blk_set_stacking_limits(struct queue_limits *lim)
+{
+	blk_set_default_limits(lim);
+
+	/* Inherit limits from component devices */
+	lim->discard_zeroes_data = 1;
+	lim->max_segments = USHRT_MAX;
+	lim->max_hw_sectors = UINT_MAX;
+
+	lim->max_sectors = BLK_DEF_MAX_SECTORS;
+}
+EXPORT_SYMBOL(blk_set_stacking_limits);
+
 /**
  * blk_queue_make_request - define an alternate make_request function for a device
  * @q:  the request queue for the device to be affected
@@ -165,8 +183,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->nr_batching = BLK_BATCH_REQ;
 
 	blk_set_default_limits(&q->limits);
-	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
-	q->limits.discard_zeroes_data = 0;
 
 	/*
 	 * by default assume old behaviour and bounce for any highmem page
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8e9132130142..63cc54289aff 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -699,7 +699,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
 	while (i < dm_table_get_num_targets(table)) {
 		ti = dm_table_get_target(table, i++);
 
-		blk_set_default_limits(&ti_limits);
+		blk_set_stacking_limits(&ti_limits);
 
 		/* combine all target devices' limits */
 		if (ti->type->iterate_devices)
@@ -1221,10 +1221,10 @@ int dm_calculate_queue_limits(struct dm_table *table,
 	struct queue_limits ti_limits;
 	unsigned i = 0;
 
-	blk_set_default_limits(limits);
+	blk_set_stacking_limits(limits);
 
 	while (i < dm_table_get_num_targets(table)) {
-		blk_set_default_limits(&ti_limits);
+		blk_set_stacking_limits(&ti_limits);
 
 		ti = dm_table_get_target(table, i++);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ee981737edfc..114ba155af87 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4622,6 +4622,7 @@ static int md_alloc(dev_t dev, char *name)
 	mddev->queue->queuedata = mddev;
 
 	blk_queue_make_request(mddev->queue, md_make_request);
+	blk_set_stacking_limits(&mddev->queue->limits);
 
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8bca04873f53..adc34133a56a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -844,6 +844,7 @@ extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_default_limits(struct queue_limits *lim);
+extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			    sector_t offset);
 extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
-- 
cgit v1.2.3


From ef00f59c95fe6e002e7c6e3663cdea65e253f4cc Mon Sep 17 00:00:00 2001
From: Martin K. Petersen
Date: Wed, 11 Jan 2012 16:29:31 +0100
Subject: block: Add BLKROTATIONAL ioctl

Introduce an ioctl which permits applications to query whether a block
device is rotational.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/compat_ioctl.c | 3 +++
 block/ioctl.c        | 2 ++
 include/linux/fs.h   | 1 +
 3 files changed, 6 insertions(+)

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7b725020823c..7c668c8a6f95 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -719,6 +719,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
 					 queue_max_sectors(bdev_get_queue(bdev)));
+	case BLKROTATIONAL:
+		return compat_put_ushort(arg,
+					 !blk_queue_nonrot(bdev_get_queue(bdev)));
 	case BLKRASET: /* compatible, but no compat_ptr (!) */
 	case BLKFRASET:
 		if (!capable(CAP_SYS_ADMIN))
diff --git a/block/ioctl.c b/block/ioctl.c
index ca939fc1030f..337d207ab14d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -278,6 +278,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+	case BLKROTATIONAL:
+		return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
 	case BLKRASET:
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0bc4ffb8e7f..95dd911506f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -319,6 +319,7 @@ struct inodes_stat_t {
 #define BLKPBSZGET _IO(0x12,123)
 #define BLKDISCARDZEROES _IO(0x12,124)
 #define BLKSECDISCARD _IO(0x12,125)
+#define BLKROTATIONAL _IO(0x12,126)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
-- 
cgit v1.2.3


From 0b4156eb27214e81f7012458bb15d1e038db9a00 Mon Sep 17 00:00:00 2001
From: Namjae Jeon
Date: Thu, 12 Jan 2012 09:11:56 +0100
Subject: fs: remove unneeded plug in mpage_readpages()

The block plug in mpage_readpages() duplicates the one in read_pages().

Signed-off-by: Namjae Jeon <linkinjeon@gmail.com>
Signed-off-by: Amit Sahrawat <amit.sahrawat83@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/mpage.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/mpage.c b/fs/mpage.c
index fdfae9fa98cd..643e9f55ef29 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
-	struct blk_plug plug;
-
-	blk_start_plug(&plug);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		mpage_bio_submit(READ, bio);
-	blk_finish_plug(&plug);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_readpages);
-- 
cgit v1.2.3


From fd83240a60ecc59849420df3393e9e6d35c77683 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Thu, 12 Jan 2012 09:17:30 +0100
Subject: blockdev: convert some macros to static inlines

We prefer to program in C rather than preprocessor and it fixes this
warning when CONFIG_BLK_DEV_INTEGRITY is not set:

drivers/md/dm-table.c: In function 'dm_table_set_integrity':
drivers/md/dm-table.c:1285:3: warning: statement with no effect [-Wunused-value]

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 77 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 64 insertions(+), 13 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index adc34133a56a..5cfb9b22627f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1298,19 +1298,70 @@ queue_max_integrity_segments(struct request_queue *q)
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
-#define blk_integrity_rq(rq)			(0)
-#define blk_rq_count_integrity_sg(a, b)		(0)
-#define blk_rq_map_integrity_sg(a, b, c)	(0)
-#define bdev_get_integrity(a)			(0)
-#define blk_get_integrity(a)			(0)
-#define blk_integrity_compare(a, b)		(0)
-#define blk_integrity_register(a, b)		(0)
-#define blk_integrity_unregister(a)		do { } while (0)
-#define blk_queue_max_integrity_segments(a, b)	do { } while (0)
-#define queue_max_integrity_segments(a)		(0)
-#define blk_integrity_merge_rq(a, b, c)		(0)
-#define blk_integrity_merge_bio(a, b, c)	(0)
-#define blk_integrity_is_initialized(a)		(0)
+struct bio;
+struct block_device;
+struct gendisk;
+struct blk_integrity;
+
+static inline int blk_integrity_rq(struct request *rq)
+{
+	return 0;
+}
+static inline int blk_rq_count_integrity_sg(struct request_queue *q,
+					    struct bio *b)
+{
+	return 0;
+}
+static inline int blk_rq_map_integrity_sg(struct request_queue *q,
+					  struct bio *b,
+					  struct scatterlist *s)
+{
+	return 0;
+}
+static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
+{
+	return 0;
+}
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+	return NULL;
+}
+static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
+{
+	return 0;
+}
+static inline int blk_integrity_register(struct gendisk *d,
+					 struct blk_integrity *b)
+{
+	return 0;
+}
+static inline void blk_integrity_unregister(struct gendisk *d)
+{
+}
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+						    unsigned int segs)
+{
+}
+static inline unsigned short queue_max_integrity_segments(struct request_queue *q)
+{
+	return 0;
+}
+static inline int blk_integrity_merge_rq(struct request_queue *rq,
+					 struct request *r1,
+					 struct request *r2)
+{
+	return 0;
+}
+static inline int blk_integrity_merge_bio(struct request_queue *rq,
+					  struct request *r,
+					  struct bio *b)
+{
+	return 0;
+}
+static inline bool blk_integrity_is_initialized(struct gendisk *g)
+{
+	return 0;
+}
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-- 
cgit v1.2.3


From 6898e3bd11cc9a931ef115eee9000ac9d8f8c3cf Mon Sep 17 00:00:00 2001
From: Martin K. Petersen
Date: Fri, 13 Jan 2012 08:15:33 +0100
Subject: block: Stop using macro stubs for the bio data integrity calls

Replace preprocessor macro stubs with real function declarations to
prevent warnings when CONFIG_BLK_DEV_INTEGRITY is disabled.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 66 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 847994aef0e9..129a9c097958 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -515,24 +515,64 @@ extern void bio_integrity_init(void);
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
-#define bio_integrity(a)		(0)
-#define bioset_integrity_create(a, b)	(0)
-#define bio_integrity_prep(a)		(0)
-#define bio_integrity_enabled(a)	(0)
+static inline int bio_integrity(struct bio *bio)
+{
+	return 0;
+}
+
+static inline int bio_integrity_enabled(struct bio *bio)
+{
+	return 0;
+}
+
+static inline int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+	return 0;
+}
+
+static inline void bioset_integrity_free (struct bio_set *bs)
+{
+	return;
+}
+
+static inline int bio_integrity_prep(struct bio *bio)
+{
+	return 0;
+}
+
+static inline void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+	return;
+}
+
 static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 				      gfp_t gfp_mask, struct bio_set *bs)
 {
 	return 0;
 }
-#define bioset_integrity_free(a)	do { } while (0)
-#define bio_integrity_free(a, b)	do { } while (0)
-#define bio_integrity_endio(a, b)	do { } while (0)
-#define bio_integrity_advance(a, b)	do { } while (0)
-#define bio_integrity_trim(a, b, c)	do { } while (0)
-#define bio_integrity_split(a, b, c)	do { } while (0)
-#define bio_integrity_set_tag(a, b, c)	do { } while (0)
-#define bio_integrity_get_tag(a, b, c)	do { } while (0)
-#define bio_integrity_init(a)		do { } while (0)
+
+static inline void bio_integrity_split(struct bio *bio, struct bio_pair *bp,
+				       int sectors)
+{
+	return;
+}
+
+static inline void bio_integrity_advance(struct bio *bio,
+					 unsigned int bytes_done)
+{
+	return;
+}
+
+static inline void bio_integrity_trim(struct bio *bio, unsigned int offset,
+				      unsigned int sectors)
+{
+	return;
+}
+
+static inline void bio_integrity_init(void)
+{
+	return;
+}
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-- 
cgit v1.2.3


From 5d381efb3d1f1ef10535a31ca0dd9b22fe1e1922 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 15 Jan 2012 10:29:48 +0100
Subject: Revert "block: recursive merge requests"

This reverts commit 274193224cdabd687d804a26e0150bb20f2dd52c.

We have some problems related to selection of empty queues
that need to be resolved, evidence so far points to the
recursive merge logic making either being the cause or at
least the accelerator for this. So revert it for now, until
we figure this out.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/elevator.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 99838f460b44..91e18f8af9be 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -515,7 +515,6 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
 				     struct request *rq)
 {
 	struct request *__rq;
-	bool ret;
 
 	if (blk_queue_nomerges(q))
 		return false;
@@ -529,21 +528,14 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
 	if (blk_queue_noxmerges(q))
 		return false;
 
-	ret = false;
 	/*
 	 * See if our hash lookup can find a potential backmerge.
 	 */
-	while (1) {
-		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
-		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
-			break;
-
-		/* The merged request could be merged with others, try again */
-		ret = true;
-		rq = __rq;
-	}
+	__rq = elv_rqhash_find(q, blk_rq_pos(rq));
+	if (__rq && blk_attempt_req_merge(q, __rq, rq))
+		return true;
 
-	return ret;
+	return false;
 }
 
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
-- 
cgit v1.2.3