From 0cc61e64e21cfc24fa0d938fd148aba4a595163b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 19 Jun 2018 18:40:14 +0200
Subject: block: fix timeout changes for legacy request drivers

blk_mq_complete_request can only be called for blk-mq drivers, but when
removing the BLK_EH_HANDLED return value, two legacy request timeout
methods incorrectly got switched to call blk_mq_complete_request.
Call __blk_complete_request instead to reinstance the previous behavior.
For that __blk_complete_request needs to be exported.

Fixes: 1fc2b62e ("scsi_transport_fc: complete requests from ->timeout")
Fixes: 0df0bb08 ("null_blk: complete requests from ->timeout")
Reported-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-softirq.c              | 1 +
 drivers/block/null_blk.c         | 2 +-
 drivers/scsi/scsi_transport_fc.c | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 01e2b353a2b9..15c1f5e12eb8 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -144,6 +144,7 @@ do_local:
 
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL(__blk_complete_request);
 
 /**
  * blk_complete_request - end I/O on a request
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 2bdadd7f1454..3d8bdbe9bd35 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -1365,7 +1365,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
 static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
 {
 	pr_info("null: rq %p timed out\n", rq);
-	blk_mq_complete_request(rq);
+	__blk_complete_request(rq);
 	return BLK_EH_DONE;
 }
 
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 1da3d71e9f61..13948102ca29 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req)
 
 	/* the blk_end_sync_io() doesn't check the error */
 	if (inflight)
-		blk_mq_complete_request(req);
+		__blk_complete_request(req);
 	return BLK_EH_DONE;
 }
 
-- 
cgit v1.2.3


From 9c24c10a2c1e1bb478b6bb70612d9e885aee044f Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Tue, 19 Jun 2018 10:26:40 -0700
Subject: Revert "block: Add warning for bi_next not NULL in bio_endio()"

Commit 0ba99ca4838b ("block: Add warning for bi_next not NULL in
bio_endio()") breaks the dm driver. end_clone_bio() detects whether
or not a bio is the last bio associated with a request by checking
the .bi_next field. Commit 0ba99ca4838b clears that field before
end_clone_bio() has had a chance to inspect that field. Hence revert
commit 0ba99ca4838b.

This patch avoids that KASAN reports the following complaint when
running the srp-test software (srp-test/run_tests -c -d -r 10 -t 02-mq):

==================================================================
BUG: KASAN: use-after-free in bio_advance+0x11b/0x1d0
Read of size 4 at addr ffff8801300e06d0 by task ksoftirqd/0/9

CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 4.18.0-rc1-dbg+ #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
Call Trace:
 dump_stack+0xa4/0xf5
 print_address_description+0x6f/0x270
 kasan_report+0x241/0x360
 __asan_load4+0x78/0x80
 bio_advance+0x11b/0x1d0
 blk_update_request+0xa7/0x5b0
 scsi_end_request+0x56/0x320 [scsi_mod]
 scsi_io_completion+0x7d6/0xb20 [scsi_mod]
 scsi_finish_command+0x1c0/0x280 [scsi_mod]
 scsi_softirq_done+0x19a/0x230 [scsi_mod]
 blk_mq_complete_request+0x160/0x240
 scsi_mq_done+0x50/0x1a0 [scsi_mod]
 srp_recv_done+0x515/0x1330 [ib_srp]
 __ib_process_cq+0xa0/0xf0 [ib_core]
 ib_poll_handler+0x38/0xa0 [ib_core]
 irq_poll_softirq+0xe8/0x1f0
 __do_softirq+0x128/0x60d
 run_ksoftirqd+0x3f/0x60
 smpboot_thread_fn+0x352/0x460
 kthread+0x1c1/0x1e0
 ret_from_fork+0x24/0x30

Allocated by task 1918:
 save_stack+0x43/0xd0
 kasan_kmalloc+0xad/0xe0
 kasan_slab_alloc+0x11/0x20
 kmem_cache_alloc+0xfe/0x350
 mempool_alloc_slab+0x15/0x20
 mempool_alloc+0xfb/0x270
 bio_alloc_bioset+0x244/0x350
 submit_bh_wbc+0x9c/0x2f0
 __block_write_full_page+0x299/0x5a0
 block_write_full_page+0x16b/0x180
 blkdev_writepage+0x18/0x20
 __writepage+0x42/0x80
 write_cache_pages+0x376/0x8a0
 generic_writepages+0xbe/0x110
 blkdev_writepages+0xe/0x10
 do_writepages+0x9b/0x180
 __filemap_fdatawrite_range+0x178/0x1c0
 file_write_and_wait_range+0x59/0xc0
 blkdev_fsync+0x46/0x80
 vfs_fsync_range+0x66/0x100
 do_fsync+0x3d/0x70
 __x64_sys_fsync+0x21/0x30
 do_syscall_64+0x77/0x230
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 9:
 save_stack+0x43/0xd0
 __kasan_slab_free+0x137/0x190
 kasan_slab_free+0xe/0x10
 kmem_cache_free+0xd3/0x380
 mempool_free_slab+0x17/0x20
 mempool_free+0x63/0x160
 bio_free+0x81/0xa0
 bio_put+0x59/0x60
 end_bio_bh_io_sync+0x5d/0x70
 bio_endio+0x1a7/0x360
 blk_update_request+0xd0/0x5b0
 end_clone_bio+0xa3/0xd0 [dm_mod]
 bio_endio+0x1a7/0x360
 blk_update_request+0xd0/0x5b0
 scsi_end_request+0x56/0x320 [scsi_mod]
 scsi_io_completion+0x7d6/0xb20 [scsi_mod]
 scsi_finish_command+0x1c0/0x280 [scsi_mod]
 scsi_softirq_done+0x19a/0x230 [scsi_mod]
 blk_mq_complete_request+0x160/0x240
 scsi_mq_done+0x50/0x1a0 [scsi_mod]
 srp_recv_done+0x515/0x1330 [ib_srp]
 __ib_process_cq+0xa0/0xf0 [ib_core]
 ib_poll_handler+0x38/0xa0 [ib_core]
 irq_poll_softirq+0xe8/0x1f0
 __do_softirq+0x128/0x60d

The buggy address belongs to the object at ffff8801300e0640
 which belongs to the cache bio-0 of size 200
The buggy address is located 144 bytes inside of
 200-byte region [ffff8801300e0640, ffff8801300e0708)
The buggy address belongs to the page:
page:ffffea0004c03800 count:1 mapcount:0 mapping:ffff88015a563a00 index:0x0 compound_mapcount: 0
flags: 0x8000000000008100(slab|head)
raw: 8000000000008100 dead000000000100 dead000000000200 ffff88015a563a00
raw: 0000000000000000 0000000000330033 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801300e0580: fb fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc
 ffff8801300e0600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
>ffff8801300e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                                 ^
 ffff8801300e0700: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8801300e0780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================

Cc: Kent Overstreet <kent.overstreet@gmail.com>
Fixes: 0ba99ca4838b ("block: Add warning for bi_next not NULL in bio_endio()")
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c      | 3 ---
 block/blk-core.c | 8 +-------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index db9a40e9a136..f7e3d88bd0b6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1807,9 +1807,6 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
-	if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
-		bio->bi_next = NULL;
-
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
 	 * various corner cases will break (like stacking block devices that
diff --git a/block/blk-core.c b/block/blk-core.c
index cf0ee764b908..afd2596ea3d3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -273,10 +273,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	/*
-	 * XXX this code looks suspicious - it's not consistent with advancing
-	 * req->bio in caller
-	 */
 	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
 		bio_endio(bio);
 }
@@ -3081,10 +3077,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
 		struct bio *bio = req->bio;
 		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
 
-		if (bio_bytes == bio->bi_iter.bi_size) {
+		if (bio_bytes == bio->bi_iter.bi_size)
 			req->bio = bio->bi_next;
-			bio->bi_next = NULL;
-		}
 
 		/* Completion has already been traced */
 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
-- 
cgit v1.2.3


From 3d0641015bf73aaa1cb54c936674959e7805070f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 19 Jun 2018 15:34:09 +0300
Subject: nvme-rdma: fix possible double free condition when failing to create
 a controller

Failures after nvme_init_ctrl will defer resource cleanups to .free_ctrl
when the reference is released, hence we should not free the controller
queues for these failures.

Fix that by moving controller queues allocation before controller
initialization and correctly freeing them for failures before
initialization and skip them for failures after initialization.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index c9424da0d23e..bcb0e5d6343d 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -888,9 +888,9 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 	list_del(&ctrl->list);
 	mutex_unlock(&nvme_rdma_ctrl_mutex);
 
-	kfree(ctrl->queues);
 	nvmf_free_options(nctrl->opts);
 free_ctrl:
+	kfree(ctrl->queues);
 	kfree(ctrl);
 }
 
@@ -1932,11 +1932,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 		goto out_free_ctrl;
 	}
 
-	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
-				0 /* no quirks, we're perfect! */);
-	if (ret)
-		goto out_free_ctrl;
-
 	INIT_DELAYED_WORK(&ctrl->reconnect_work,
 			nvme_rdma_reconnect_ctrl_work);
 	INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
@@ -1950,14 +1945,19 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
 				GFP_KERNEL);
 	if (!ctrl->queues)
-		goto out_uninit_ctrl;
+		goto out_free_ctrl;
+
+	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
+				0 /* no quirks, we're perfect! */);
+	if (ret)
+		goto out_kfree_queues;
 
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
 	WARN_ON_ONCE(!changed);
 
 	ret = nvme_rdma_configure_admin_queue(ctrl, true);
 	if (ret)
-		goto out_kfree_queues;
+		goto out_uninit_ctrl;
 
 	/* sanity check icdoff */
 	if (ctrl->ctrl.icdoff) {
@@ -2014,14 +2014,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 
 out_remove_admin_queue:
 	nvme_rdma_destroy_admin_queue(ctrl, true);
-out_kfree_queues:
-	kfree(ctrl->queues);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
 	if (ret > 0)
 		ret = -EIO;
 	return ERR_PTR(ret);
+out_kfree_queues:
+	kfree(ctrl->queues);
 out_free_ctrl:
 	kfree(ctrl);
 	return ERR_PTR(ret);
-- 
cgit v1.2.3


From 94e42213cc1ae41c57819539c0130f8dfc69d718 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 19 Jun 2018 15:34:10 +0300
Subject: nvme-rdma: fix possible free of a non-allocated async event buffer

If nvme_rdma_configure_admin_queue fails before we allocated
the async event buffer, we will falsly free it because
nvme_rdma_free_queue is freeing it. Fix it by allocating the buffer right
after nvme_rdma_alloc_queue and free it right before nvme_rdma_queue_free
to maintain orderly reverse cleanup sequence.

Reported-by: Israel Rukshin <israelr@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index bcb0e5d6343d..f9affb71ac85 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -560,12 +560,6 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
 	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 		return;
 
-	if (nvme_rdma_queue_idx(queue) == 0) {
-		nvme_rdma_free_qe(queue->device->dev,
-			&queue->ctrl->async_event_sqe,
-			sizeof(struct nvme_command), DMA_TO_DEVICE);
-	}
-
 	nvme_rdma_destroy_queue_ib(queue);
 	rdma_destroy_id(queue->cm_id);
 }
@@ -739,6 +733,8 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		blk_cleanup_queue(ctrl->ctrl.admin_q);
 		nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
 	}
+	nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+		sizeof(struct nvme_command), DMA_TO_DEVICE);
 	nvme_rdma_free_queue(&ctrl->queues[0]);
 }
 
@@ -755,11 +751,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
 	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
+	error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+			sizeof(struct nvme_command), DMA_TO_DEVICE);
+	if (error)
+		goto out_free_queue;
+
 	if (new) {
 		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
 		if (IS_ERR(ctrl->ctrl.admin_tagset)) {
 			error = PTR_ERR(ctrl->ctrl.admin_tagset);
-			goto out_free_queue;
+			goto out_free_async_qe;
 		}
 
 		ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
@@ -795,12 +796,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	if (error)
 		goto out_stop_queue;
 
-	error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
-			&ctrl->async_event_sqe, sizeof(struct nvme_command),
-			DMA_TO_DEVICE);
-	if (error)
-		goto out_stop_queue;
-
 	return 0;
 
 out_stop_queue:
@@ -811,6 +806,9 @@ out_cleanup_queue:
 out_free_tagset:
 	if (new)
 		nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+out_free_async_qe:
+	nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+		sizeof(struct nvme_command), DMA_TO_DEVICE);
 out_free_queue:
 	nvme_rdma_free_queue(&ctrl->queues[0]);
 	return error;
-- 
cgit v1.2.3


From c947657b15379505a9bba36a02005882b66abe57 Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Tue, 19 Jun 2018 15:34:11 +0300
Subject: nvme-rdma: Fix command completion race at error recovery

The race is between completing the request at error recovery work and
rdma completions.  If we cancel the request before getting the good
rdma completion we get a NULL deref of the request MR at
nvme_rdma_process_nvme_rsp().

When Canceling the request we return its mr to the mr pool (set mr to
NULL) and also unmap its data.  Canceling the requests while the rdma
queues are active is not safe.  Because rdma queues are active and we
get good rdma completions that can use the mr pointer which may be NULL.
Completing the request too soon may lead also to performing DMA to/from
user buffers which might have been already unmapped.

The commit fixes the race by draining the QP before starting the abort
commands mechanism.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f9affb71ac85..2815749f4dfb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -728,7 +728,6 @@ out:
 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	if (remove) {
 		blk_cleanup_queue(ctrl->ctrl.admin_q);
 		nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
@@ -817,7 +816,6 @@ out_free_queue:
 static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
-	nvme_rdma_stop_io_queues(ctrl);
 	if (remove) {
 		blk_cleanup_queue(ctrl->ctrl.connect_q);
 		nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
@@ -947,6 +945,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	return;
 
 destroy_admin:
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	nvme_rdma_destroy_admin_queue(ctrl, false);
 requeue:
 	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
@@ -963,12 +962,14 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 
 	if (ctrl->ctrl.queue_count > 1) {
 		nvme_stop_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
 		blk_mq_tagset_busy_iter(&ctrl->tag_set,
 					nvme_cancel_request, &ctrl->ctrl);
 		nvme_rdma_destroy_io_queues(ctrl, false);
 	}
 
 	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_cancel_request, &ctrl->ctrl);
 	nvme_rdma_destroy_admin_queue(ctrl, false);
@@ -1734,6 +1735,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
 	if (ctrl->ctrl.queue_count > 1) {
 		nvme_stop_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
 		blk_mq_tagset_busy_iter(&ctrl->tag_set,
 					nvme_cancel_request, &ctrl->ctrl);
 		nvme_rdma_destroy_io_queues(ctrl, shutdown);
@@ -1745,6 +1747,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 		nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
 
 	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_cancel_request, &ctrl->ctrl);
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
@@ -2011,6 +2014,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	return &ctrl->ctrl;
 
 out_remove_admin_queue:
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	nvme_rdma_destroy_admin_queue(ctrl, true);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
-- 
cgit v1.2.3


From 5e77d61cbc7e766778037127dab69e6410a8fc48 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 19 Jun 2018 15:34:13 +0300
Subject: nvme-rdma: don't override opts->queue_size

That is user argument, and theoretically controller limits can change
over time (over reconnects/resets).  Instead, use the sqsize controller
attribute to check queue depth boundaries and use it to the tagset
allocation.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2815749f4dfb..9544625c0b7d 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -692,7 +692,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set = &ctrl->tag_set;
 		memset(set, 0, sizeof(*set));
 		set->ops = &nvme_rdma_mq_ops;
-		set->queue_depth = nctrl->opts->queue_size;
+		set->queue_depth = nctrl->sqsize + 1;
 		set->reserved_tags = 1; /* fabric connect */
 		set->numa_node = NUMA_NO_NODE;
 		set->flags = BLK_MQ_F_SHOULD_MERGE;
@@ -1975,20 +1975,19 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 		goto out_remove_admin_queue;
 	}
 
-	if (opts->queue_size > ctrl->ctrl.maxcmd) {
-		/* warn if maxcmd is lower than queue_size */
-		dev_warn(ctrl->ctrl.device,
-			"queue_size %zu > ctrl maxcmd %u, clamping down\n",
-			opts->queue_size, ctrl->ctrl.maxcmd);
-		opts->queue_size = ctrl->ctrl.maxcmd;
-	}
-
+	/* only warn if argument is too large here, will clamp later */
 	if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
-		/* warn if sqsize is lower than queue_size */
 		dev_warn(ctrl->ctrl.device,
 			"queue_size %zu > ctrl sqsize %u, clamping down\n",
 			opts->queue_size, ctrl->ctrl.sqsize + 1);
-		opts->queue_size = ctrl->ctrl.sqsize + 1;
+	}
+
+	/* warn if maxcmd is lower than sqsize+1 */
+	if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+		dev_warn(ctrl->ctrl.device,
+			"sqsize %u > ctrl maxcmd %u, clamping down\n",
+			ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+		ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
 	}
 
 	if (opts->nr_io_queues) {
-- 
cgit v1.2.3


From d68a90e148f5a82aa67654c5012071e31c0e4baa Mon Sep 17 00:00:00 2001
From: Max Gurtuvoy
Date: Tue, 19 Jun 2018 15:45:33 +0300
Subject: nvmet: reset keep alive timer in controller enable

Controllers that are not yet enabled should not really enforce keep alive
timeouts, but we still want to track a timeout and cleanup in case a host
died before it enabled the controller.  Hence, simply reset the keep
alive timer when the controller is enabled.

Suggested-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index a03da764ecae..74d4b785d2da 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -686,6 +686,14 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 	}
 
 	ctrl->csts = NVME_CSTS_RDY;
+
+	/*
+	 * Controllers that are not yet enabled should not really enforce the
+	 * keep alive timeout, but we still want to track a timeout and cleanup
+	 * in case a host died before it enabled the controller.  Hence, simply
+	 * reset the keep alive timer when the controller is enabled.
+	 */
+	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
 }
 
 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
-- 
cgit v1.2.3


From a1e79188628580465ac6d7a93a313336ee3364f1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 20 Jun 2018 13:45:05 +0300
Subject: blk-mq-debugfs: Off by one in blk_mq_rq_state_name()

If rq_state == ARRAY_SIZE() then we read one element beyond the end of
the blk_mq_rq_state_name_array[] array.

Fixes: ec6dcf63c55c ("blk-mq-debugfs: Show more request state information")
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ffa622366922..1c4532e92938 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -356,7 +356,7 @@ static const char *const blk_mq_rq_state_name_array[] = {
 
 static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
 {
-	if (WARN_ON_ONCE((unsigned int)rq_state >
+	if (WARN_ON_ONCE((unsigned int)rq_state >=
 			 ARRAY_SIZE(blk_mq_rq_state_name_array)))
 		return "(?)";
 	return blk_mq_rq_state_name_array[rq_state];
-- 
cgit v1.2.3


From ce042c183bcb94eb2919e8036473a1fc203420f9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 20 Jun 2018 13:41:51 +0300
Subject: block: sed-opal: Fix a couple off by one bugs

resp->num is the number of tokens in resp->tok[].  It gets set in
response_parse().  So if n == resp->num then we're reading beyond the
end of the data.

Fixes: 455a7b238cd6 ("block: Add Sed-opal library")
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Tested-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 945f4b8610e0..e0de4dd448b3 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -877,7 +877,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
 		return 0;
 	}
 
-	if (n > resp->num) {
+	if (n >= resp->num) {
 		pr_debug("Response has %d tokens. Can't access %d\n",
 			 resp->num, n);
 		return 0;
@@ -916,7 +916,7 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n)
 		return 0;
 	}
 
-	if (n > resp->num) {
+	if (n >= resp->num) {
 		pr_debug("Response has %d tokens. Can't access %d\n",
 			 resp->num, n);
 		return 0;
-- 
cgit v1.2.3


From 08ba91ee6e2c1c08d3f0648f978cbb5dbf3491d8 Mon Sep 17 00:00:00 2001
From: Doron Roberts-Kedes
Date: Fri, 15 Jun 2018 14:05:32 -0700
Subject: nbd: Add the nbd NBD_DISCONNECT_ON_CLOSE config flag.

If NBD_DISCONNECT_ON_CLOSE is set on a device, then the driver will
issue a disconnect from nbd_release if the device has no remaining
bdev->bd_openers.

Fix ret val so reconfigure with only setting the flag succeeds.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c      | 42 ++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/nbd.h |  3 +++
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3b7083b8ecbb..74a05561b620 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -76,6 +76,7 @@ struct link_dead_args {
 #define NBD_HAS_CONFIG_REF		4
 #define NBD_BOUND			5
 #define NBD_DESTROY_ON_DISCONNECT	6
+#define NBD_DISCONNECT_ON_CLOSE 	7
 
 struct nbd_config {
 	u32 flags;
@@ -138,6 +139,7 @@ static void nbd_config_put(struct nbd_device *nbd);
 static void nbd_connect_reply(struct genl_info *info, int index);
 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
 static void nbd_dead_link_work(struct work_struct *work);
+static void nbd_disconnect_and_put(struct nbd_device *nbd);
 
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
@@ -1305,6 +1307,12 @@ out:
 static void nbd_release(struct gendisk *disk, fmode_t mode)
 {
 	struct nbd_device *nbd = disk->private_data;
+	struct block_device *bdev = bdget_disk(disk, 0);
+
+	if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
+			bdev->bd_openers == 0)
+		nbd_disconnect_and_put(nbd);
+
 	nbd_config_put(nbd);
 	nbd_put(nbd);
 }
@@ -1705,6 +1713,10 @@ again:
 				&config->runtime_flags);
 			put_dev = true;
 		}
+		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
+			set_bit(NBD_DISCONNECT_ON_CLOSE,
+				&config->runtime_flags);
+		}
 	}
 
 	if (info->attrs[NBD_ATTR_SOCKETS]) {
@@ -1749,6 +1761,17 @@ out:
 	return ret;
 }
 
+static void nbd_disconnect_and_put(struct nbd_device *nbd)
+{
+	mutex_lock(&nbd->config_lock);
+	nbd_disconnect(nbd);
+	nbd_clear_sock(nbd);
+	mutex_unlock(&nbd->config_lock);
+	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+			       &nbd->config->runtime_flags))
+		nbd_config_put(nbd);
+}
+
 static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nbd_device *nbd;
@@ -1781,13 +1804,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
 		nbd_put(nbd);
 		return 0;
 	}
-	mutex_lock(&nbd->config_lock);
-	nbd_disconnect(nbd);
-	nbd_clear_sock(nbd);
-	mutex_unlock(&nbd->config_lock);
-	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
-			       &nbd->config->runtime_flags))
-		nbd_config_put(nbd);
+	nbd_disconnect_and_put(nbd);
 	nbd_config_put(nbd);
 	nbd_put(nbd);
 	return 0;
@@ -1798,7 +1815,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
 	struct nbd_device *nbd = NULL;
 	struct nbd_config *config;
 	int index;
-	int ret = -EINVAL;
+	int ret = 0;
 	bool put_dev = false;
 
 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
@@ -1838,6 +1855,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
 	    !nbd->task_recv) {
 		dev_err(nbd_to_dev(nbd),
 			"not configured, cannot reconfigure\n");
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -1862,6 +1880,14 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
 					       &config->runtime_flags))
 				refcount_inc(&nbd->refs);
 		}
+
+		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
+			set_bit(NBD_DISCONNECT_ON_CLOSE,
+					&config->runtime_flags);
+		} else {
+			clear_bit(NBD_DISCONNECT_ON_CLOSE,
+					&config->runtime_flags);
+		}
 	}
 
 	if (info->attrs[NBD_ATTR_SOCKETS]) {
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index 85a3fb65e40a..20d6cc91435d 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -53,6 +53,9 @@ enum {
 /* These are client behavior specific flags. */
 #define NBD_CFLAG_DESTROY_ON_DISCONNECT	(1 << 0) /* delete the nbd device on
 						    disconnect. */
+#define NBD_CFLAG_DISCONNECT_ON_CLOSE (1 << 1) /* disconnect the nbd device on
+						*  close by last opener.
+						*/
 
 /* userspace doesn't need the nbd_device structure */
 
-- 
cgit v1.2.3


From 02d62a8bc48e92171c46540722e2d52ce77d87af Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 20 Jun 2018 07:44:12 -0700
Subject: nvme-fc: release io queues to allow fast fail

Rather than leaving io queues quiesced after tearing down an association,
restart them. This allows ios to be replayed, with fastfail ios terminating
and non-fastfail getting into loops of retry.

This follows rdma's lead.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Sagi Grimberg <sagi@grimber.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b528a2f5826c..41d45a1b5c62 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2790,6 +2790,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
 	/* re-enable the admin_q so anything new can fast fail */
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 
+	/* resume the io queues so that things will fast fail */
+	nvme_start_queues(&ctrl->ctrl);
+
 	nvme_fc_ctlr_inactive_on_rport(ctrl);
 }
 
@@ -2804,9 +2807,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
 	 * waiting for io to terminate
 	 */
 	nvme_fc_delete_association(ctrl);
-
-	/* resume the io queues so that things will fast fail */
-	nvme_start_queues(nctrl);
 }
 
 static void
-- 
cgit v1.2.3


From 9f9cafc14016f23f982d3ce18f9057923bd3037a Mon Sep 17 00:00:00 2001
From: Jianchao Wang
Date: Wed, 20 Jun 2018 13:42:22 +0800
Subject: nvme-pci: move nvme_kill_queues to nvme_remove_dead_ctrl

There is race between nvme_remove and nvme_reset_work that can
lead to io hang.

nvme_remove                    nvme_reset_work
                               -> nvme_remove_dead_ctrl
                                 -> nvme_dev_disable
                                   -> quiesce request_queue
                                 -> queue remove_work
-> cancel_work_sync reset_work
-> nvme_remove_namespaces
  -> splice ctrl->namespaces
                               nvme_remove_dead_ctrl_work
                               -> nvme_kill_queues
  -> nvme_ns_remove               do nothing
    -> blk_cleanup_queue
      -> blk_freeze_queue

Finally, the request_queue is quiesced state when wait freeze,
we will get io hang here. To fix it, move the nvme_kill_queues
from nvme_remove_dead_ctrl_work to nvme_remove_dead_ctrl.

Suggested-by: Keith Busch <keith.busch@linux.intel.com>
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fc33804662e7..73a97fcea364 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2289,6 +2289,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 
 	nvme_get_ctrl(&dev->ctrl);
 	nvme_dev_disable(dev, false);
+	nvme_kill_queues(&dev->ctrl);
 	if (!queue_work(nvme_wq, &dev->remove_work))
 		nvme_put_ctrl(&dev->ctrl);
 }
@@ -2405,7 +2406,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-	nvme_kill_queues(&dev->ctrl);
 	if (pci_get_drvdata(pdev))
 		device_release_driver(&pdev->dev);
 	nvme_put_ctrl(&dev->ctrl);
-- 
cgit v1.2.3


From 943e942e6266f22babee5efeb00f8f672fbff5bd Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 21 Jun 2018 09:49:37 -0600
Subject: nvme-pci: limit max IO size and segments to avoid high order
 allocations

nvme requires an sg table allocation for each request. If the request
is large, then the allocation can become quite large. For instance,
with our default software settings of 1280KB IO size, we'll need
10248 bytes of sg table. That turns into a 2nd order allocation,
which we can't always guarantee. If we fail the allocation, blk-mq
will retry it later. But there's no guarantee that we'll EVER be
able to allocate that much contigious memory.

Limit the IO size such that we never need more than a single page
of memory. That's a lot faster and more reliable. Then back that
allocation with a mempool, so that we know we'll always be able
to succeed the allocation at some point.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Acked-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c |  1 +
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 42 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21710a7460c8..46df030b2c3f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1808,6 +1808,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		u32 max_segments =
 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
 
+		max_segments = min_not_zero(max_segments, ctrl->max_segments);
 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 	}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 231807cbc849..0c4a33df3b2f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -170,6 +170,7 @@ struct nvme_ctrl {
 	u64 cap;
 	u32 page_size;
 	u32 max_hw_sectors;
+	u32 max_segments;
 	u16 oncs;
 	u16 oacs;
 	u16 nssa;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 73a97fcea364..ba943f211687 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -38,6 +38,13 @@
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+/*
+ * These can be higher, but we need to ensure that any command doesn't
+ * require an sg allocation that needs more than a page of data.
+ */
+#define NVME_MAX_KB_SZ	4096
+#define NVME_MAX_SEGS	127
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -100,6 +107,8 @@ struct nvme_dev {
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
 
+	mempool_t *iod_mempool;
+
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
 	dma_addr_t dbbuf_dbs_dma_addr;
@@ -477,10 +486,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 	iod->use_sgl = nvme_pci_use_sgls(dev, rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
-		size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
-				iod->use_sgl);
-
-		iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
+		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
 		if (!iod->sg)
 			return BLK_STS_RESOURCE;
 	} else {
@@ -526,7 +532,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 	}
 
 	if (iod->sg != iod->inline_sg)
-		kfree(iod->sg);
+		mempool_free(iod->sg, dev->iod_mempool);
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -2280,6 +2286,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 		blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
 	free_opal_dev(dev->ctrl.opal_dev);
+	mempool_destroy(dev->iod_mempool);
 	kfree(dev);
 }
 
@@ -2334,6 +2341,13 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
+	/*
+	 * Limit the max command size to prevent iod->sg allocations going
+	 * over a single page.
+	 */
+	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
+	dev->ctrl.max_segments = NVME_MAX_SEGS;
+
 	result = nvme_init_identify(&dev->ctrl);
 	if (result)
 		goto out;
@@ -2509,6 +2523,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	int node, result = -ENOMEM;
 	struct nvme_dev *dev;
 	unsigned long quirks = id->driver_data;
+	size_t alloc_size;
 
 	node = dev_to_node(&pdev->dev);
 	if (node == NUMA_NO_NODE)
@@ -2546,6 +2561,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	/*
+	 * Double check that our mempool alloc size will cover the biggest
+	 * command we support.
+	 */
+	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
+						NVME_MAX_SEGS, true);
+	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+
+	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
+						mempool_kfree,
+						(void *) alloc_size,
+						GFP_KERNEL, node);
+	if (!dev->iod_mempool) {
+		result = -ENOMEM;
+		goto release_pools;
+	}
+
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	nvme_get_ctrl(&dev->ctrl);
-- 
cgit v1.2.3


From 0ae52ddf5bd7f685bb43d7687290f6c2eeacfb31 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Fri, 22 Jun 2018 13:05:35 +0200
Subject: lightnvm: Remove depends on HAS_DMA in case of platform dependency

Remove dependencies on HAS_DMA where a Kconfig symbol depends on another
symbol that implies HAS_DMA, and, optionally, on "|| COMPILE_TEST".
In most cases this other symbol is an architecture or platform specific
symbol, or PCI.

Generic symbols and drivers without platform dependencies keep their
dependencies on HAS_DMA, to prevent compiling subsystems or drivers that
cannot work anyway.

This simplifies the dependencies, and allows to improve compile-testing.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 10c08982185a..9c03f35d9df1 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -4,7 +4,7 @@
 
 menuconfig NVM
 	bool "Open-Channel SSD target support"
-	depends on BLOCK && HAS_DMA && PCI
+	depends on BLOCK && PCI
 	select BLK_DEV_NVME
 	help
 	  Say Y here to get to enable Open-channel SSDs.
-- 
cgit v1.2.3


From 3ee7e8697d5860b173132606d80a9cd35e7113ee Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 18 Jun 2018 15:46:58 +0200
Subject: bdi: Fix another oops in wb_workfn()

syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to
wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was
WB_shutting_down after wb->bdi->dev became NULL. This indicates that
unregister_bdi() failed to call wb_shutdown() on one of wb objects.

The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus
drops bdi's reference to wb structures before going through the list of
wbs again and calling wb_shutdown() on each of them. This way the loop
iterating through all wbs can easily miss a wb if that wb has already
passed through cgwb_remove_from_bdi_list() called from wb_shutdown()
from cgwb_release_workfn() and as a result fully shutdown bdi although
wb_workfn() for this wb structure is still running. In fact there are
also other ways cgwb_bdi_unregister() can race with
cgwb_release_workfn() leading e.g. to use-after-free issues:

CPU1                            CPU2
                                cgwb_bdi_unregister()
                                  cgwb_kill(*slot);

cgwb_release()
  queue_work(cgwb_release_wq, &wb->release_work);
cgwb_release_workfn()
                                  wb = list_first_entry(&bdi->wb_list, ...)
                                  spin_unlock_irq(&cgwb_lock);
  wb_shutdown(wb);
  ...
  kfree_rcu(wb, rcu);
                                  wb_shutdown(wb); -> oops use-after-free

We solve these issues by synchronizing writeback structure shutdown from
cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That
way we also no longer need synchronization using WB_shutting_down as the
mutex provides it for CONFIG_CGROUP_WRITEBACK case and without
CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from
bdi_unregister().

Reported-by: syzbot <syzbot+4a7438e774b21ddd8eca@syzkaller.appspotmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/backing-dev-defs.h |  2 +-
 mm/backing-dev.c                 | 20 +++++++-------------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 0bd432a4d7bd..24251762c20c 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -22,7 +22,6 @@ struct dentry;
  */
 enum wb_state {
 	WB_registered,		/* bdi_register() was done */
-	WB_shutting_down,	/* wb_shutdown() in progress */
 	WB_writeback_running,	/* Writeback is in progress */
 	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
@@ -189,6 +188,7 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
+	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 347cc834c04a..2e5d3df0853d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	spin_lock_bh(&wb->work_lock);
 	if (!test_and_clear_bit(WB_registered, &wb->state)) {
 		spin_unlock_bh(&wb->work_lock);
-		/*
-		 * Wait for wb shutdown to finish if someone else is just
-		 * running wb_shutdown(). Otherwise we could proceed to wb /
-		 * bdi destruction before wb_shutdown() is finished.
-		 */
-		wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
 		return;
 	}
-	set_bit(WB_shutting_down, &wb->state);
 	spin_unlock_bh(&wb->work_lock);
 
 	cgwb_remove_from_bdi_list(wb);
@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	flush_delayed_work(&wb->dwork);
 	WARN_ON(!list_empty(&wb->work_list));
-	/*
-	 * Make sure bit gets cleared after shutdown is finished. Matches with
-	 * the barrier provided by test_and_clear_bit() above.
-	 */
-	smp_wmb();
-	clear_and_wake_up_bit(WB_shutting_down, &wb->state);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work)
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
 						release_work);
 
+	mutex_lock(&wb->bdi->cgwb_release_mutex);
 	wb_shutdown(wb);
 
 	css_put(wb->memcg_css);
 	css_put(wb->blkcg_css);
+	mutex_unlock(&wb->bdi->cgwb_release_mutex);
 
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
+	mutex_init(&bdi->cgwb_release_mutex);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 	spin_lock_irq(&cgwb_lock);
 	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
 		cgwb_kill(*slot);
+	spin_unlock_irq(&cgwb_lock);
 
+	mutex_lock(&bdi->cgwb_release_mutex);
+	spin_lock_irq(&cgwb_lock);
 	while (!list_empty(&bdi->wb_list)) {
 		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
 				      bdi_node);
@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 		spin_lock_irq(&cgwb_lock);
 	}
 	spin_unlock_irq(&cgwb_lock);
+	mutex_unlock(&bdi->cgwb_release_mutex);
 }
 
 /**
-- 
cgit v1.2.3


From f5e350f021e04ea41d2e5d58487c33b05ba3d25b Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Fri, 22 Jun 2018 13:18:09 -0700
Subject: blk-mq: Fix timeout handling in case the timeout handler returns
 BLK_EH_DONE

Make sure that RQF_TIMED_OUT is cleared when a request is reused
after a block driver timeout handler has returned BLK_EH_DONE.

Fixes: da6612673988 ("blk-mq: don't time out requests again that are in the timeout handler")
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jianchao Wang <jianchao.w.wang@oracle.com>
Cc: Andrew Randrianasulu <randrianasulu@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c      | 1 -
 block/blk-timeout.c | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8e57b84e50e9..b6888ff556cf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -781,7 +781,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
 	}
 
-	req->rq_flags &= ~RQF_TIMED_OUT;
 	blk_add_timer(req);
 }
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 4b8a48d48ba1..f2cfd56e1606 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -210,6 +210,7 @@ void blk_add_timer(struct request *req)
 	if (!req->timeout)
 		req->timeout = q->rq_timeout;
 
+	req->rq_flags &= ~RQF_TIMED_OUT;
 	blk_rq_set_deadline(req, jiffies + req->timeout);
 
 	/*
-- 
cgit v1.2.3