From 144cba1493fdd6e3e1980e439a31df877831ebcd Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Mon, 27 Apr 2015 11:09:54 +0800
Subject: libceph: allow setting osd_req_op's flags

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ec6c5c6e1ac9..349115ae3bc2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2376,7 +2376,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
 	}
 
 	if (opcode == CEPH_OSD_OP_DELETE)
-		osd_req_op_init(osd_request, num_ops, opcode);
+		osd_req_op_init(osd_request, num_ops, opcode, 0);
 	else
 		osd_req_op_extent_init(osd_request, num_ops, opcode,
 				       offset, length, 0, 0);
@@ -2848,7 +2848,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
 		goto out;
 	stat_request->callback = rbd_img_obj_exists_callback;
 
-	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
+	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
 					false, false);
 	rbd_osd_req_format_read(stat_request);
-- 
cgit v1.2.3


From a319bf56a617354e62cf5f774d2ca4e1a8a3bff3 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Fri, 15 May 2015 12:02:17 +0300
Subject: libceph: store timeouts in jiffies, verify user input

There are currently three libceph-level timeouts that the user can
specify on mount: mount_timeout, osd_idle_ttl and osdkeepalive.  All of
these are in seconds and no checking is done on user input: negative
values are accepted, we multiply them all by HZ which may or may not
overflow, arbitrarily large jiffies then get added together, etc.

There is also a bug in the way mount_timeout=0 is handled.  It's
supposed to mean "infinite timeout", but that's not how wait.h APIs
treat it and so __ceph_open_session() for example will busy loop
without much chance of being interrupted if none of ceph-mons are
there.

Fix all this by verifying user input, storing timeouts capped by
msecs_to_jiffies() in jiffies and using the new ceph_timeout_jiffies()
helper for all user-specified waits to handle infinite timeouts
correctly.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c          |  5 +++--
 fs/ceph/dir.c                |  4 ++--
 fs/ceph/mds_client.c         | 12 ++++++------
 fs/ceph/mds_client.h         |  2 +-
 fs/ceph/super.c              |  2 +-
 include/linux/ceph/libceph.h | 17 +++++++++++------
 net/ceph/ceph_common.c       | 41 +++++++++++++++++++++++++++++++----------
 net/ceph/mon_client.c        | 11 +++++++++--
 net/ceph/osd_client.c        | 15 +++++++--------
 9 files changed, 71 insertions(+), 38 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 349115ae3bc2..992683b6b299 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4963,8 +4963,8 @@ out_err:
  */
 static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 {
+	struct ceph_options *opts = rbdc->client->options;
 	u64 newest_epoch;
-	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
 	int tries = 0;
 	int ret;
 
@@ -4979,7 +4979,8 @@ again:
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
-						     newest_epoch, timeout);
+						     newest_epoch,
+						     opts->mount_timeout);
 			goto again;
 		} else {
 			/* the osdmap we have is new enough */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fea90..173dd4b58c71 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1259,8 +1259,8 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
 		     inode, req->r_tid, last_tid);
 		if (req->r_timeout) {
 			unsigned long time_left = wait_for_completion_timeout(
-							&req->r_safe_completion,
-							req->r_timeout);
+					&req->r_safe_completion,
+					ceph_timeout_jiffies(req->r_timeout));
 			if (time_left > 0)
 				ret = 0;
 			else
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 69a36f40517f..0b0e0a9a81c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2268,7 +2268,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	dout("do_request waiting\n");
 	if (req->r_timeout) {
 		err = (long)wait_for_completion_killable_timeout(
-			&req->r_completion, req->r_timeout);
+					&req->r_completion,
+					ceph_timeout_jiffies(req->r_timeout));
 		if (err == 0)
 			err = -EIO;
 	} else if (req->r_wait_for_completion) {
@@ -3424,8 +3425,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
  */
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_request *req;
-	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -3433,7 +3434,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    fsc->client->options->mount_timeout * HZ);
+				    ceph_timeout_jiffies(opts->mount_timeout));
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -3556,10 +3557,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_fs_client *fsc = mdsc->fsc;
-	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3580,7 +3580,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	dout("waiting for sessions to close\n");
 	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
-			   timeout);
+			   ceph_timeout_jiffies(opts->mount_timeout));
 
 	/* tear down remaining sessions */
 	mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2ef799961ebb..509d6822e9b1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -227,7 +227,7 @@ struct ceph_mds_request {
 	int r_err;
 	bool r_aborted;
 
-	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 	unsigned long r_started;  /* start time to measure timeout against */
 	unsigned long r_request_started; /* start time for mds request only,
 					    used to measure lease durations */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9a5350030af8..edeb83c43112 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -742,7 +742,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = fsc->client->options->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 85ae9a889a3f..d73a569f9bf5 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -43,9 +43,9 @@ struct ceph_options {
 	int flags;
 	struct ceph_fsid fsid;
 	struct ceph_entity_addr my_addr;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int osd_keepalive_timeout;
+	unsigned long mount_timeout;		/* jiffies */
+	unsigned long osd_idle_ttl;		/* jiffies */
+	unsigned long osd_keepalive_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -63,9 +63,9 @@ struct ceph_options {
 /*
  * defaults
  */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
@@ -93,6 +93,11 @@ enum {
 	CEPH_MOUNT_SHUTDOWN,
 };
 
+static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
+{
+	return timeout ?: MAX_SCHEDULE_TIMEOUT;
+}
+
 struct ceph_mds_client;
 
 /*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 79e8f71aef5b..a80e91c2c9a3 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
 	/* start with defaults */
 	opt->flags = CEPH_OPT_DEFAULT;
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
 			pr_warn("ignoring deprecated osdtimeout option\n");
 			break;
 		case Opt_osdkeepalivetimeout:
-			opt->osd_keepalive_timeout = intval;
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("osdkeepalive out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_keepalive_timeout =
+					msecs_to_jiffies(intval * 1000);
 			break;
 		case Opt_osd_idle_ttl:
-			opt->osd_idle_ttl = intval;
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("osd_idle_ttl out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
 			break;
 		case Opt_mount_timeout:
-			opt->mount_timeout = intval;
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("mount_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
 			break;
 
 		case Opt_share:
@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 		seq_puts(m, "notcp_nodelay,");
 
 	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+		seq_printf(m, "mount_timeout=%d,",
+			   jiffies_to_msecs(opt->mount_timeout) / 1000);
 	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+		seq_printf(m, "osd_idle_ttl=%d,",
+			   jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
-			   opt->osd_keepalive_timeout);
+		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
@@ -627,7 +648,7 @@ static int have_mon_and_osd_map(struct ceph_client *client)
 int __ceph_open_session(struct ceph_client *client, unsigned long started)
 {
 	int err;
-	unsigned long timeout = client->options->mount_timeout * HZ;
+	unsigned long timeout = client->options->mount_timeout;
 
 	/* open session, and wait for mon and osd maps */
 	err = ceph_monc_open_session(&client->monc);
@@ -643,7 +664,7 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
 		dout("mount waiting for mon_map\n");
 		err = wait_event_interruptible_timeout(client->auth_wq,
 			have_mon_and_osd_map(client) || (client->auth_err < 0),
-			timeout);
+			ceph_timeout_jiffies(timeout));
 		if (err == -EINTR || err == -ERESTARTSYS)
 			return err;
 		if (client->auth_err < 0)
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 2b3cf05e87b0..0da3bdc116f7 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -298,6 +298,12 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 }
 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 
+/*
+ * Wait for an osdmap with a given epoch.
+ *
+ * @epoch: epoch to wait for
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 			  unsigned long timeout)
 {
@@ -308,11 +314,12 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 	while (monc->have_osdmap < epoch) {
 		mutex_unlock(&monc->mutex);
 
-		if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+		if (timeout && time_after_eq(jiffies, started + timeout))
 			return -ETIMEDOUT;
 
 		ret = wait_event_interruptible_timeout(monc->client->auth_wq,
-					 monc->have_osdmap >= epoch, timeout);
+						monc->have_osdmap >= epoch,
+						ceph_timeout_jiffies(timeout));
 		if (ret < 0)
 			return ret;
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4cb4fab46e4f..50033677c0fa 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1097,7 +1097,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
 	BUG_ON(!list_empty(&osd->o_osd_lru));
 
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
 static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
@@ -1208,7 +1208,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
 {
 	schedule_delayed_work(&osdc->timeout_work,
-			osdc->client->options->osd_keepalive_timeout * HZ);
+			      osdc->client->options->osd_keepalive_timeout);
 }
 
 static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -1576,10 +1576,9 @@ static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_options *opts = osdc->client->options;
 	struct ceph_osd_request *req;
 	struct ceph_osd *osd;
-	unsigned long keepalive =
-		osdc->client->options->osd_keepalive_timeout * HZ;
 	struct list_head slow_osds;
 	dout("timeout\n");
 	down_read(&osdc->map_sem);
@@ -1595,7 +1594,8 @@ static void handle_timeout(struct work_struct *work)
 	 */
 	INIT_LIST_HEAD(&slow_osds);
 	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_stamp + keepalive))
+		if (time_before(jiffies,
+				req->r_stamp + opts->osd_keepalive_timeout))
 			break;
 
 		osd = req->r_osd;
@@ -1622,8 +1622,7 @@ static void handle_osds_timeout(struct work_struct *work)
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client,
 			     osds_timeout_work.work);
-	unsigned long delay =
-		osdc->client->options->osd_idle_ttl * HZ >> 2;
+	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
 
 	dout("osds timeout\n");
 	down_read(&osdc->map_sem);
@@ -2628,7 +2627,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->event_count = 0;
 
 	schedule_delayed_work(&osdc->osds_timeout_work,
-	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
 	err = -ENOMEM;
 	osdc->req_mempool = mempool_create_kmalloc_pool(10,
-- 
cgit v1.2.3


From 2894e1d769743eb583bf8dc6be149f64a7c6a798 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Tue, 12 May 2015 19:53:24 +0300
Subject: rbd: timeout watch teardown on unmap with mount_timeout

As part of unmap sequence, kernel client has to talk to the OSDs to
teardown watch on the header object.  If none of the OSDs are available
it would hang forever, until interrupted by a signal - when that
happens we follow through with the rest of unmap procedure (i.e.
unregister the device and put all the data structures) and the unmap is
still considired successful (rbd cli tool exits with 0).  The watch on
the userspace side should eventually timeout so that's fine.

This isn't very nice, because various userspace tools (pacemaker rbd
resource agent, for example) then have to worry about setting up their
own timeouts.  Timeout it with mount_timeout (60 seconds by default).

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 drivers/block/rbd.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 992683b6b299..89fe8a4bc02e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1563,22 +1563,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
 /*
  * Wait for an object request to complete.  If interrupted, cancel the
  * underlying osd request.
+ *
+ * @timeout: in jiffies, 0 means "wait forever"
  */
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
+				  unsigned long timeout)
 {
-	int ret;
+	long ret;
 
 	dout("%s %p\n", __func__, obj_request);
-
-	ret = wait_for_completion_interruptible(&obj_request->completion);
-	if (ret < 0) {
-		dout("%s %p interrupted\n", __func__, obj_request);
+	ret = wait_for_completion_interruptible_timeout(
+					&obj_request->completion,
+					ceph_timeout_jiffies(timeout));
+	if (ret <= 0) {
+		if (ret == 0)
+			ret = -ETIMEDOUT;
 		rbd_obj_request_end(obj_request);
-		return ret;
+	} else {
+		ret = 0;
 	}
 
-	dout("%s %p done\n", __func__, obj_request);
-	return 0;
+	dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
+	return ret;
+}
+
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+	return __rbd_obj_request_wait(obj_request, 0);
+}
+
+static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
+					unsigned long timeout)
+{
+	return __rbd_obj_request_wait(obj_request, timeout);
 }
 
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
@@ -3122,6 +3139,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
 						bool watch)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_options *opts = osdc->client->options;
 	struct rbd_obj_request *obj_request;
 	int ret;
 
@@ -3148,7 +3166,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
 	if (ret)
 		goto out;
 
-	ret = rbd_obj_request_wait(obj_request);
+	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
 	if (ret)
 		goto out;
 
-- 
cgit v1.2.3


From d3834fefcfe5610702379d78596337875df2db5b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Fri, 12 Jun 2015 19:19:02 +0300
Subject: rbd: bump queue_max_segments

The default queue_limits::max_segments value (BLK_MAX_SEGMENTS = 128)
unnecessarily limits bio sizes to 512k (assuming 4k pages).  rbd, being
a virtual block device, doesn't have any restrictions on the number of
physical segments, so bump max_segments to max_hw_sectors, in theory
allowing a sector per segment (although the only case this matters that
I can think of is some readv/writev style thing).  In practice this is
going to give us 1M bios - the number of segments in a bio is limited
in bio_get_nr_vecs() by BIO_MAX_PAGES = 256.

Note that this doesn't result in any improvement on a typical direct
sequential test.  This is because on a box with a not too badly
fragmented memory the default BLK_MAX_SEGMENTS is enough to see nice
rbd object size sized requests.  The only difference is the size of
bios being merged - 512k vs 1M for something like

    $ dd if=/dev/zero of=/dev/rbd0 oflag=direct bs=$RBD_OBJ_SIZE
    $ dd if=/dev/rbd0 iflag=direct of=/dev/null bs=$RBD_OBJ_SIZE

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89fe8a4bc02e..bc88fbcb9715 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3791,6 +3791,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	/* set io sizes to object size */
 	segment_size = rbd_obj_bytes(&rbd_dev->header);
 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+	blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
 	blk_queue_max_segment_size(q, segment_size);
 	blk_queue_io_min(q, segment_size);
 	blk_queue_io_opt(q, segment_size);
-- 
cgit v1.2.3


From 210c104c544e5be321fc5b78e74b596d36820332 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Mon, 22 Jun 2015 13:24:48 +0300
Subject: rbd: terminate rbd_opts_tokens with Opt_err

Also nuke useless Opt_last_bool and don't break lines unnecessarily.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bc88fbcb9715..4de8c9167c4b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -724,7 +724,7 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 }
 
 /*
- * mount options
+ * (Per device) rbd map options
  */
 enum {
 	Opt_last_int,
@@ -733,8 +733,7 @@ enum {
 	/* string args above */
 	Opt_read_only,
 	Opt_read_write,
-	/* Boolean args above */
-	Opt_last_bool,
+	Opt_err
 };
 
 static match_table_t rbd_opts_tokens = {
@@ -744,8 +743,7 @@ static match_table_t rbd_opts_tokens = {
 	{Opt_read_only, "ro"},		/* Alternate spelling */
 	{Opt_read_write, "read_write"},
 	{Opt_read_write, "rw"},		/* Alternate spelling */
-	/* Boolean args above */
-	{-1, NULL}
+	{Opt_err, NULL}
 };
 
 struct rbd_options {
@@ -761,22 +759,15 @@ static int parse_rbd_opts_token(char *c, void *private)
 	int token, intval, ret;
 
 	token = match_token(c, rbd_opts_tokens, argstr);
-	if (token < 0)
-		return -EINVAL;
-
 	if (token < Opt_last_int) {
 		ret = match_int(&argstr[0], &intval);
 		if (ret < 0) {
-			pr_err("bad mount option arg (not int) "
-			       "at '%s'\n", c);
+			pr_err("bad mount option arg (not int) at '%s'\n", c);
 			return ret;
 		}
 		dout("got int token %d val %d\n", token, intval);
 	} else if (token > Opt_last_int && token < Opt_last_string) {
-		dout("got string token %d val %s\n", token,
-		     argstr[0].from);
-	} else if (token > Opt_last_string && token < Opt_last_bool) {
-		dout("got Boolean token %d\n", token);
+		dout("got string token %d val %s\n", token, argstr[0].from);
 	} else {
 		dout("got token %d\n", token);
 	}
@@ -789,9 +780,10 @@ static int parse_rbd_opts_token(char *c, void *private)
 		rbd_opts->read_only = false;
 		break;
 	default:
-		rbd_assert(false);
-		break;
+		/* libceph prints "bad option" msg */
+		return -EINVAL;
 	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From d147543d7943eaa549a569143b7815482585fb91 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Mon, 22 Jun 2015 13:24:48 +0300
Subject: rbd: store rbd_options in rbd_device

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4de8c9167c4b..e502bce02d2c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -346,6 +346,7 @@ struct rbd_device {
 	struct rbd_image_header	header;
 	unsigned long		flags;		/* possibly lock protected */
 	struct rbd_spec		*spec;
+	struct rbd_options	*opts;
 
 	char			*header_name;
 
@@ -4055,7 +4056,8 @@ static void rbd_spec_free(struct kref *kref)
 }
 
 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
-				struct rbd_spec *spec)
+					 struct rbd_spec *spec,
+					 struct rbd_options *opts)
 {
 	struct rbd_device *rbd_dev;
 
@@ -4069,8 +4071,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	INIT_LIST_HEAD(&rbd_dev->node);
 	init_rwsem(&rbd_dev->header_rwsem);
 
-	rbd_dev->spec = spec;
 	rbd_dev->rbd_client = rbdc;
+	rbd_dev->spec = spec;
+	rbd_dev->opts = opts;
 
 	/* Initialize the layout used for all rbd requests */
 
@@ -4086,6 +4089,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev)
 {
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
+	kfree(rbd_dev->opts);
 	kfree(rbd_dev);
 }
 
@@ -5160,7 +5164,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
 
 	ret = -ENOMEM;
-	parent = rbd_dev_create(rbdc, parent_spec);
+	parent = rbd_dev_create(rbdc, parent_spec, NULL);
 	if (!parent)
 		goto out_err;
 
@@ -5406,9 +5410,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
 	if (rc < 0)
 		goto err_out_module;
-	read_only = rbd_opts->read_only;
-	kfree(rbd_opts);
-	rbd_opts = NULL;	/* done with this */
 
 	rbdc = rbd_get_client(ceph_opts);
 	if (IS_ERR(rbdc)) {
@@ -5434,11 +5435,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 		goto err_out_client;
 	}
 
-	rbd_dev = rbd_dev_create(rbdc, spec);
+	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
 	if (!rbd_dev)
 		goto err_out_client;
 	rbdc = NULL;		/* rbd_dev now owns this */
 	spec = NULL;		/* rbd_dev now owns this */
+	rbd_opts = NULL;	/* rbd_dev now owns this */
 
 	rc = rbd_dev_image_probe(rbd_dev, true);
 	if (rc < 0)
@@ -5446,6 +5448,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 
 	/* If we are mapping a snapshot it must be marked read-only */
 
+	read_only = rbd_dev->opts->read_only;
 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
 		read_only = true;
 	rbd_dev->mapping.read_only = read_only;
@@ -5470,6 +5473,7 @@ err_out_client:
 	rbd_put_client(rbdc);
 err_out_args:
 	rbd_spec_put(spec);
+	kfree(rbd_opts);
 err_out_module:
 	module_put(THIS_MODULE);
 
-- 
cgit v1.2.3


From b55841807fb864eccca0167650a65722fd7cd553 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Tue, 23 Jun 2015 16:21:19 +0300
Subject: rbd: queue_depth map option

nr_requests (/sys/block/rbd<id>/queue/nr_requests) is pretty much
irrelevant in blk-mq case because each driver sets its own max depth
that it can handle and that's the number of tags that gets preallocated
on setup.  Users can't increase queue depth beyond that value via
writing to nr_requests.

For rbd we are happy with the default BLKDEV_MAX_RQ (128) for most
cases but we want to give users the opportunity to increase it.
Introduce a new per-device queue_depth option to do just that:

    $ sudo rbd map -o queue_depth=1024 ...

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e502bce02d2c..b316ee48a30b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -728,6 +728,7 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
  * (Per device) rbd map options
  */
 enum {
+	Opt_queue_depth,
 	Opt_last_int,
 	/* int args above */
 	Opt_last_string,
@@ -738,6 +739,7 @@ enum {
 };
 
 static match_table_t rbd_opts_tokens = {
+	{Opt_queue_depth, "queue_depth=%d"},
 	/* int args above */
 	/* string args above */
 	{Opt_read_only, "read_only"},
@@ -748,9 +750,11 @@ static match_table_t rbd_opts_tokens = {
 };
 
 struct rbd_options {
+	int	queue_depth;
 	bool	read_only;
 };
 
+#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
 #define RBD_READ_ONLY_DEFAULT	false
 
 static int parse_rbd_opts_token(char *c, void *private)
@@ -774,6 +778,13 @@ static int parse_rbd_opts_token(char *c, void *private)
 	}
 
 	switch (token) {
+	case Opt_queue_depth:
+		if (intval < 1) {
+			pr_err("queue_depth out of range\n");
+			return -EINVAL;
+		}
+		rbd_opts->queue_depth = intval;
+		break;
 	case Opt_read_only:
 		rbd_opts->read_only = true;
 		break;
@@ -3761,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 
 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
 	rbd_dev->tag_set.ops = &rbd_mq_ops;
-	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-	rbd_dev->tag_set.flags =
-		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	rbd_dev->tag_set.nr_hw_queues = 1;
 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
 
@@ -4948,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf,
 		goto out_mem;
 
 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
 
 	copts = ceph_parse_options(options, mon_addrs,
 					mon_addrs + mon_addrs_size - 1,
-- 
cgit v1.2.3


From 5a60e87603c4c533492c515b7f62578189b03c9c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 24 Jun 2015 17:24:33 +0300
Subject: rbd: use GFP_NOIO in rbd_obj_request_create()

rbd_obj_request_create() is called on the main I/O path, so we need to
use GFP_NOIO to make sure allocation doesn't blow back on us.  Not all
callers need this, but I'm still hardcoding the flag inside rather than
making it a parameter because a) this is going to stable, and b) those
callers shouldn't really use rbd_obj_request_create() and will be fixed
in the future.

More memory allocation fixes will follow.

Cc: stable@vger.kernel.org # 3.10+
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b316ee48a30b..d94529d5c8e9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2022,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
 	rbd_assert(obj_request_type_valid(type));
 
 	size = strlen(object_name) + 1;
-	name = kmalloc(size, GFP_KERNEL);
+	name = kmalloc(size, GFP_NOIO);
 	if (!name)
 		return NULL;
 
-	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
+	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
 	if (!obj_request) {
 		kfree(name);
 		return NULL;
-- 
cgit v1.2.3