From 9a11b4ed0e7c44bca7c939aa544c3c47aae40c12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 May 2008 09:32:08 +0200 Subject: cfq-iosched: properly protect ioc_gone and ioc count If we have multiple tasks freeing cfq_io_contexts when cfq-iosched is being unloaded, we could complete() ioc_gone twice. Fix that by protecting ioc_gone complete() and clearing with a spinlock for just that purpose. Doesn't matter from a performance perspective, since it'll only enter that path when ioc_gone != NULL (when cfq-iosched is being rmmod'ed). Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d01b411c72f0..32aa3674f8a3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,6 +48,7 @@ static struct kmem_cache *cfq_ioc_pool; static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; +static DEFINE_SPINLOCK(ioc_gone_lock); #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) @@ -1177,8 +1178,19 @@ static void cfq_cic_free_rcu(struct rcu_head *head) kmem_cache_free(cfq_ioc_pool, cic); elv_ioc_count_dec(ioc_count); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) - complete(ioc_gone); + if (ioc_gone) { + /* + * CFQ scheduler is exiting, grab exit lock and check + * the pending io context count. If it hits zero, + * complete ioc_gone and set it back to NULL + */ + spin_lock(&ioc_gone_lock); + if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + complete(ioc_gone); + ioc_gone = NULL; + } + spin_unlock(&ioc_gone_lock); + } } static void cfq_cic_free(struct cfq_io_context *cic) @@ -2317,7 +2329,7 @@ static void __exit cfq_exit(void) * pending RCU callbacks */ if (elv_ioc_count_read(ioc_count)) - wait_for_completion(ioc_gone); + wait_for_completion(&all_gone); cfq_slab_kill(); } -- cgit v1.2.3 From 863fddcb4b0caee4c2d5bd6e3b28779920516db3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 May 2008 09:35:22 +0200 Subject: as-iosched: properly protect ioc_gone and ioc count If we have multiple tasks freeing io contexts when as-iosched is being unloaded, we could complete() ioc_gone twice. Fix that by protecting ioc_gone complete() and clearing with a spinlock for just that purpose. Doesn't matter from a performance perspective, since it'll only enter that path when ioc_gone != NULL (when as-iosched is being rmmod'ed). Signed-off-by: Jens Axboe --- block/as-iosched.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/as-iosched.c b/block/as-iosched.c index 743f33a01a07..9735acb5b4f5 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -151,6 +151,7 @@ enum arq_state { static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; +static DEFINE_SPINLOCK(ioc_gone_lock); static void as_move_to_dispatch(struct as_data *ad, struct request *rq); static void as_antic_stop(struct as_data *ad); @@ -164,8 +165,19 @@ static void free_as_io_context(struct as_io_context *aic) { kfree(aic); elv_ioc_count_dec(ioc_count); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) - complete(ioc_gone); + if (ioc_gone) { + /* + * AS scheduler is exiting, grab exit lock and check + * the pending io context count. If it hits zero, + * complete ioc_gone and set it back to NULL. + */ + spin_lock(&ioc_gone_lock); + if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + complete(ioc_gone); + ioc_gone = NULL; + } + spin_unlock(&ioc_gone_lock); + } } static void as_trim(struct io_context *ioc) @@ -1493,7 +1505,7 @@ static void __exit as_exit(void) /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); if (elv_ioc_count_read(ioc_count)) - wait_for_completion(ioc_gone); + wait_for_completion(&all_gone); synchronize_rcu(); } -- cgit v1.2.3 From 7b679138b3237a9a3d45a4fda23a58ac79cd279c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 May 2008 12:23:07 +0200 Subject: cfq-iosched: add message logging through blktrace Now that blktrace has the ability to carry arbitrary messages in its stream, use that for some CFQ logging. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 65 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 32aa3674f8a3..0ebb626a25d3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -11,6 +11,7 @@ #include #include #include +#include /* * tunables @@ -41,7 +42,7 @@ static int cfq_slice_idle = HZ / 125; #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) ((rq)->elevator_private2) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -156,6 +157,7 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; + pid_t pid; }; enum cfqq_state_flags { @@ -199,6 +201,11 @@ CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); #undef CFQ_CFQQ_FNS +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log(cfqd, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, struct io_context *, gfp_t); @@ -235,8 +242,10 @@ static inline int cfq_bio_sync(struct bio *bio) */ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { - if (cfqd->busy_queues) + if (cfqd->busy_queues) { + cfq_log(cfqd, "schedule dispatch"); kblockd_schedule_work(&cfqd->unplug_work); + } } static int cfq_queue_empty(struct request_queue *q) @@ -271,6 +280,7 @@ static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } /* @@ -540,6 +550,7 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; @@ -553,6 +564,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); @@ -639,6 +651,8 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; cfqd->rq_in_driver++; + cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", + cfqd->rq_in_driver); /* * If the depth is larger 1, it really could be queueing. But lets @@ -658,6 +672,8 @@ static void cfq_deactivate_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqd->rq_in_driver); cfqd->rq_in_driver--; + cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", + cfqd->rq_in_driver); } static void cfq_remove_request(struct request *rq) @@ -747,6 +763,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "set_active"); cfqq->slice_end = 0; cfq_clear_cfqq_must_alloc_slice(cfqq); cfq_clear_cfqq_fifo_expire(cfqq); @@ -764,6 +781,8 @@ static void __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, int timed_out) { + cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); + if (cfq_cfqq_wait_request(cfqq)) del_timer(&cfqd->idle_slice_timer); @@ -773,8 +792,10 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * store what was left of this slice, if the queue idled/timed out */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) + if (timed_out && !cfq_cfqq_slice_new(cfqq)) { cfqq->slice_resid = cfqq->slice_end - jiffies; + cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); + } cfq_resort_rr_list(cfqd, cfqq); @@ -866,6 +887,12 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) return; + /* + * still requests with the driver, don't idle + */ + if (cfqd->rq_in_driver) + return; + /* * task has exited, don't wait */ @@ -893,6 +920,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); + cfq_log(cfqd, "arm_idle: %lu", sl); } /* @@ -903,6 +931,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); + cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfq_remove_request(rq); cfqq->dispatched++; elv_dispatch_sort(q, rq); @@ -932,8 +962,9 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) rq = rq_entry_fifo(cfqq->fifo.next); if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) - return NULL; + rq = NULL; + cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -1073,6 +1104,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) BUG_ON(cfqd->busy_queues); + cfq_log(cfqd, "forced_dispatch=%d\n", dispatched); return dispatched; } @@ -1113,6 +1145,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); } + cfq_log(cfqd, "dispatched=%d", dispatched); return dispatched; } @@ -1131,6 +1164,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) if (!atomic_dec_and_test(&cfqq->ref)) return; + cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); BUG_ON(cfq_cfqq_on_rr(cfqq)); @@ -1439,6 +1473,8 @@ retry: cfq_mark_cfqq_idle_window(cfqq); cfq_mark_cfqq_sync(cfqq); } + cfqq->pid = current->pid; + cfq_log_cfqq(cfqd, cfqq, "alloced"); } if (new_cfqq) @@ -1687,7 +1723,7 @@ static void cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic) { - int enable_idle; + int old_idle, enable_idle; /* * Don't idle for async or idle io prio class @@ -1695,7 +1731,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; - enable_idle = cfq_cfqq_idle_window(cfqq); + old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) @@ -1707,10 +1743,13 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = 1; } - if (enable_idle) - cfq_mark_cfqq_idle_window(cfqq); - else - cfq_clear_cfqq_idle_window(cfqq); + if (old_idle != enable_idle) { + cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); + if (enable_idle) + cfq_mark_cfqq_idle_window(cfqq); + else + cfq_clear_cfqq_idle_window(cfqq); + } } /* @@ -1769,6 +1808,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfq_log_cfqq(cfqd, cfqq, "preempt"); cfq_slice_expired(cfqd, 1); /* @@ -1830,6 +1870,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); + cfq_log_cfqq(cfqd, cfqq, "insert_request"); cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); cfq_add_rq_rb(rq); @@ -1847,6 +1888,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) unsigned long now; now = jiffies; + cfq_log_cfqq(cfqd, cfqq, "complete"); WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); @@ -2016,6 +2058,7 @@ queue_fail: cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); + cfq_log(cfqd, "set_request fail"); return 1; } @@ -2041,6 +2084,8 @@ static void cfq_idle_slice_timer(unsigned long data) unsigned long flags; int timed_out = 1; + cfq_log(cfqd, "idle timer fired"); + spin_lock_irqsave(cfqd->queue->queue_lock, flags); cfqq = cfqd->active_queue; -- cgit v1.2.3 From 02c62304e6af60f1963695c6bc1bbffe619aa585 Mon Sep 17 00:00:00 2001 From: Alan D. Brunelle Date: Wed, 11 Jun 2008 09:12:52 +0200 Subject: Added in user-injected messages into blk traces This allows a user to annotate the blk trace stream: writing a suitable message to {/sys/kernel/debug}/block//msg will have it propagated into the trace stream. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/blktrace_api.h | 1 + 2 files changed, 46 insertions(+) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 8d3a27780260..eb9651ccb241 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -244,6 +244,7 @@ err: static void blk_trace_cleanup(struct blk_trace *bt) { relay_close(bt->rchan); + debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); @@ -291,6 +292,44 @@ static const struct file_operations blk_dropped_fops = { .read = blk_dropped_read, }; +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + /* * Keep track of how many times we encountered a full subbuffer, to aid * the user space app in telling how many lost events there were. @@ -380,6 +419,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->dropped_file) goto err; + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) @@ -409,6 +452,8 @@ err: if (dir) blk_remove_tree(dir); if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e3ef903aae88..d084b8d227a5 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -129,6 +129,7 @@ struct blk_trace { u32 dev; struct dentry *dir; struct dentry *dropped_file; + struct dentry *msg_file; atomic_t dropped; }; -- cgit v1.2.3 From 1c9ce5276324ae566ca409491b99a2cc8d5986fa Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 13 Jun 2008 09:41:00 +0200 Subject: block: export "ro" attribute Signed-off-by: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b922d4801c87..43e468ee5993 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -400,6 +400,14 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->policy ? 1 : 0); +} + static ssize_t disk_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -472,6 +480,7 @@ static ssize_t disk_fail_store(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL); @@ -483,6 +492,7 @@ static struct device_attribute dev_attr_fail = static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_removable.attr, + &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, -- cgit v1.2.3 From 7ba1ba12eeef0aa7113beb16410ef8b7c748e18b Mon Sep 17 00:00:00 2001 From: Martin K. Petersen Date: Mon, 30 Jun 2008 20:04:41 +0200 Subject: block: Block layer data integrity support Some block devices support verifying the integrity of requests by way of checksums or other protection information that is submitted along with the I/O. This patch implements support for generating and verifying integrity metadata, as well as correctly merging, splitting and cloning bios and requests that have this extra information attached. See Documentation/block/data-integrity.txt for more information. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 7 + block/blk-integrity.c | 382 ++++++++++++++++++++++++++ block/blk-merge.c | 3 + block/blk.h | 8 + block/elevator.c | 6 + fs/Makefile | 1 + fs/bio-integrity.c | 708 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/bio.c | 32 ++- include/linux/bio.h | 94 ++++++- include/linux/blkdev.h | 105 ++++++++ include/linux/genhd.h | 3 + 13 files changed, 1355 insertions(+), 7 deletions(-) create mode 100644 block/blk-integrity.c create mode 100644 fs/bio-integrity.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 3e97f2bc446f..1ab7c15c8d7a 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -81,6 +81,18 @@ config BLK_DEV_BSG If unsure, say N. +config BLK_DEV_INTEGRITY + bool "Block layer data integrity support" + ---help--- + Some storage devices allow extra information to be + stored/retrieved to help protect the data. The block layer + data integrity option provides hooks which can be used by + filesystems to ensure better data integrity. + + Say yes here if you have a storage device that provides the + T10/SCSI Data Integrity Field or the T13/ATA External Path + Protection. If in doubt, say N. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 5a43c7d79594..045f7b62e4bb 100644 --- a/block/Makefile +++ b/block/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blk-core.c b/block/blk-core.c index 1905aaba49fb..e0fb0bcc0c17 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -143,6 +143,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio->bi_size -= nbytes; bio->bi_sector += (nbytes >> 9); + + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); + if (bio->bi_size == 0) bio_endio(bio, error); } else { @@ -1381,6 +1385,9 @@ end_io: */ blk_partition_remap(bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; + if (old_sector != -1) blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, old_sector); diff --git a/block/blk-integrity.c b/block/blk-integrity.c new file mode 100644 index 000000000000..65f23ef38bbe --- /dev/null +++ b/block/blk-integrity.c @@ -0,0 +1,382 @@ +/* + * blk-integrity.c - Block layer data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +#include "blk.h" + +static struct kmem_cache *integrity_cachep; + +/** + * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements + * @rq: request with integrity metadata attached + * + * Description: Returns the number of elements required in a + * scatterlist corresponding to the integrity metadata in a request. + */ +int blk_rq_count_integrity_sg(struct request *rq) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + unsigned int segments; + + ivprv = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + segments++; + + ivprv = iv; + } + + return segments; +} +EXPORT_SYMBOL(blk_rq_count_integrity_sg); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request with integrity metadata attached + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg(). + */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + struct scatterlist *sg; + unsigned int segments; + + ivprv = NULL; + sg = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + sg->length += iv->bv_len; + } else { +new_segment: + if (!sg) + sg = sglist; + else { + sg->page_link &= ~0x02; + sg = sg_next(sg); + } + + sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + segments++; + } + + ivprv = iv; + } + + if (sg) + sg_mark_end(sg); + + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); + +/** + * blk_integrity_compare - Compare integrity profile of two block devices + * @b1: Device to compare + * @b2: Device to compare + * + * Description: Meta-devices like DM and MD need to verify that all + * sub-devices use the same integrity format before advertising to + * upper layers that they can send/receive integrity metadata. This + * function can be used to check whether two block devices have + * compatible integrity formats. + */ +int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2) +{ + struct blk_integrity *b1 = bd1->bd_disk->integrity; + struct blk_integrity *b2 = bd2->bd_disk->integrity; + + BUG_ON(bd1->bd_disk == NULL); + BUG_ON(bd2->bd_disk == NULL); + + if (!b1 || !b2) + return 0; + + if (b1->sector_size != b2->sector_size) { + printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->sector_size, b2->sector_size); + return -1; + } + + if (b1->tuple_size != b2->tuple_size) { + printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tuple_size, b2->tuple_size); + return -1; + } + + if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { + printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tag_size, b2->tag_size); + return -1; + } + + if (strcmp(b1->name, b2->name)) { + printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->name, b2->name); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(blk_integrity_compare); + +struct integrity_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_integrity *, char *); + ssize_t (*store)(struct blk_integrity *, const char *, size_t); +}; + +static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + + return entry->show(bi, page); +} + +static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + ssize_t ret = 0; + + if (entry->store) + ret = entry->store(bi, page, count); + + return ret; +} + +static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL && bi->name != NULL) + return sprintf(page, "%s\n", bi->name); + else + return sprintf(page, "none\n"); +} + +static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL) + return sprintf(page, "%u\n", bi->tag_size); + else + return sprintf(page, "0\n"); +} + +static ssize_t integrity_read_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + + return count; +} + +static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); +} + +static ssize_t integrity_write_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + + return count; +} + +static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0); +} + +static struct integrity_sysfs_entry integrity_format_entry = { + .attr = { .name = "format", .mode = S_IRUGO }, + .show = integrity_format_show, +}; + +static struct integrity_sysfs_entry integrity_tag_size_entry = { + .attr = { .name = "tag_size", .mode = S_IRUGO }, + .show = integrity_tag_size_show, +}; + +static struct integrity_sysfs_entry integrity_read_entry = { + .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_read_show, + .store = integrity_read_store, +}; + +static struct integrity_sysfs_entry integrity_write_entry = { + .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_write_show, + .store = integrity_write_store, +}; + +static struct attribute *integrity_attrs[] = { + &integrity_format_entry.attr, + &integrity_tag_size_entry.attr, + &integrity_read_entry.attr, + &integrity_write_entry.attr, + NULL, +}; + +static struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, +}; + +static int __init blk_dev_integrity_init(void) +{ + integrity_cachep = kmem_cache_create("blkdev_integrity", + sizeof(struct blk_integrity), + 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_dev_integrity_init); + +static void blk_integrity_release(struct kobject *kobj) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + + kmem_cache_free(integrity_cachep, bi); +} + +static struct kobj_type integrity_ktype = { + .default_attrs = integrity_attrs, + .sysfs_ops = &integrity_ops, + .release = blk_integrity_release, +}; + +/** + * blk_integrity_register - Register a gendisk as being integrity-capable + * @disk: struct gendisk pointer to make integrity-aware + * @template: integrity profile + * + * Description: When a device needs to advertise itself as being able + * to send/receive integrity metadata it must use this function to + * register the capability with the block layer. The template is a + * blk_integrity struct with values appropriate for the underlying + * hardware. See Documentation/block/data-integrity.txt. + */ +int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +{ + struct blk_integrity *bi; + + BUG_ON(disk == NULL); + BUG_ON(template == NULL); + + if (disk->integrity == NULL) { + bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + if (!bi) + return -1; + + if (kobject_init_and_add(&bi->kobj, &integrity_ktype, + &disk->dev.kobj, "%s", "integrity")) { + kmem_cache_free(integrity_cachep, bi); + return -1; + } + + kobject_uevent(&bi->kobj, KOBJ_ADD); + + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->sector_size = disk->queue->hardsect_size; + disk->integrity = bi; + } else + bi = disk->integrity; + + /* Use the provided profile as template */ + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_register); + +/** + * blk_integrity_unregister - Remove block integrity profile + * @disk: disk whose integrity profile to deallocate + * + * Description: This function frees all memory used by the block + * integrity profile. To be called at device teardown. + */ +void blk_integrity_unregister(struct gendisk *disk) +{ + struct blk_integrity *bi; + + if (!disk || !disk->integrity) + return; + + bi = disk->integrity; + + kobject_uevent(&bi->kobj, KOBJ_REMOVE); + kobject_del(&bi->kobj); + kobject_put(&disk->dev.kobj); + kmem_cache_free(integrity_cachep, bi); +} +EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-merge.c b/block/blk-merge.c index 651136aae76e..5efc9e7a68b7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -441,6 +441,9 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return 0; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn diff --git a/block/blk.h b/block/blk.h index 59776ab4742a..c79f30e1df52 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,4 +51,12 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) + +#endif /* BLK_DEV_INTEGRITY */ + #endif diff --git a/block/elevator.c b/block/elevator.c index 902dd1344d56..1f5bfe696026 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -86,6 +86,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) return 0; + /* + * only merge integrity protected bio into ditto rq + */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da1..277b079dec9e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 000000000000..31b08878913d --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,708 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. + */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} +EXPORT_SYMBOL(bio_integrity_init_slab); + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); diff --git a/fs/bio.c b/fs/bio.c index 7a6598abc967..7761c84c7032 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -50,6 +50,11 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; @@ -91,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if (bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -249,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -1223,6 +1241,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_private = bi; bp->bio2.bi_private = pool; + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + return bp; } @@ -1264,6 +1285,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1280,6 +1302,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1306,6 +1331,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); diff --git a/include/linux/bio.h b/include/linux/bio.h index 49dfb3cb7460..6bfc3e8d9d89 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,6 +64,7 @@ struct bio_vec { struct bio_set; struct bio; +struct bio_integrity_payload; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -112,6 +113,9 @@ struct bio { atomic_t bi_cnt; /* pin count */ void *bi_private; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; /* data integrity */ +#endif bio_destructor_t *bi_destructor; /* destructor */ }; @@ -271,6 +275,29 @@ static inline void *bio_data(struct bio *bio) */ #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +#if defined(CONFIG_BLK_DEV_INTEGRITY) +/* + * bio integrity payload + */ +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + struct bio_vec *bip_vec; /* integrity data vector */ + + sector_t bip_sector; /* virtual start sector */ + + void *bip_buf; /* generated integrity data */ + bio_end_io_t *bip_end_io; /* saved I/O completion fn */ + + int bip_error; /* saved I/O error */ + unsigned int bip_size; + + unsigned short bip_pool; /* pool the ivec came from */ + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_idx; /* current bip_vec index */ + + struct work_struct bip_work; /* I/O completion */ +}; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * A bio_pair is used when we need to split a bio. @@ -283,10 +310,14 @@ static inline void *bio_data(struct bio *bio) * in bio2.bi_private */ struct bio_pair { - struct bio bio1, bio2; - struct bio_vec bv1, bv2; - atomic_t cnt; - int error; + struct bio bio1, bio2; + struct bio_vec bv1, bv2; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload bip1, bip2; + struct bio_vec iv1, iv2; +#endif + atomic_t cnt; + int error; }; extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors); @@ -334,6 +365,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern unsigned int bvec_nr_vecs(unsigned short idx); /* * bio_set is used to allow other portions of the IO system to @@ -346,6 +378,9 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set struct bio_set { mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif mempool_t *bvec_pools[BIOVEC_NR_POOLS]; }; @@ -410,5 +445,56 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) +#define bip_vec(bip) bip_vec_idx(bip, 0) + +#define __bip_for_each_vec(bvl, bip, i, start_idx) \ + for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ + i < (bip)->bip_vcnt; \ + bvl++, i++) + +#define bip_for_each_vec(bvl, bip, i) \ + __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) + +#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) + +extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); +extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); +extern void bio_integrity_free(struct bio *, struct bio_set *); +extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); +extern int bio_integrity_enabled(struct bio *bio); +extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_prep(struct bio *); +extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_advance(struct bio *, unsigned int); +extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); +extern void bio_integrity_split(struct bio *, struct bio_pair *, int); +extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bioset_integrity_create(struct bio_set *, int); +extern void bioset_integrity_free(struct bio_set *); +extern void bio_integrity_init_slab(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define bio_integrity(a) (0) +#define bioset_integrity_create(a, b) (0) +#define bio_integrity_prep(a) (0) +#define bio_integrity_enabled(a) (0) +#define bio_integrity_clone(a, b, c) (0) +#define bioset_integrity_free(a) do { } while (0) +#define bio_integrity_free(a, b) do { } while (0) +#define bio_integrity_endio(a, b) do { } while (0) +#define bio_integrity_advance(a, b) do { } while (0) +#define bio_integrity_trim(a, b, c) do { } while (0) +#define bio_integrity_split(a, b, c) do { } while (0) +#define bio_integrity_set_tag(a, b, c) do { } while (0) +#define bio_integrity_get_tag(a, b, c) do { } while (0) +#define bio_integrity_init_slab(a) do { } while (0) + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a3da6717135..4a9ed45270ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ + __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) +#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define BLK_MAX_CDB 16 @@ -865,6 +867,109 @@ void kblockd_flush_work(struct work_struct *work); MODULE_ALIAS("block-major-" __stringify(major) "-*") +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ + +struct blk_integrity_exchg { + void *prot_buf; + void *data_buf; + sector_t sector; + unsigned int data_size; + unsigned short sector_size; + const char *disk_name; +}; + +typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); +typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); +typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); +typedef void (integrity_get_tag_fn) (void *, void *, unsigned int); + +struct blk_integrity { + integrity_gen_fn *generate_fn; + integrity_vrfy_fn *verify_fn; + integrity_set_tag_fn *set_tag_fn; + integrity_get_tag_fn *get_tag_fn; + + unsigned short flags; + unsigned short tuple_size; + unsigned short sector_size; + unsigned short tag_size; + + const char *name; + + struct kobject kobj; +}; + +extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_unregister(struct gendisk *); +extern int blk_integrity_compare(struct block_device *, struct block_device *); +extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request *); + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + +static inline unsigned int bdev_get_tag_size(struct block_device *bdev) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi) + return bi->tag_size; + + return 0; +} + +static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + return 1; + + return 0; +} + +static inline int blk_integrity_rq(struct request *rq) +{ + BUG_ON(rq->bio == NULL); + + return bio_integrity(rq->bio); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define blk_integrity_rq(rq) (0) +#define blk_rq_count_integrity_sg(a) (0) +#define blk_rq_map_integrity_sg(a, b) (0) +#define bdev_get_integrity(a) (0) +#define bdev_get_tag_size(a) (0) +#define blk_integrity_compare(a, b) (0) +#define blk_integrity_register(a, b) (0) +#define blk_integrity_unregister(a) do { } while (0); + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + + #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ae7aec3cabee..524ec96f5a23 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -141,6 +141,9 @@ struct gendisk { struct disk_stats dkstats; #endif struct work_struct async_notify; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *integrity; +#endif }; /* -- cgit v1.2.3 From b984679efe1a616ec4ac919dba08286d71593900 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jun 2008 19:05:48 +0200 Subject: block: integrity checkpatch cleanups > 80 char lines and that sort of thing. Signed-off-by: Jens Axboe --- block/blk-integrity.c | 8 +++++--- fs/bio-integrity.c | 29 ++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 65f23ef38bbe..4ffa3814f6a9 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -178,8 +178,9 @@ static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, return entry->show(bi, page); } -static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t count) +static ssize_t integrity_attr_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t count) { struct blk_integrity *bi = container_of(kobj, struct blk_integrity, kobj); @@ -326,7 +327,8 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) BUG_ON(template == NULL); if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + bi = kmem_cache_alloc(integrity_cachep, + GFP_KERNEL | __GFP_ZERO); if (!bi) return -1; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 31b08878913d..63e2ee63058d 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -39,7 +39,10 @@ static struct workqueue_struct *kintegrityd_wq; * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) { struct bio_integrity_payload *bip; struct bio_vec *iv; @@ -81,7 +84,9 @@ EXPORT_SYMBOL(bio_integrity_alloc_bioset); * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); } @@ -174,7 +179,8 @@ EXPORT_SYMBOL(bio_integrity_enabled); * sector size of the storage device. Convert the block layer sectors * to physical sectors. */ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) { /* At this point there are only 512b or 4096b DIF/EPP devices */ if (bi->sector_size == 4096) @@ -212,7 +218,8 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) if (bi->tag_size == 0) return -1; - nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); if (nr_sectors * bi->tuple_size > bip->bip_size) { printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", @@ -456,7 +463,7 @@ static int bio_integrity_verify(struct bio *bio) */ static void bio_integrity_verify_fn(struct work_struct *work) { - struct bio_integrity_payload *bip = + struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; int error = bip->bip_error; @@ -502,7 +509,8 @@ EXPORT_SYMBOL(bio_integrity_endio); * @bip: Integrity vector to advance * @skip: Number of bytes to advance it */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) { struct bio_vec *iv; unsigned int i; @@ -527,7 +535,8 @@ void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int ski * @bip: Integrity vector to truncate * @len: New length of integrity vector */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) { struct bio_vec *iv; unsigned int i; @@ -579,7 +588,8 @@ EXPORT_SYMBOL(bio_integrity_advance); * and the length will be truncated corresponding to 'len' data * sectors. */ -void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -648,7 +658,8 @@ EXPORT_SYMBOL(bio_integrity_split); * * Description: Called to allocate a bip when cloning a bio */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; -- cgit v1.2.3 From 0b07de85a76e1346e675f0e98437378932473df7 Mon Sep 17 00:00:00 2001 From: Adel Gadllah Date: Thu, 26 Jun 2008 13:48:27 +0200 Subject: allow userspace to modify scsi command filter on per device basis This patch exports the per-gendisk command filter to user space through sysfs, so it can be changed by the system administrator. All users of the old cmd filter have been converted to use the new one. Original patch from Peter Jones. Signed-off-by: Adel Gadllah Signed-off-by: Peter Jones Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/bsg.c | 38 ++++-- block/cmd-filter.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 2 + block/scsi_ioctl.c | 121 +----------------- drivers/scsi/sg.c | 40 ++---- include/linux/blkdev.h | 10 +- include/linux/genhd.h | 9 ++ 8 files changed, 389 insertions(+), 159 deletions(-) create mode 100644 block/cmd-filter.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 045f7b62e4bb..208000b0750d 100644 --- a/block/Makefile +++ b/block/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o + blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \ + cmd-filter.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/bsg.c b/block/bsg.c index f0b7cd343216..439940c3a1ff 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -44,11 +44,12 @@ struct bsg_device { char name[BUS_ID_SIZE]; int max_queue; unsigned long flags; + struct blk_scsi_cmd_filter *cmd_filter; + mode_t *f_mode; }; enum { BSG_F_BLOCK = 1, - BSG_F_WRITE_PERM = 2, }; #define BSG_DEFAULT_CMDS 64 @@ -172,7 +173,7 @@ unlock: } static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_v4 *hdr, int has_write_perm) + struct sg_io_v4 *hdr, struct bsg_device *bd) { if (hdr->request_len > BLK_MAX_CDB) { rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL); @@ -185,7 +186,8 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_cmd_filter_verify_command(bd->cmd_filter, rq->cmd, + bd->f_mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -263,8 +265,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr) rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, test_bit(BSG_F_WRITE_PERM, - &bd->flags)); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd); if (ret) goto out; @@ -566,12 +567,23 @@ static inline void bsg_set_block(struct bsg_device *bd, struct file *file) set_bit(BSG_F_BLOCK, &bd->flags); } -static inline void bsg_set_write_perm(struct bsg_device *bd, struct file *file) +static void bsg_set_cmd_filter(struct bsg_device *bd, + struct file *file) { - if (file->f_mode & FMODE_WRITE) - set_bit(BSG_F_WRITE_PERM, &bd->flags); - else - clear_bit(BSG_F_WRITE_PERM, &bd->flags); + struct inode *inode; + struct gendisk *disk; + + if (!file) + return; + + inode = file->f_dentry->d_inode; + if (!inode) + return; + + disk = inode->i_bdev->bd_disk; + + bd->cmd_filter = &disk->cmd_filter; + bd->f_mode = &file->f_mode; } /* @@ -595,6 +607,8 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) dprintk("%s: read %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); + bytes_read = 0; ret = __bsg_read(buf, count, bd, NULL, &bytes_read); *ppos = bytes_read; @@ -668,7 +682,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); - bsg_set_write_perm(bd, file); + bsg_set_cmd_filter(bd, file); bytes_written = 0; ret = __bsg_write(bd, buf, count, &bytes_written); @@ -771,7 +785,9 @@ static struct bsg_device *bsg_add_device(struct inode *inode, } bd->queue = rq; + bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); atomic_set(&bd->ref_count, 1); mutex_lock(&bsg_mutex); diff --git a/block/cmd-filter.c b/block/cmd-filter.c new file mode 100644 index 000000000000..35e327ceaa97 --- /dev/null +++ b/block/cmd-filter.c @@ -0,0 +1,325 @@ +/* + * Copyright 2004 Peter M. Jones + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode) +{ + /* root can do any command. */ + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* if there's no filter set, assume we're filtering everything out */ + if (!filter) + return -EPERM; + + /* Anybody who can open the device can do a read-safe command */ + if (test_bit(cmd[0], filter->read_ok)) + return 0; + + /* Write-safe commands require a writable open */ + if (test_bit(cmd[0], filter->write_ok) && (*f_mode & FMODE_WRITE)) + return 0; + + return -EPERM; +} +EXPORT_SYMBOL(blk_cmd_filter_verify_command); + +int blk_verify_command(struct file *file, unsigned char *cmd) +{ + struct gendisk *disk; + struct inode *inode; + + if (!file) + return -EINVAL; + + inode = file->f_dentry->d_inode; + if (!inode) + return -EINVAL; + + disk = inode->i_bdev->bd_disk; + + return blk_cmd_filter_verify_command(&disk->cmd_filter, + cmd, &file->f_mode); +} +EXPORT_SYMBOL(blk_verify_command); + +/* and now, the sysfs stuff */ +static ssize_t rcf_cmds_show(struct blk_scsi_cmd_filter *filter, char *page, + int rw) +{ + char *npage = page; + unsigned long *okbits; + int i; + + if (rw == READ) + okbits = filter->read_ok; + else + okbits = filter->write_ok; + + for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) { + if (test_bit(i, okbits)) { + sprintf(npage, "%02x", i); + npage += 2; + if (i < BLK_SCSI_MAX_CMDS - 1) + sprintf(npage++, " "); + } + } + + if (npage != page) + npage += sprintf(npage, "\n"); + + return npage - page; +} + +static ssize_t rcf_readcmds_show(struct blk_scsi_cmd_filter *filter, char *page) +{ + return rcf_cmds_show(filter, page, READ); +} + +static ssize_t rcf_writecmds_show(struct blk_scsi_cmd_filter *filter, + char *page) +{ + return rcf_cmds_show(filter, page, WRITE); +} + +static ssize_t rcf_cmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count, int rw) +{ + ssize_t ret = 0; + unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits; + int cmd, status, len; + substring_t ss; + + memset(&okbits, 0, sizeof(okbits)); + + for (len = strlen(page); len > 0; len -= 3) { + if (len < 2) + break; + ss.from = (char *) page + ret; + ss.to = (char *) page + ret + 2; + ret += 3; + status = match_hex(&ss, &cmd); + /* either of these cases means invalid input, so do nothing. */ + if (status || cmd >= BLK_SCSI_MAX_CMDS) + return -EINVAL; + + __set_bit(cmd, okbits); + } + + if (rw == READ) + target_okbits = filter->read_ok; + else + target_okbits = filter->write_ok; + + memmove(target_okbits, okbits, sizeof(okbits)); + return count; +} + +static ssize_t rcf_readcmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, READ); +} + +static ssize_t rcf_writecmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, WRITE); +} + +struct rcf_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_scsi_cmd_filter *, char *); + ssize_t (*store)(struct blk_scsi_cmd_filter *, const char *, size_t); +}; + +static struct rcf_sysfs_entry rcf_readcmds_entry = { + .attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_readcmds_show, + .store = rcf_readcmds_store, +}; + +static struct rcf_sysfs_entry rcf_writecmds_entry = { + .attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_writecmds_show, + .store = rcf_writecmds_store, +}; + +static struct attribute *default_attrs[] = { + &rcf_readcmds_entry.attr, + &rcf_writecmds_entry.attr, + NULL, +}; + +#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr) + +static ssize_t +rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + if (entry->show) + return entry->show(filter, page); + + return 0; +} + +static ssize_t +rcf_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!entry->store) + return -EINVAL; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + return entry->store(filter, page, length); +} + +static struct sysfs_ops rcf_sysfs_ops = { + .show = rcf_attr_show, + .store = rcf_attr_store, +}; + +static struct kobj_type rcf_ktype = { + .sysfs_ops = &rcf_sysfs_ops, + .default_attrs = default_attrs, +}; + +static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) +{ + /* Basic read-only commands */ + __set_bit(TEST_UNIT_READY, filter->read_ok); + __set_bit(REQUEST_SENSE, filter->read_ok); + __set_bit(READ_6, filter->read_ok); + __set_bit(READ_10, filter->read_ok); + __set_bit(READ_12, filter->read_ok); + __set_bit(READ_16, filter->read_ok); + __set_bit(READ_BUFFER, filter->read_ok); + __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_LONG, filter->read_ok); + __set_bit(INQUIRY, filter->read_ok); + __set_bit(MODE_SENSE, filter->read_ok); + __set_bit(MODE_SENSE_10, filter->read_ok); + __set_bit(LOG_SENSE, filter->read_ok); + __set_bit(START_STOP, filter->read_ok); + __set_bit(GPCMD_VERIFY_10, filter->read_ok); + __set_bit(VERIFY_16, filter->read_ok); + __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); + + /* Audio CD commands */ + __set_bit(GPCMD_PLAY_CD, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); + __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); + + /* CD/DVD data reading */ + __set_bit(GPCMD_READ_CD, filter->read_ok); + __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); + __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); + __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); + __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); + __set_bit(GPCMD_READ_HEADER, filter->read_ok); + __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); + __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); + __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); + __set_bit(GPCMD_REPORT_KEY, filter->read_ok); + __set_bit(GPCMD_SCAN, filter->read_ok); + __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); + __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); + __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); + __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); + __set_bit(GPCMD_SEEK, filter->read_ok); + __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); + + /* Basic writing commands */ + __set_bit(WRITE_6, filter->write_ok); + __set_bit(WRITE_10, filter->write_ok); + __set_bit(WRITE_VERIFY, filter->write_ok); + __set_bit(WRITE_12, filter->write_ok); + __set_bit(WRITE_VERIFY_12, filter->write_ok); + __set_bit(WRITE_16, filter->write_ok); + __set_bit(WRITE_LONG, filter->write_ok); + __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(ERASE, filter->write_ok); + __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); + __set_bit(MODE_SELECT, filter->write_ok); + __set_bit(LOG_SELECT, filter->write_ok); + __set_bit(GPCMD_BLANK, filter->write_ok); + __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); + __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); + __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); + __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); + __set_bit(GPCMD_SEND_EVENT, filter->write_ok); + __set_bit(GPCMD_SEND_KEY, filter->write_ok); + __set_bit(GPCMD_SEND_OPC, filter->write_ok); + __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); + __set_bit(GPCMD_SET_SPEED, filter->write_ok); + __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); + __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); + __set_bit(GPCMD_SET_STREAMING, filter->write_ok); +} + +int blk_register_filter(struct gendisk *disk) +{ + int ret; + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + struct kobject *parent = kobject_get(disk->holder_dir->parent); + + if (!parent) + return -ENODEV; + + ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent, + "%s", "cmd_filter"); + + if (ret < 0) + return ret; + + rcf_set_defaults(filter); + return 0; +} + +void blk_unregister_filter(struct gendisk *disk) +{ + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + + kobject_put(&filter->kobj); + kobject_put(disk->holder_dir->parent); +} + diff --git a/block/genhd.c b/block/genhd.c index 43e468ee5993..9074f384b097 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -189,6 +189,7 @@ void add_disk(struct gendisk *disk) disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); + blk_register_filter(disk); bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor)); @@ -200,6 +201,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { + blk_unregister_filter(disk); sysfs_remove_link(&disk->dev.kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 78199c08ec92..c5b9bcfc0a6d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -105,120 +105,12 @@ static int sg_emulated_host(struct request_queue *q, int __user *p) return put_user(1, p); } -#define CMD_READ_SAFE 0x01 -#define CMD_WRITE_SAFE 0x02 -#define CMD_WARNED 0x04 -#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE -#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE - -int blk_verify_command(unsigned char *cmd, int has_write_perm) -{ - static unsigned char cmd_type[256] = { - - /* Basic read-only commands */ - safe_for_read(TEST_UNIT_READY), - safe_for_read(REQUEST_SENSE), - safe_for_read(READ_6), - safe_for_read(READ_10), - safe_for_read(READ_12), - safe_for_read(READ_16), - safe_for_read(READ_BUFFER), - safe_for_read(READ_DEFECT_DATA), - safe_for_read(READ_LONG), - safe_for_read(INQUIRY), - safe_for_read(MODE_SENSE), - safe_for_read(MODE_SENSE_10), - safe_for_read(LOG_SENSE), - safe_for_read(START_STOP), - safe_for_read(GPCMD_VERIFY_10), - safe_for_read(VERIFY_16), - - /* Audio CD commands */ - safe_for_read(GPCMD_PLAY_CD), - safe_for_read(GPCMD_PLAY_AUDIO_10), - safe_for_read(GPCMD_PLAY_AUDIO_MSF), - safe_for_read(GPCMD_PLAY_AUDIO_TI), - safe_for_read(GPCMD_PAUSE_RESUME), - - /* CD/DVD data reading */ - safe_for_read(GPCMD_READ_BUFFER_CAPACITY), - safe_for_read(GPCMD_READ_CD), - safe_for_read(GPCMD_READ_CD_MSF), - safe_for_read(GPCMD_READ_DISC_INFO), - safe_for_read(GPCMD_READ_CDVD_CAPACITY), - safe_for_read(GPCMD_READ_DVD_STRUCTURE), - safe_for_read(GPCMD_READ_HEADER), - safe_for_read(GPCMD_READ_TRACK_RZONE_INFO), - safe_for_read(GPCMD_READ_SUBCHANNEL), - safe_for_read(GPCMD_READ_TOC_PMA_ATIP), - safe_for_read(GPCMD_REPORT_KEY), - safe_for_read(GPCMD_SCAN), - safe_for_read(GPCMD_GET_CONFIGURATION), - safe_for_read(GPCMD_READ_FORMAT_CAPACITIES), - safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION), - safe_for_read(GPCMD_GET_PERFORMANCE), - safe_for_read(GPCMD_SEEK), - safe_for_read(GPCMD_STOP_PLAY_SCAN), - - /* Basic writing commands */ - safe_for_write(WRITE_6), - safe_for_write(WRITE_10), - safe_for_write(WRITE_VERIFY), - safe_for_write(WRITE_12), - safe_for_write(WRITE_VERIFY_12), - safe_for_write(WRITE_16), - safe_for_write(WRITE_LONG), - safe_for_write(WRITE_LONG_2), - safe_for_write(ERASE), - safe_for_write(GPCMD_MODE_SELECT_10), - safe_for_write(MODE_SELECT), - safe_for_write(LOG_SELECT), - safe_for_write(GPCMD_BLANK), - safe_for_write(GPCMD_CLOSE_TRACK), - safe_for_write(GPCMD_FLUSH_CACHE), - safe_for_write(GPCMD_FORMAT_UNIT), - safe_for_write(GPCMD_REPAIR_RZONE_TRACK), - safe_for_write(GPCMD_RESERVE_RZONE_TRACK), - safe_for_write(GPCMD_SEND_DVD_STRUCTURE), - safe_for_write(GPCMD_SEND_EVENT), - safe_for_write(GPCMD_SEND_KEY), - safe_for_write(GPCMD_SEND_OPC), - safe_for_write(GPCMD_SEND_CUE_SHEET), - safe_for_write(GPCMD_SET_SPEED), - safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL), - safe_for_write(GPCMD_LOAD_UNLOAD), - safe_for_write(GPCMD_SET_STREAMING), - }; - unsigned char type = cmd_type[cmd[0]]; - - /* Anybody who can open the device can do a read-safe command */ - if (type & CMD_READ_SAFE) - return 0; - - /* Write-safe commands just require a writable open.. */ - if ((type & CMD_WRITE_SAFE) && has_write_perm) - return 0; - - /* And root can do any command.. */ - if (capable(CAP_SYS_RAWIO)) - return 0; - - if (!type) { - cmd_type[cmd[0]] = CMD_WARNED; - printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]); - } - - /* Otherwise fail it with an "Operation not permitted" */ - return -EPERM; -} -EXPORT_SYMBOL_GPL(blk_verify_command); - static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_hdr *hdr, int has_write_perm) + struct sg_io_hdr *hdr, struct file *file) { if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(file, rq->cmd)) return -EPERM; /* @@ -287,7 +179,7 @@ static int sg_io(struct file *file, struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr) { unsigned long start_time; - int writing = 0, ret = 0, has_write_perm = 0; + int writing = 0, ret = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -316,10 +208,7 @@ static int sg_io(struct file *file, struct request_queue *q, if (!rq) return -ENOMEM; - if (file) - has_write_perm = file->f_mode & FMODE_WRITE; - - if (blk_fill_sghdr_rq(q, rq, hdr, has_write_perm)) { + if (blk_fill_sghdr_rq(q, rq, hdr, file)) { blk_put_request(rq); return -EFAULT; } @@ -451,7 +340,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, file->f_mode & FMODE_WRITE); + err = blk_verify_command(file, rq->cmd); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ea0edd1b2e76..f7abccaffaec 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -182,8 +182,9 @@ static int sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize); static ssize_t sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp); -static ssize_t sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp); +static ssize_t sg_new_write(Sg_fd *sfp, struct file *file, + const char __user *buf, size_t count, int blocking, + int read_only, Sg_request **o_srp); static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking); static int sg_u_iovec(sg_io_hdr_t * hp, int sg_num, int ind, @@ -204,7 +205,6 @@ static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id); static Sg_request *sg_add_request(Sg_fd * sfp); static int sg_remove_request(Sg_fd * sfp, Sg_request * srp); static int sg_res_in_use(Sg_fd * sfp); -static int sg_allow_access(unsigned char opcode, char dev_type); static int sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len); static Sg_device *sg_get_dev(int dev); #ifdef CONFIG_SCSI_PROC_FS @@ -544,7 +544,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) return -EFAULT; blocking = !(filp->f_flags & O_NONBLOCK); if (old_hdr.reply_len < 0) - return sg_new_write(sfp, buf, count, blocking, 0, NULL); + return sg_new_write(sfp, filp, buf, count, blocking, 0, NULL); if (count < (SZ_SG_HEADER + 6)) return -EIO; /* The minimum scsi command length is 6 bytes. */ @@ -621,8 +621,9 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) } static ssize_t -sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp) +sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, + size_t count, int blocking, int read_only, + Sg_request **o_srp) { int k; Sg_request *srp; @@ -678,8 +679,7 @@ sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, sg_remove_request(sfp, srp); return -EFAULT; } - if (read_only && - (!sg_allow_access(cmnd[0], sfp->parentdp->device->type))) { + if (read_only && (!blk_verify_command(file, cmnd))) { sg_remove_request(sfp, srp); return -EPERM; } @@ -799,7 +799,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (!access_ok(VERIFY_WRITE, p, SZ_SG_IO_HDR)) return -EFAULT; result = - sg_new_write(sfp, p, SZ_SG_IO_HDR, + sg_new_write(sfp, filp, p, SZ_SG_IO_HDR, blocking, read_only, &srp); if (result < 0) return result; @@ -1048,7 +1048,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (copy_from_user(&opcode, siocp->data, 1)) return -EFAULT; - if (!sg_allow_access(opcode, sdp->device->type)) + if (!blk_verify_command(filp, &opcode)) return -EPERM; } return sg_scsi_ioctl(filp, sdp->device->request_queue, NULL, p); @@ -2506,26 +2506,6 @@ sg_page_free(struct page *page, int size) #define MAINTENANCE_IN_CMD 0xa3 #endif -static unsigned char allow_ops[] = { TEST_UNIT_READY, REQUEST_SENSE, - INQUIRY, READ_CAPACITY, READ_BUFFER, READ_6, READ_10, READ_12, - READ_16, MODE_SENSE, MODE_SENSE_10, LOG_SENSE, REPORT_LUNS, - SERVICE_ACTION_IN, RECEIVE_DIAGNOSTIC, READ_LONG, MAINTENANCE_IN_CMD -}; - -static int -sg_allow_access(unsigned char opcode, char dev_type) -{ - int k; - - if (TYPE_SCANNER == dev_type) /* TYPE_ROM maybe burner */ - return 1; - for (k = 0; k < sizeof (allow_ops); ++k) { - if (opcode == allow_ops[k]) - return 1; - } - return 0; -} - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3ae9ad97213..a842b776d099 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -671,7 +671,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -extern int blk_verify_command(unsigned char *, int); extern void blk_unplug(struct request_queue *q); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -797,6 +796,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, extern int blkdev_issue_flush(struct block_device *, sector_t *); +/* +* command filter functions +*/ +extern int blk_verify_command(struct file *file, unsigned char *cmd); +extern int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode); +extern int blk_register_filter(struct gendisk *disk); +extern void blk_unregister_filter(struct gendisk *disk); + #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 #define SAFE_MAX_SECTORS 255 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 524ec96f5a23..e8787417f65a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -110,6 +110,14 @@ struct hd_struct { #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 #define GENHD_FL_FAIL 64 +#define BLK_SCSI_MAX_CMDS (256) +#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) + +struct blk_scsi_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; + struct kobject kobj; +}; struct gendisk { int major; /* major number of driver */ @@ -120,6 +128,7 @@ struct gendisk { struct hd_struct **part; /* [indexed by minor] */ struct block_device_operations *fops; struct request_queue *queue; + struct blk_scsi_cmd_filter cmd_filter; void *private_data; sector_t capacity; -- cgit v1.2.3 From c265a7f41706cee20508de5b4a919214cfd7a11b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 13:49:33 +0200 Subject: cfq-iosched: get rid of enable_idle being unused warning Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0ebb626a25d3..1e2aff812ee2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1731,7 +1731,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; - old_idle = cfq_cfqq_idle_window(cfqq); + enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) -- cgit v1.2.3 From 07359fc61bb8ed786f96a1c24cca6f94dd17e329 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 26 Jun 2008 19:39:23 +0200 Subject: block: add bounce support to blk_rq_map_user_iov blk_rq_map_user_iov can't handle the bounce buffer (it means that the bio_map_user_iov path doesn't work with a LLD that needs GFP_DMA). This patch fixes blk_rq_map_user_iov to support the bounce buffer. Signed-off-by: FUJITA Tomonori Cc: Mike Christie Signed-off-by: Jens Axboe --- block/blk-map.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 0b1af5a3537c..813011ef8276 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -210,6 +210,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; + blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); rq->buffer = rq->data = NULL; -- cgit v1.2.3 From 06a452e5b95eb669b7ad414ccf587dfc2d91b217 Mon Sep 17 00:00:00 2001 From: Adel Gadllah Date: Fri, 27 Jun 2008 09:16:17 +0200 Subject: cmdfilter: extend default read filter This patch adds the commands that the former sg filter allowed for read access to the cmdfilter to keep userspace apps that rely on them working. Signed-off-by: Adel Gadllah Signed-off-by: Jens Axboe --- block/cmd-filter.c | 9 +++++++++ drivers/scsi/sg.c | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cmd-filter.c b/block/cmd-filter.c index 35e327ceaa97..eec4404fd357 100644 --- a/block/cmd-filter.c +++ b/block/cmd-filter.c @@ -219,6 +219,10 @@ static struct kobj_type rcf_ktype = { .default_attrs = default_attrs, }; +#ifndef MAINTENANCE_IN_CMD +#define MAINTENANCE_IN_CMD 0xa3 +#endif + static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) { /* Basic read-only commands */ @@ -230,6 +234,7 @@ static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) __set_bit(READ_16, filter->read_ok); __set_bit(READ_BUFFER, filter->read_ok); __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_CAPACITY, filter->read_ok); __set_bit(READ_LONG, filter->read_ok); __set_bit(INQUIRY, filter->read_ok); __set_bit(MODE_SENSE, filter->read_ok); @@ -238,6 +243,10 @@ static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) __set_bit(START_STOP, filter->read_ok); __set_bit(GPCMD_VERIFY_10, filter->read_ok); __set_bit(VERIFY_16, filter->read_ok); + __set_bit(REPORT_LUNS, filter->read_ok); + __set_bit(SERVICE_ACTION_IN, filter->read_ok); + __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); + __set_bit(MAINTENANCE_IN_CMD, filter->read_ok); __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); /* Audio CD commands */ diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 62b5bd5fd761..fe694f0ee19a 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2502,10 +2502,6 @@ sg_page_free(struct page *page, int size) __free_pages(page, order); } -#ifndef MAINTENANCE_IN_CMD -#define MAINTENANCE_IN_CMD 0xa3 -#endif - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) -- cgit v1.2.3 From b24498d477a14680fc3bb3ad884fa9fa76a2d237 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jun 2008 09:12:09 +0200 Subject: block: integrity flags can't use bit ops on unsigned short Just use normal open coded bit operations instead, they need not be atomic. Signed-off-by: Jens Axboe --- block/blk-integrity.c | 17 +++++++---------- include/linux/blkdev.h | 8 ++++---- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 4ffa3814f6a9..3f1a8478cc38 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -217,17 +217,16 @@ static ssize_t integrity_read_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ; else - clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_READ; return count; } static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0); } static ssize_t integrity_write_store(struct blk_integrity *bi, @@ -237,17 +236,16 @@ static ssize_t integrity_write_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_WRITE; else - clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_WRITE; return count; } static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0); } static struct integrity_sysfs_entry integrity_format_entry = { @@ -340,8 +338,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); - set_bit(INTEGRITY_FLAG_READ, &bi->flags); - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; bi->sector_size = disk->queue->hardsect_size; disk->integrity = bi; } else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a842b776d099..7ab8acad5b6e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -870,8 +870,8 @@ void kblockd_flush_work(struct work_struct *work); #if defined(CONFIG_BLK_DEV_INTEGRITY) -#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ -#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ +#define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ struct blk_integrity_exchg { void *prot_buf; @@ -940,11 +940,11 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) return 0; if (rw == READ && bi->verify_fn != NULL && - test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_READ)) return 1; if (rw == WRITE && bi->generate_fn != NULL && - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_WRITE)) return 1; return 0; -- cgit v1.2.3 From e180f5949327e897bc35a816f4f4010186632df9 Mon Sep 17 00:00:00 2001 From: maximilian attems Date: Tue, 1 Jul 2008 09:42:47 +0200 Subject: block: request_module(): use format string Avoid bad things happening if the module has a printk control string in its name. Signed-off-by: maximilian attems Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 1f5bfe696026..ed6f8f32d27e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -150,7 +150,7 @@ static struct elevator_type *elevator_get(const char *name) else sprintf(elv, "%s-iosched", name); - request_module(elv); + request_module("%s", elv); spin_lock(&elv_list_lock); e = elevator_find(name); } -- cgit v1.2.3 From e48ec69005f02b70b7ecfde1bc39a599086d16ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jul 2008 13:18:54 +0200 Subject: block: extend queue_flag bitops Add test_and_clear and test_and_set. Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++-------- include/linux/blkdev.h | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e0fb0bcc0c17..dbc7f42b5d2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -205,8 +205,7 @@ void blk_plug_device(struct request_queue *q) if (blk_queue_stopped(q)) return; - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { - __set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); + if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); } @@ -221,10 +220,9 @@ int blk_remove_plug(struct request_queue *q) { WARN_ON(!irqs_disabled()); - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) return 0; - queue_flag_clear(QUEUE_FLAG_PLUGGED, q); del_timer(&q->unplug_timer); return 1; } @@ -328,8 +326,7 @@ void blk_start_queue(struct request_queue *q) * one level of recursion is ok and is much faster than kicking * the unplug handling */ - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { @@ -394,8 +391,7 @@ void __blk_run_queue(struct request_queue *q) * handling reinvoke the handler shortly if we already got there. */ if (!elv_queue_empty(q)) { - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff9d0bdf2a16..e04c4ac8a7cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -428,6 +428,32 @@ static inline void queue_flag_set_unlocked(unsigned int flag, __set_bit(flag, &q->queue_flags); } +static inline int queue_flag_test_and_clear(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (test_bit(flag, &q->queue_flags)) { + __clear_bit(flag, &q->queue_flags); + return 1; + } + + return 0; +} + +static inline int queue_flag_test_and_set(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (!test_bit(flag, &q->queue_flags)) { + __set_bit(flag, &q->queue_flags); + return 0; + } + + return 1; +} + static inline void queue_flag_set(unsigned int flag, struct request_queue *q) { WARN_ON_ONCE(!queue_is_locked(q)); -- cgit v1.2.3 From 27f8221af406e43b529a5425bc99c9b1e9bdf521 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:30:03 +0200 Subject: block: add blk_queue_update_dma_pad This adds blk_queue_update_dma_pad to prevent LLDs from overwriting the dma pad mask wrongly (we added blk_queue_update_dma_alignment due to the same reason). This also converts libata to use blk_queue_update_dma_pad instead of blk_queue_dma_pad. Signed-off-by: FUJITA Tomonori Cc: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-settings.c | 24 ++++++++++++++++++++---- drivers/ata/libata-scsi.c | 3 ++- include/linux/blkdev.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 8dd86418f35d..dfc77012843f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -302,11 +302,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits); * @q: the request queue for the device * @mask: pad mask * - * Set pad mask. Direct IO requests are padded to the mask specified. + * Set dma pad mask. * - * Appending pad buffer to a request modifies ->data_len such that it - * includes the pad buffer. The original requested data length can be - * obtained using blk_rq_raw_data_len(). + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. **/ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) { @@ -314,6 +313,23 @@ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) } EXPORT_SYMBOL(blk_queue_dma_pad); +/** + * blk_queue_update_dma_pad - update pad mask + * @q: the request queue for the device + * @mask: pad mask + * + * Update dma pad mask. + * + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. + **/ +void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) +{ + if (mask > q->dma_pad_mask) + q->dma_pad_mask = mask; +} +EXPORT_SYMBOL(blk_queue_update_dma_pad); + /** * blk_queue_dma_drain - Set up a drain buffer for excess dma. * @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57a43649a461..499ccc628d81 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -885,7 +885,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, /* set the min alignment and padding */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1); - blk_queue_dma_pad(sdev->request_queue, ATA_DMA_PAD_SZ - 1); + blk_queue_update_dma_pad(sdev->request_queue, + ATA_DMA_PAD_SZ - 1); /* configure draining */ buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e04c4ac8a7cf..1ffd8bfdc4c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -776,6 +776,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); +extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size); -- cgit v1.2.3 From 30c00eda73d5db5bd64dd0c370161abd8df5ba4a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:31:11 +0200 Subject: block: blk_rq_map_kern uses the bounce buffers for stack buffers blk_rq_map_kern is used for kernel internal I/Os. Some callers use this function with stack buffers but DMA to/from the stack buffers leads to memory corruption on a non-coherent platform. This patch make blk_rq_map_kern uses the bounce buffers if a caller passes a stack buffer (on the all platforms for simplicity). Signed-off-by: FUJITA Tomonori Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: Tejun Heo Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 813011ef8276..ddd96fb11a7d 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -269,6 +269,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, int reading = rq_data_dir(rq) == READ; int do_copy = 0; struct bio *bio; + unsigned long stack_mask = ~(THREAD_SIZE - 1); if (len > (q->max_hw_sectors << 9)) return -EINVAL; @@ -279,6 +280,10 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, alignment = queue_dma_alignment(q) | q->dma_pad_mask; do_copy = ((kaddr & alignment) || (len & alignment)); + if (!((kaddr & stack_mask) ^ + ((unsigned long)current->stack & stack_mask))) + do_copy = 1; + if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else -- cgit v1.2.3