diff options
author | Linus Torvalds | 2018-08-14 10:23:25 -0700 |
---|---|---|
committer | Linus Torvalds | 2018-08-14 10:23:25 -0700 |
commit | 73ba2fb33c492916853dfe63e3b3163da0be661d (patch) | |
tree | c2fda8ca1273744d2e884d24189a15ac1a7d63c2 /include | |
parent | 958f338e96f874a0d29442396d6adf9c1e17aa2d (diff) | |
parent | b86d865cb1cae1e61527ea0b8977078bbf694328 (diff) |
Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"First pull request for this merge window, there will also be a
followup request with some stragglers.
This pull request contains:
- Fix for a thundering heard issue in the wbt block code (Anchal
Agarwal)
- A few NVMe pull requests:
* Improved tracepoints (Keith)
* Larger inline data support for RDMA (Steve Wise)
* RDMA setup/teardown fixes (Sagi)
* Effects log suppor for NVMe target (Chaitanya Kulkarni)
* Buffered IO suppor for NVMe target (Chaitanya Kulkarni)
* TP4004 (ANA) support (Christoph)
* Various NVMe fixes
- Block io-latency controller support. Much needed support for
properly containing block devices. (Josef)
- Series improving how we handle sense information on the stack
(Kees)
- Lightnvm fixes and updates/improvements (Mathias/Javier et al)
- Zoned device support for null_blk (Matias)
- AIX partition fixes (Mauricio Faria de Oliveira)
- DIF checksum code made generic (Max Gurtovoy)
- Add support for discard in iostats (Michael Callahan / Tejun)
- Set of updates for BFQ (Paolo)
- Removal of async write support for bsg (Christoph)
- Bio page dirtying and clone fixups (Christoph)
- Set of bcache fix/changes (via Coly)
- Series improving blk-mq queue setup/teardown speed (Ming)
- Series improving merging performance on blk-mq (Ming)
- Lots of other fixes and cleanups from a slew of folks"
* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
blkcg: Make blkg_root_lookup() work for queues in bypass mode
bcache: fix error setting writeback_rate through sysfs interface
null_blk: add lock drop/acquire annotation
Blk-throttle: reduce tail io latency when iops limit is enforced
block: paride: pd: mark expected switch fall-throughs
block: Ensure that a request queue is dissociated from the cgroup controller
block: Introduce blk_exit_queue()
blkcg: Introduce blkg_root_lookup()
block: Remove two superfluous #include directives
blk-mq: count the hctx as active before allocating tag
block: bvec_nr_vecs() returns value for wrong slab
bcache: trivial - remove tailing backslash in macro BTREE_FLAG
bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section
bcache: set max writeback rate when I/O request is idle
bcache: add code comments for bset.c
bcache: fix mistaken comments in request.c
bcache: fix mistaken code comments in bcache.h
bcache: add a comment in super.c
bcache: avoid unncessary cache prefetch bch_btree_node_get()
bcache: display rate debug parameters to 0 when writeback is not running
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/bio.h | 19 | ||||
-rw-r--r-- | include/linux/blk-cgroup.h | 146 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 4 | ||||
-rw-r--r-- | include/linux/blk_types.h | 27 | ||||
-rw-r--r-- | include/linux/blkdev.h | 66 | ||||
-rw-r--r-- | include/linux/cdrom.h | 3 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 3 | ||||
-rw-r--r-- | include/linux/genhd.h | 14 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 13 | ||||
-rw-r--r-- | include/linux/nvme.h | 72 | ||||
-rw-r--r-- | include/linux/sched.h | 8 | ||||
-rw-r--r-- | include/linux/swap.h | 11 | ||||
-rw-r--r-- | include/linux/t10-pi.h | 24 | ||||
-rw-r--r-- | include/linux/tracehook.h | 2 | ||||
-rw-r--r-- | include/scsi/scsi_cmnd.h | 13 | ||||
-rw-r--r-- | include/scsi/scsi_device.h | 14 | ||||
-rw-r--r-- | include/uapi/linux/bcache.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/blkzoned.h | 2 |
18 files changed, 380 insertions, 65 deletions
diff --git a/include/linux/bio.h b/include/linux/bio.h index f08f5fe7bd08..51371740d2a8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -429,7 +429,6 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); -extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set fs_bio_set; @@ -443,12 +442,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); } -static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, NULL); - -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); @@ -496,9 +489,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int op, struct hd_struct *part, unsigned long start_time); @@ -553,8 +546,16 @@ do { \ #define bio_dev(bio) \ disk_devt((bio)->bi_disk) +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); +#else +static inline int bio_associate_blkcg_from_page(struct bio *bio, + struct page *page) { return 0; } +#endif + #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6c666fd7de3c..34aec30e06c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -35,6 +35,7 @@ enum blkg_rwstat_type { BLKG_RWSTAT_WRITE, BLKG_RWSTAT_SYNC, BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, BLKG_RWSTAT_NR, BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, @@ -136,6 +137,12 @@ struct blkcg_gq { struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); @@ -148,6 +155,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, + size_t size); struct blkcg_policy { int plid; @@ -167,6 +176,7 @@ struct blkcg_policy { blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; @@ -238,6 +248,42 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return css_to_blkcg(task_css(current, io_cgrp_id)); } +static inline bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (!css) + css = task_css(current, io_cgrp_id); + while (css) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + css = css->parent; + } + rcu_read_unlock(); + return ret; +} + +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg() to look up the actual context for the + * bio and attach the appropriate blkg to the bio. Then we call this helper and + * if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; +} + /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -296,6 +342,17 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, } /** + * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair + * @q: request_queue of interest + * + * Lookup blkg for @q at the root level. See also blkg_lookup(). + */ +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ + return q->root_blkg; +} + +/** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest @@ -355,6 +412,21 @@ static inline void blkg_get(struct blkcg_gq *blkg) atomic_inc(&blkg->refcnt); } +/** + * blkg_try_get - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) +{ + if (atomic_inc_not_zero(&blkg->refcnt)) + return blkg; + return NULL; +} + + void __blkg_release_rcu(struct rcu_head *rcu); /** @@ -589,7 +661,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, { struct percpu_counter *cnt; - if (op_is_write(op)) + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; @@ -706,8 +780,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); + /* + * If the bio is flagged with BIO_QUEUE_ENTERED it means this + * is a split bio and we would have already accounted for the + * size of the bio. + */ + if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } @@ -715,6 +795,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, return !throtl; } +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. + */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); + if (cur == old) + break; + old = cur; + } + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + if (!old) + return; + /* We only want 1 person clearing the congestion count for this blkg. */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); + if (cur == old) { + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + break; + } + old = cur; + } +} + +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_maybe_throttle_current(void); #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -734,9 +867,16 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) +static inline void blkcg_maybe_throttle_current(void) { } +static inline bool blk_cgroup_congested(void) { return false; } + #ifdef CONFIG_BLOCK +static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ca3f2c2edd85..1da59c16f637 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,10 +35,12 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; struct blk_mq_ctx *dispatch_from; + unsigned int dispatch_busy; - struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + struct blk_mq_ctx **ctxs; + spinlock_t dispatch_wait_lock; wait_queue_entry_t dispatch_wait; atomic_t wait_index; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3c4f390aea4b..f6dfb30737d8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -179,11 +179,9 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - void *bi_cg_private; + struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif -#endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ @@ -329,7 +327,7 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, - + __REQ_SWAP, /* swapping request. */ __REQ_NR_BITS, /* stops here */ }; @@ -351,6 +349,7 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_DRV (1ULL << __REQ_DRV) +#define REQ_SWAP (1ULL << __REQ_SWAP) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) @@ -358,6 +357,14 @@ enum req_flag_bits { #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) +enum stat_group { + STAT_READ, + STAT_WRITE, + STAT_DISCARD, + + NR_STAT_GROUPS +}; + #define bio_op(bio) \ ((bio)->bi_opf & REQ_OP_MASK) #define req_op(req) \ @@ -395,6 +402,18 @@ static inline bool op_is_sync(unsigned int op) (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } +static inline bool op_is_discard(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_DISCARD; +} + +static inline int op_stat_group(unsigned int op) +{ + if (op_is_discard(op)) + return STAT_DISCARD; + return op_is_write(op); +} + typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 79226ca8f80f..d6869e0e2b64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,8 +27,6 @@ #include <linux/percpu-refcount.h> #include <linux/scatterlist.h> #include <linux/blkzoned.h> -#include <linux/seqlock.h> -#include <linux/u64_stats_sync.h> struct module; struct scsi_ioctl_command; @@ -42,7 +40,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; -struct rq_wb; +struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; @@ -442,10 +440,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ - atomic_t shared_hctx_restart; - struct blk_queue_stats *stats; - struct rq_wb *rq_wb; + struct rq_qos *rq_qos; /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg @@ -592,6 +588,7 @@ struct request_queue { struct queue_limits limits; +#ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. * nr_zones is the total number of zones of the device. This is always @@ -612,6 +609,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *seq_zones_bitmap; unsigned long *seq_zones_wlock; +#endif /* CONFIG_BLK_DEV_ZONED */ /* * sg stuff @@ -800,11 +798,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -static inline unsigned int blk_queue_nr_zones(struct request_queue *q) -{ - return q->nr_zones; -} - +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { @@ -820,6 +814,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return false; return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); } +#endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) { @@ -1070,6 +1065,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); @@ -1079,6 +1075,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); } +#endif /* CONFIG_BLK_DEV_ZONED */ /* * Some commands like WRITE SAME have a payload or data transfer size which @@ -1437,8 +1434,6 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) - static inline unsigned long queue_segment_boundary(struct request_queue *q) { return q->limits.seg_boundary_mask; @@ -1639,15 +1634,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) return 0; } -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return blk_queue_nr_zones(q); - return 0; -} - static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; @@ -1877,6 +1863,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } +/** + * bio_integrity_intervals - Return number of integrity intervals for a bio + * @bi: blk_integrity profile for device + * @sectors: Size of the bio in 512-byte sectors + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the data integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. + */ +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return sectors >> (bi->interval_exp - 9); +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ struct bio; @@ -1950,12 +1958,24 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return false; } +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, bool); + int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index e75dfd1f1dec..528271c60018 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -13,6 +13,7 @@ #include <linux/fs.h> /* not really needed, later.. */ #include <linux/list.h> +#include <scsi/scsi_common.h> #include <uapi/linux/cdrom.h> struct packet_command @@ -21,7 +22,7 @@ struct packet_command unsigned char *buffer; unsigned int buflen; int stat; - struct request_sense *sense; + struct scsi_sense_hdr *sshdr; unsigned char data_direction; int quiet; int timeout; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c0e68f903011..ff20b677fb9f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -438,6 +438,9 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; + /* If there is block congestion on this cgroup. */ + atomic_t congestion_count; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6cb8a5789668..57864422a2c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/uuid.h> +#include <linux/blk_types.h> #ifdef CONFIG_BLOCK @@ -82,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { - unsigned long sectors[2]; /* READs and WRITEs */ - unsigned long ios[2]; - unsigned long merges[2]; - unsigned long ticks[2]; + unsigned long sectors[NR_STAT_GROUPS]; + unsigned long ios[NR_STAT_GROUPS]; + unsigned long merges[NR_STAT_GROUPS]; + unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -353,6 +354,11 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_accum(part, field) \ + (part_stat_read(part, field[STAT_READ]) + \ + part_stat_read(part, field[STAT_WRITE]) + \ + part_stat_read(part, field[STAT_DISCARD])) + #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ if ((part)->partno) \ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c6fb116e925..680d3395fc83 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, bool compound); +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare, bool compound); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, @@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, return 0; } +static inline int mem_cgroup_try_charge_delay(struct page *page, + struct mm_struct *mm, + gfp_t gfp_mask, + struct mem_cgroup **memcgp, + bool compound) +{ + *memcgp = NULL; + return 0; +} + static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare, bool compound) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2950ce957656..68e91ef5494c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -242,7 +242,12 @@ struct nvme_id_ctrl { __le32 sanicap; __le32 hmminds; __le16 hmmaxd; - __u8 rsvd338[174]; + __u8 rsvd338[4]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -254,11 +259,12 @@ struct nvme_id_ctrl { __le16 awun; __le16 awupf; __u8 nvscc; - __u8 rsvd531; + __u8 nwpc; __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[228]; + __le32 mnan; + __u8 rsvd544[224]; char subnqn[256]; __u8 rsvd1024[768]; __le32 ioccsz; @@ -312,7 +318,11 @@ struct nvme_id_ns { __le16 nabspf; __le16 noiob; __u8 nvmcap[16]; - __u8 rsvd64[40]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[3]; + __u8 nsattr; + __u8 rsvd100[4]; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; @@ -425,6 +435,32 @@ struct nvme_effects_log { __u8 resv[2048]; }; +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[15]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -444,11 +480,13 @@ enum { enum { NVME_AER_NOTICE_NS_CHANGED = 0x00, NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, + NVME_AER_NOTICE_ANA = 0x03, }; enum { NVME_AEN_CFG_NS_ATTR = 1 << 8, NVME_AEN_CFG_FW_ACT = 1 << 9, + NVME_AEN_CFG_ANA_CHANGE = 1 << 11, }; struct nvme_lba_range_type { @@ -749,15 +787,22 @@ enum { NVME_FEAT_HOST_MEM_BUF = 0x0d, NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0x10, + NVME_FEAT_NOPSC = 0x11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, NVME_FEAT_RESV_PERSIST = 0x83, + NVME_FEAT_WRITE_PROTECT = 0x84, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_ANA = 0x0c, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -765,6 +810,14 @@ enum { NVME_FWACT_ACTV = (2 << 3), }; +/* NVMe Namespace Write Protect State */ +enum { + NVME_NS_NO_WRITE_PROTECT = 0, + NVME_NS_WRITE_PROTECT, + NVME_NS_WRITE_PROTECT_POWER_CYCLE, + NVME_NS_WRITE_PROTECT_PERMANENT, +}; + #define NVME_MAX_CHANGED_NAMESPACES 1024 struct nvme_identify { @@ -880,7 +933,7 @@ struct nvme_get_log_page_command { __u64 rsvd2[2]; union nvme_data_ptr dptr; __u8 lid; - __u8 rsvd10; + __u8 lsp; /* upper 4 bits reserved */ __le16 numdl; __le16 numdu; __u16 rsvd11; @@ -1111,6 +1164,8 @@ enum { NVME_SC_SGL_INVALID_OFFSET = 0x16, NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_NS_WRITE_PROTECTED = 0x20, + NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, @@ -1180,6 +1235,13 @@ enum { NVME_SC_ACCESS_DENIED = 0x286, NVME_SC_UNWRITTEN_BLOCK = 0x287, + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_DNR = 0x4000, }; diff --git a/include/linux/sched.h b/include/linux/sched.h index dac5086e3815..95a5018c338e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,6 +734,10 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; #endif +#ifdef CONFIG_BLK_CGROUP + /* to be used once the psi infrastructure lands upstream. */ + unsigned use_memdelay:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -1150,6 +1154,10 @@ struct task_struct { unsigned int memcg_nr_pages_over_high; #endif +#ifdef CONFIG_BLK_CGROUP + struct request_queue *throttle_queue; +#endif + #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index c063443d8638..1a8bd05a335e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } - #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif +#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask); +#else +static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, + int node, gfp_t gfp_mask) +{ +} +#endif + #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index c6aa8a3c42ed..b9626aa7e90c 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -37,9 +37,33 @@ struct t10_pi_tuple { #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) +static inline u32 t10_pi_ref_tag(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_INTEGRITY + return blk_rq_pos(rq) >> + (rq->q->integrity.interval_exp - 9) & 0xffffffff; +#else + return -1U; +#endif +} + extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; +#ifdef CONFIG_BLK_DEV_INTEGRITY +extern void t10_pi_prepare(struct request *rq, u8 protection_type); +extern void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals); +#else +static inline void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals) +{ +} +static inline void t10_pi_prepare(struct request *rq, u8 protection_type) +{ +} +#endif + #endif diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 4a8841963c2e..05589a3e37f4 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -51,6 +51,7 @@ #include <linux/security.h> #include <linux/task_work.h> #include <linux/memcontrol.h> +#include <linux/blk-cgroup.h> struct linux_binprm; /* @@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) task_work_run(); mem_cgroup_handle_over_high(); + blkcg_maybe_throttle_current(); } #endif /* <linux/tracehook.h> */ diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index aaf1e971c6a3..c891ada3c5c2 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -4,6 +4,7 @@ #include <linux/dma-mapping.h> #include <linux/blkdev.h> +#include <linux/t10-pi.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> @@ -14,8 +15,6 @@ struct Scsi_Host; struct scsi_driver; -#include <scsi/scsi_device.h> - /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. @@ -120,11 +119,11 @@ struct scsi_cmnd { struct request *request; /* The command we are working on */ -#define SCSI_SENSE_BUFFERSIZE 96 unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original - * command (auto-sense) */ + * command (auto-sense). Length must be + * SCSI_SENSE_BUFFERSIZE bytes. */ /* Low-level done function - can be used by low-level driver to point * to completion function. Not used by mid/upper level code. */ @@ -313,12 +312,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) return scmd->device->sector_size; } -static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) -{ - return blk_rq_pos(scmd->request) >> - (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff; -} - static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 4c36af6edd79..202f4d6a4342 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -17,6 +17,8 @@ struct scsi_sense_hdr; typedef __u64 __bitwise blist_flags_t; +#define SCSI_SENSE_BUFFERSIZE 96 + struct scsi_mode_data { __u32 length; __u16 block_descriptor_length; @@ -426,11 +428,21 @@ extern const char *scsi_device_state_name(enum scsi_device_state); extern int scsi_is_sdev_device(const struct device *); extern int scsi_is_target_device(const struct device *); extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); -extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, +extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, unsigned char *sense, struct scsi_sense_hdr *sshdr, int timeout, int retries, u64 flags, req_flags_t rq_flags, int *resid); +/* Make sure any sense buffer is the correct size. */ +#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, \ + sshdr, timeout, retries, flags, rq_flags, resid) \ +({ \ + BUILD_BUG_ON((sense) != NULL && \ + sizeof(sense) != SCSI_SENSE_BUFFERSIZE); \ + __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, \ + sense, sshdr, timeout, retries, flags, rq_flags, \ + resid); \ +}) static inline int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 821f71a2e48f..8d19e02d752a 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -195,7 +195,7 @@ struct cache_sb { }; }; - __u32 last_mount; /* time_t */ + __u32 last_mount; /* time overflow in y2106 */ __u16 first_bucket; union { @@ -318,7 +318,7 @@ struct uuid_entry { struct { __u8 uuid[16]; __u8 label[32]; - __u32 first_reg; + __u32 first_reg; /* time overflow in y2106 */ __u32 last_reg; __u32 invalidated; diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e3c70fe6bf0f..ff5a5db8906a 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -117,7 +117,7 @@ struct blk_zone_report { __u32 nr_zones; __u8 reserved[4]; struct blk_zone zones[0]; -} __packed; +}; /** * struct blk_zone_range - BLKRESETZONE ioctl request |