aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/bdev.c23
-rw-r--r--fs/crypto/inline_crypt.c49
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/file.c37
-rw-r--r--fs/ext4/inode.c37
-rw-r--r--fs/f2fs/f2fs.h40
-rw-r--r--fs/f2fs/file.c43
-rw-r--r--fs/stat.c14
-rw-r--r--fs/xfs/xfs_iops.c10
-rw-r--r--include/linux/blkdev.h4
-rw-r--r--include/linux/fscrypt.h7
-rw-r--r--include/linux/stat.h2
-rw-r--r--include/uapi/linux/stat.h4
13 files changed, 188 insertions, 83 deletions
diff --git a/block/bdev.c b/block/bdev.c
index ce05175e71ce..d699ecdb3260 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -26,6 +26,7 @@
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
+#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"
@@ -1069,3 +1070,25 @@ void sync_bdevs(bool wait)
spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
+
+/*
+ * Handle STATX_DIOALIGN for block devices.
+ *
+ * Note that the inode passed to this is the inode of a block device node file,
+ * not the block device's internal inode. Therefore it is *not* valid to use
+ * I_BDEV() here; the block device has to be looked up by i_rdev instead.
+ */
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+ struct block_device *bdev;
+
+ bdev = blkdev_get_no_open(inode->i_rdev);
+ if (!bdev)
+ return;
+
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ stat->result_mask |= STATX_DIOALIGN;
+
+ blkdev_put_no_open(bdev);
+}
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index c40bd55bc781..cea8b14007e6 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -396,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
/**
- * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
- * supported as far as encryption is concerned
- * @iocb: the file and position the I/O is targeting
- * @iter: the I/O data segment(s)
+ * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
+ * inode, as far as encryption is concerned
+ * @inode: the inode in question
*
* Return: %true if there are no encryption constraints that prevent DIO from
* being supported; %false if DIO is unsupported. (Note that in the
* %true case, the filesystem might have other, non-encryption-related
- * constraints that prevent DIO from actually being supported.)
+ * constraints that prevent DIO from actually being supported. Also, on
+ * encrypted files the filesystem is still responsible for only allowing
+ * DIO when requests are filesystem-block-aligned.)
*/
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+bool fscrypt_dio_supported(struct inode *inode)
{
- const struct inode *inode = file_inode(iocb->ki_filp);
- const unsigned int blocksize = i_blocksize(inode);
+ int err;
/* If the file is unencrypted, no veto from us. */
if (!fscrypt_needs_contents_encryption(inode))
return true;
- /* We only support DIO with inline crypto, not fs-layer crypto. */
- if (!fscrypt_inode_uses_inline_crypto(inode))
- return false;
-
/*
- * Since the granularity of encryption is filesystem blocks, the file
- * position and total I/O length must be aligned to the filesystem block
- * size -- not just to the block device's logical block size as is
- * traditionally the case for DIO on many filesystems.
+ * We only support DIO with inline crypto, not fs-layer crypto.
*
- * We require that the user-provided memory buffers be filesystem block
- * aligned too. It is simpler to have a single alignment value required
- * for all properties of the I/O, as is normally the case for DIO.
- * Also, allowing less aligned buffers would imply that data units could
- * cross bvecs, which would greatly complicate the I/O stack, which
- * assumes that bios can be split at any bvec boundary.
+ * To determine whether the inode is using inline crypto, we have to set
+ * up the key if it wasn't already done. This is because in the current
+ * design of fscrypt, the decision of whether to use inline crypto or
+ * not isn't made until the inode's encryption key is being set up. In
+ * the DIO read/write case, the key will always be set up already, since
+ * the file will be open. But in the case of statx(), the key might not
+ * be set up yet, as the file might not have been opened yet.
*/
- if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ err = fscrypt_require_key(inode);
+ if (err) {
+ /*
+ * Key unavailable or couldn't be set up. This edge case isn't
+ * worth worrying about; just report that DIO is unsupported.
+ */
return false;
-
- return true;
+ }
+ return fscrypt_inode_uses_inline_crypto(inode);
}
EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3bf9a6926798..e5f2f5ca5120 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2977,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct user_namespace *, struct dentry *,
struct iattr *);
+extern u32 ext4_dio_alignment(struct inode *inode);
extern int ext4_getattr(struct user_namespace *, const struct path *,
struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 109d07629f81..8bb1c35fd6dd 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,19 +36,34 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL. For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ u32 dio_align = ext4_dio_alignment(inode);
- if (!fscrypt_dio_supported(iocb, iter))
- return false;
- if (fsverity_active(inode))
+ if (dio_align == 0)
return false;
- if (ext4_should_journal_data(inode))
- return false;
- if (ext4_has_inline_data(inode))
- return false;
- return true;
+
+ if (dio_align == 1)
+ return true;
+
+ return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(iocb, to)) {
+ if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(iocb, from)) {
+ if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 601214453c3a..364774230d87 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5550,6 +5550,22 @@ err_out:
return error;
}
+u32 ext4_dio_alignment(struct inode *inode)
+{
+ if (fsverity_active(inode))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ if (ext4_has_inline_data(inode))
+ return 0;
+ if (IS_ENCRYPTED(inode)) {
+ if (!fscrypt_dio_supported(inode))
+ return 0;
+ return i_blocksize(inode);
+ }
+ return 1; /* use the iomap defaults */
+}
+
int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -5565,6 +5581,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (dio_align == 1) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ /* iomap defaults */
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ } else {
+ stat->dio_mem_align = dio_align;
+ stat->dio_offset_align = dio_align;
+ }
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3c7cdb70fe2e..aea816a133a8 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4471,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline int block_unaligned_IO(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
- unsigned int blocksize_mask = (1 << i_blkbits) - 1;
- loff_t offset = iocb->ki_pos;
- unsigned long align = offset | iov_iter_alignment(iter);
-
- return align & blocksize_mask;
-}
-
static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
int flag)
{
@@ -4492,35 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
return sbi->aligned_blksize;
}
-static inline bool f2fs_force_buffered_io(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int rw = iov_iter_rw(iter);
-
- if (!fscrypt_dio_supported(iocb, iter))
- return true;
- if (fsverity_active(inode))
- return true;
- if (f2fs_compressed_file(inode))
- return true;
-
- /* disallow direct IO if any of devices has unaligned blksize */
- if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
- return true;
-
- if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
- if (block_unaligned_IO(inode, iocb, iter))
- return true;
- if (F2FS_IO_ALIGNED(sbi))
- return true;
- }
- if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
- return true;
-
- return false;
-}
-
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
{
return fsverity_active(inode) &&
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ce4905a073b3..791770507328 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -808,6 +808,29 @@ int f2fs_truncate(struct inode *inode)
return 0;
}
+static bool f2fs_force_buffered_io(struct inode *inode, int rw)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!fscrypt_dio_supported(inode))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
+ return true;
+
+ /* disallow direct IO if any of devices has unaligned blksize */
+ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
+ return true;
+
+ if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
+ return true;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ return true;
+
+ return false;
+}
+
int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -824,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ *
+ * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN
+ * cannot represent that, so in that case we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ unsigned int bsize = i_blocksize(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (!f2fs_force_buffered_io(inode, WRITE)) {
+ stat->dio_mem_align = bsize;
+ stat->dio_offset_align = bsize;
+ }
+ }
+
flags = fi->i_flags;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
@@ -4182,7 +4223,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
if (!(iocb->ki_flags & IOCB_DIRECT))
return false;
- if (f2fs_force_buffered_io(inode, iocb, iter))
+ if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
return false;
/*
diff --git a/fs/stat.c b/fs/stat.c
index 9ced8860e0f3..ef50573c72a2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -5,6 +5,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
@@ -230,11 +231,22 @@ retry:
goto out;
error = vfs_getattr(&path, stat, request_mask, flags);
+
stat->mnt_id = real_mount(path.mnt)->mnt_id;
stat->result_mask |= STATX_MNT_ID;
+
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+ /* Handle STATX_DIOALIGN for block devices. */
+ if (request_mask & STATX_DIOALIGN) {
+ struct inode *inode = d_backing_inode(path.dentry);
+
+ if (S_ISBLK(inode->i_mode))
+ bdev_statx_dioalign(inode, stat);
+ }
+
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -611,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_major = MAJOR(stat->dev);
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
+ tmp.stx_dio_mem_align = stat->dio_mem_align;
+ tmp.stx_dio_offset_align = stat->dio_offset_align;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 45518b8c613c..f51c60d7e205 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -604,6 +604,16 @@ xfs_vn_getattr(
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
+ case S_IFREG:
+ if (request_mask & STATX_DIOALIGN) {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 84b13fdd34a7..8038c5fbde40 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev);
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
int sync_blockdev_nowait(struct block_device *bdev);
void sync_bdevs(bool wait);
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
void printk_all_partitions(void);
#else
static inline void invalidate_bdev(struct block_device *bdev)
@@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev)
static inline void sync_bdevs(bool wait)
{
}
+static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+}
static inline void printk_all_partitions(void)
{
}
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 1f12ebb4a69d..cad78b569c7e 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -764,7 +764,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
bool fscrypt_mergeable_bio_bh(struct bio *bio,
const struct buffer_head *next_bh);
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter);
+bool fscrypt_dio_supported(struct inode *inode);
u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);
@@ -797,11 +797,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
return true;
}
-static inline bool fscrypt_dio_supported(struct kiocb *iocb,
- struct iov_iter *iter)
+static inline bool fscrypt_dio_supported(struct inode *inode)
{
- const struct inode *inode = file_inode(iocb->ki_filp);
-
return !fscrypt_needs_contents_encryption(inode);
}
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 7df06931f25d..ff277ced50e9 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -50,6 +50,8 @@ struct kstat {
struct timespec64 btime; /* File creation time */
u64 blocks;
u64 mnt_id;
+ u32 dio_mem_align;
+ u32 dio_offset_align;
};
#endif
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 1500a0f58041..7cab2c65d3d7 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -124,7 +124,8 @@ struct statx {
__u32 stx_dev_minor;
/* 0x90 */
__u64 stx_mnt_id;
- __u64 __spare2;
+ __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */
+ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */
/* 0xa0 */
__u64 __spare3[12]; /* Spare space for future expansion */
/* 0x100 */
@@ -152,6 +153,7 @@ struct statx {
#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
#define STATX_BTIME 0x00000800U /* Want/got stx_btime */
#define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */
+#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */