diff options
author | Linus Torvalds | 2023-11-24 09:45:40 -0800 |
---|---|---|
committer | Linus Torvalds | 2023-11-24 09:45:40 -0800 |
commit | fa2b906f5148883e2d0be8952767469c2e3de274 (patch) | |
tree | 2d7e9abdb717b49bd5febc3d3544607dc76357a6 | |
parent | afa0f6ee000abd220a8160f0375b5b8d3e4284f2 (diff) | |
parent | 796432efab1e372d404e7a71cc6891a53f105051 (diff) |
Merge tag 'vfs-6.7-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner:
- Avoid calling back into LSMs from vfs_getattr_nosec() calls.
IMA used to query inode properties by accessing raw inode fields without
dedicated helpers. That was finally fixed a few releases ago by
forcing IMA to use vfs_getattr_nosec() helpers.
The goal of the vfs_getattr_nosec() helper is to query for attributes
without calling into the LSM layer, which would be quite problematic
because, incredibly, IMA is called from __fput()...
__fput()
-> ima_file_free()
What it does is to call back into the filesystem to update the file's
IMA xattr. Querying the inode without using vfs_getattr_nosec() meant
that IMA didn't handle stacking filesystems such as overlayfs
correctly. So the switch to vfs_getattr_nosec() is quite correct. But
the switch to vfs_getattr_nosec() revealed another bug when used on
stacking filesystems:
__fput()
-> ima_file_free()
-> vfs_getattr_nosec()
-> i_op->getattr::ovl_getattr()
-> vfs_getattr()
-> i_op->getattr::$WHATEVER_UNDERLYING_FS_getattr()
-> security_inode_getattr() # calls back into LSMs
Now, if that __fput() happens from task_work_run() of an exiting task
current->fs and various other pointers could already be NULL. So
anything in the LSM layer relying on that not being NULL would be
quite surprised.
Fix that by passing the information that this is a security request
through to the stacking filesystem by adding a new internal
AT_GETATTR_NOSEC flag. Now the callchain becomes:
__fput()
-> ima_file_free()
-> vfs_getattr_nosec()
-> i_op->getattr::ovl_getattr()
-> if (AT_GETATTR_NOSEC)
vfs_getattr_nosec()
else
vfs_getattr()
-> i_op->getattr::$WHATEVER_UNDERLYING_FS_getattr()
- Fix a bug introduced with the iov_iter rework from last cycle.
This broke /proc/kcore by copying too much and without the correct
offset.
- Add a missing NULL check when allocating the root inode in
autofs_fill_super().
- Fix stable writes for multi-device filesystems (xfs, btrfs etc) and
the block device pseudo filesystem.
Stable writes used to be a superblock flag only, making it a per
filesystem property. Add an additional AS_STABLE_WRITES mapping flag
to allow for fine-grained control.
- Ensure that offset_iterate_dir() returns 0 after reaching the end of
a directory so it adheres to getdents() convention.
* tag 'vfs-6.7-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
libfs: getdents() should return 0 after reaching EOD
xfs: respect the stable writes flag on the RT device
xfs: clean up FS_XFLAG_REALTIME handling in xfs_ioctl_setattr_xflags
block: update the stable_writes flag in bdev_add
filemap: add a per-mapping stable writes flag
autofs: add: new_inode check in autofs_fill_super()
iov_iter: fix copy_page_to_iter_nofault()
fs: Pass AT_GETATTR_NOSEC flag to getattr interface function
-rw-r--r-- | block/bdev.c | 2 | ||||
-rw-r--r-- | fs/autofs/inode.c | 56 | ||||
-rw-r--r-- | fs/ecryptfs/inode.c | 12 | ||||
-rw-r--r-- | fs/inode.c | 2 | ||||
-rw-r--r-- | fs/libfs.c | 14 | ||||
-rw-r--r-- | fs/overlayfs/inode.c | 10 | ||||
-rw-r--r-- | fs/overlayfs/overlayfs.h | 8 | ||||
-rw-r--r-- | fs/stat.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 8 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl.c | 30 | ||||
-rw-r--r-- | fs/xfs/xfs_iops.c | 7 | ||||
-rw-r--r-- | include/linux/pagemap.h | 17 | ||||
-rw-r--r-- | include/uapi/linux/fcntl.h | 3 | ||||
-rw-r--r-- | lib/iov_iter.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 2 |
15 files changed, 121 insertions, 58 deletions
diff --git a/block/bdev.c b/block/bdev.c index e4cfb7adb645..750aec178b6a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -425,6 +425,8 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) void bdev_add(struct block_device *bdev, dev_t dev) { + if (bdev_stable_writes(bdev)) + mapping_set_stable_writes(bdev->bd_inode->i_mapping); bdev->bd_dev = dev; bdev->bd_inode->i_rdev = dev; bdev->bd_inode->i_ino = dev; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index a5083d447a62..1f5db6863663 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -309,9 +309,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc) struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = s->s_fs_info; struct inode *root_inode; - struct dentry *root; struct autofs_info *ino; - int ret = -ENOMEM; pr_debug("starting up, sbi = %p\n", sbi); @@ -328,56 +326,44 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc) */ ino = autofs_new_ino(sbi); if (!ino) - goto fail; + return -ENOMEM; root_inode = autofs_get_inode(s, S_IFDIR | 0755); + if (!root_inode) + return -ENOMEM; + root_inode->i_uid = ctx->uid; root_inode->i_gid = ctx->gid; + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; - root = d_make_root(root_inode); - if (!root) - goto fail_ino; - - root->d_fsdata = ino; + s->s_root = d_make_root(root_inode); + if (unlikely(!s->s_root)) { + autofs_free_ino(ino); + return -ENOMEM; + } + s->s_root->d_fsdata = ino; if (ctx->pgrp_set) { sbi->oz_pgrp = find_get_pid(ctx->pgrp); - if (!sbi->oz_pgrp) { - ret = invalf(fc, "Could not find process group %d", - ctx->pgrp); - goto fail_dput; - } - } else { + if (!sbi->oz_pgrp) + return invalf(fc, "Could not find process group %d", + ctx->pgrp); + } else sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); - } if (autofs_type_trigger(sbi->type)) - __managed_dentry_set_managed(root); - - root_inode->i_fop = 
&autofs_root_operations; - root_inode->i_op = &autofs_dir_inode_operations; + /* s->s_root won't be contended so there's little to + * be gained by not taking the d_lock when setting + * d_flags, even when a lot mounts are being done. + */ + managed_dentry_set_managed(s->s_root); pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); sbi->flags &= ~AUTOFS_SBI_CATATONIC; - - /* - * Success! Install the root dentry now to indicate completion. - */ - s->s_root = root; return 0; - - /* - * Failure ... clean up. - */ -fail_dput: - dput(root); - goto fail; -fail_ino: - autofs_free_ino(ino); -fail: - return ret; } /* diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a25dd3d20008..b0e8774c435a 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -998,6 +998,14 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, return rc; } +static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + if (flags & AT_GETATTR_NOSEC) + return vfs_getattr_nosec(path, stat, request_mask, flags); + return vfs_getattr(path, stat, request_mask, flags); +} + static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) @@ -1006,8 +1014,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, struct kstat lower_stat; int rc; - rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat, - request_mask, flags); + rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry), + &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/inode.c b/fs/inode.c index edcd8a61975f..f238d987dec9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); + 
if (sb->s_iflags & SB_I_STABLE_WRITES) + mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/libfs.c b/fs/libfs.c index e9440d55073c..c2aa6fd4795c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -399,6 +399,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } + /* In this case, ->private_data is protected by f_pos_lock */ + file->private_data = NULL; return vfs_setpos(file, offset, U32_MAX); } @@ -428,7 +430,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); XA_STATE(xas, &so_ctx->xa, ctx->pos); @@ -437,7 +439,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(&xas); if (!dentry) - break; + return ERR_PTR(-ENOENT); if (!offset_dir_emit(ctx, dentry)) { dput(dentry); @@ -447,6 +449,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) dput(dentry); ctx->pos = xas.xa_index + 1; } + return NULL; } /** @@ -479,7 +482,12 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - offset_iterate_dir(d_inode(dir), ctx); + /* In this case, ->private_data is protected by f_pos_lock */ + if (ctx->pos == 2) + file->private_data = NULL; + else if (file->private_data == ERR_PTR(-ENOENT)) + return 0; + file->private_data = offset_iterate_dir(d_inode(dir), ctx); return 0; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 345b8f161ca4..c63b31a460be 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -171,7 +171,7 @@ int ovl_getattr(struct 
mnt_idmap *idmap, const struct path *path, type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat, request_mask, flags); + err = ovl_do_getattr(&realpath, stat, request_mask, flags); if (err) goto out; @@ -196,8 +196,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = vfs_getattr(&realpath, &lowerstat, - lowermask, flags); + err = ovl_do_getattr(&realpath, &lowerstat, lowermask, + flags); if (err) goto out; @@ -249,8 +249,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = vfs_getattr(&realpath, &lowerdatastat, - lowermask, flags); + err = ovl_do_getattr(&realpath, &lowerdatastat, + lowermask, flags); if (err) goto out; } else { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ca88b2636a57..05c3dd597fa8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -408,6 +408,14 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } +static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + if (flags & AT_GETATTR_NOSEC) + return vfs_getattr_nosec(path, stat, request_mask, flags); + return vfs_getattr(path, stat, request_mask, flags); +} + /* util.c */ int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); diff --git a/fs/stat.c b/fs/stat.c index 24bb0209e459..f721d26ec3f7 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -133,7 +133,8 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, - request_mask, query_flags); + request_mask, + query_flags | AT_GETATTR_NOSEC); generic_fillattr(idmap, request_mask, 
inode, stat); return 0; @@ -166,6 +167,9 @@ int vfs_getattr(const struct path *path, struct kstat *stat, { int retval; + if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) + return -EPERM; + retval = security_inode_getattr(path); if (retval) return retval; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3dc47937da5d..3beb470f1892 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -569,6 +569,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +static inline void xfs_update_stable_writes(struct xfs_inode *ip) +{ + if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) + mapping_set_stable_writes(VFS_I(ip)->i_mapping); + else + mapping_clear_stable_writes(VFS_I(ip)->i_mapping); +} + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a82470e027f7..6c3919687ea6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1121,23 +1121,25 @@ xfs_ioctl_setattr_xflags( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; + bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); uint64_t i_flags2; - /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) - return -EINVAL; + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { + /* Can't change realtime flag if any extents are allocated. 
*/ + if (ip->i_df.if_nextents || ip->i_delayed_blks) + return -EINVAL; + } - /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & FS_XFLAG_REALTIME) { + if (rtflag) { + /* If realtime flag is set then must have realtime device */ if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - } - /* Clear reflink if we are actually able to set the rt flag. */ - if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + /* Clear reflink if we are actually able to set the rt flag. */ + if (xfs_is_reflink_inode(ip)) + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + } /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); @@ -1148,6 +1150,14 @@ xfs_ioctl_setattr_xflags( ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); + + /* + * Make the stable writes flag match that of the device the inode + * resides on when flipping the RT flag. + */ + if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) + xfs_update_stable_writes(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fdfda4fba12b..a0d77f5f512e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1299,6 +1299,13 @@ xfs_setup_inode( mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); /* + * For real-time inodes update the stable write flags to that of the RT + * device instead of the data device. + */ + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) + xfs_update_stable_writes(ip); + + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. 
*/ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcc1ea44b4e8..06142ff7f9ce 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -204,6 +204,8 @@ enum mapping_flags { AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ + AS_STABLE_WRITES, /* must wait for writeback before modifying + folio contents */ }; /** @@ -289,6 +291,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping) clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } +static inline bool mapping_stable_writes(const struct address_space *mapping) +{ + return test_bit(AS_STABLE_WRITES, &mapping->flags); +} + +static inline void mapping_set_stable_writes(struct address_space *mapping) +{ + set_bit(AS_STABLE_WRITES, &mapping->flags); +} + +static inline void mapping_clear_stable_writes(struct address_space *mapping) +{ + clear_bit(AS_STABLE_WRITES, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6c80f96049bd..282e90aeb163 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -116,5 +116,8 @@ #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to compare object identity and may not be usable to open_by_handle_at(2) */ +#if defined(__KERNEL__) +#define AT_GETATTR_NOSEC 0x80000000 +#endif #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index de7d11cf4c63..8ff6824a1005 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -409,7 +409,7 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); - n = iterate_and_advance(i, bytes, kaddr, + n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); diff --git 
a/mm/page-writeback.c b/mm/page-writeback.c index 46f2f5d3d183..ee2fd6a6af40 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3107,7 +3107,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); */ void folio_wait_stable(struct folio *folio) { - if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) + if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); |