aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/disk-io.c43
-rw-r--r--fs/btrfs/inode.c7
-rw-r--r--fs/btrfs/space-info.c2
-rw-r--r--fs/btrfs/volumes.c3
-rw-r--r--fs/btrfs/zoned.c139
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/connect.c14
-rw-r--r--fs/cifs/file.c3
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/debugfs/inode.c22
-rw-r--r--fs/exec.c7
-rw-r--r--fs/exfat/fatent.c3
-rw-r--r--fs/nfs/internal.h25
-rw-r--r--fs/nfs/nfs42proc.c9
-rw-r--r--fs/nfs/super.c27
-rw-r--r--fs/nfs/write.c25
-rw-r--r--fs/nfsd/vfs.c31
-rw-r--r--fs/open.c2
-rw-r--r--fs/tracefs/inode.c31
20 files changed, 261 insertions, 144 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ef162dbd4bc..df8c99c99df9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1088,8 +1088,6 @@ struct btrfs_fs_info {
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
- /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */
- wait_queue_head_t zone_finish_wait;
/* Updates are not protected by any lock */
struct btrfs_commit_stats commit_stats;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 820b1f1e6b67..2633137c3e9f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3068,7 +3068,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
init_waitqueue_head(&fs_info->delayed_iputs_wait);
- init_waitqueue_head(&fs_info->zone_finish_wait);
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
@@ -4476,6 +4475,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
/*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ * We must do this before stopping the block group reclaim task, because
+ * at btrfs_relocate_block_group() we wait for this bit, and after the
+ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+ * return 1.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
+ /*
* We may have the reclaim task running and relocating a data block group,
* in which case it may create delayed iputs. So stop it before we park
* the cleaner kthread otherwise we can get new delayed iputs after
@@ -4493,12 +4503,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
- /*
- * If we had UNFINISHED_DROPS we could still be processing them, so
- * clear that bit and wake up relocation so it can stop.
- */
- btrfs_wake_unfinished_drop(fs_info);
-
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -4521,6 +4525,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ad250892028d..1372210869b1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1644,10 +1644,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
done_offset = end;
if (done_offset == start) {
- struct btrfs_fs_info *info = inode->root->fs_info;
-
- wait_var_event(&info->zone_finish_wait,
- !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags));
+ wait_on_bit_io(&inode->root->fs_info->flags,
+ BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
continue;
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d0cbeb7ae81c..435559ba94fa 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
if (flags & BTRFS_BLOCK_GROUP_DATA)
- return SZ_1G;
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return SZ_32M;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 064ab2a79c80..f63ff91e2883 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5267,6 +5267,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
ctl->stripe_size);
}
+ /* Stripe size should not go beyond 1G. */
+ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
/* Align to BTRFS_STRIPE_LEN */
ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
ctl->chunk_size = ctl->stripe_size * data_stripes;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b150b07ba1a7..73c6929f7be6 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
* since btrfs adds the pages one by one to a bio, and btrfs cannot
* increase the metadata reservation even if it increases the number of
* extents, it is safe to stick with the limit.
+ *
+ * With the zoned emulation, we can have non-zoned device on the zoned
+ * mode. In this case, we don't have a valid max zone append size. So,
+ * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
*/
- zone_info->max_zone_append_size =
- min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
- (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ if (bdev_is_zoned(bdev)) {
+ zone_info->max_zone_append_size = min_t(u64,
+ (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ } else {
+ zone_info->max_zone_append_size =
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT;
+ }
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
@@ -1178,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
* offset.
*/
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
- u64 *offset_ret)
+ u64 *offset_ret, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_root *root;
@@ -1188,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
int ret;
u64 length;
+ /*
+ * Avoid tree lookups for a new block group, there's no use for it.
+ * It must always be 0.
+ *
+ * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+ * For new a block group, this function is called from
+ * btrfs_make_block_group() which is already taking the chunk mutex.
+ * Thus, we cannot call calculate_alloc_pointer() which takes extent
+ * buffer locks to avoid deadlock.
+ */
+ if (new) {
+ *offset_ret = 0;
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1323,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
else
num_conventional++;
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
+
if (!is_sequential) {
alloc_offsets[i] = WP_CONVENTIONAL;
continue;
@@ -1389,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
__set_bit(i, active);
break;
}
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
}
if (num_sequential > 0)
cache->seq_zone = true;
if (num_conventional > 0) {
- /*
- * Avoid calling calculate_alloc_pointer() for new BG. It
- * is no use for new BG. It must be always 0.
- *
- * Also, we have a lock chain of extent buffer lock ->
- * chunk mutex. For new BG, this function is called from
- * btrfs_make_block_group() which is already taking the
- * chunk mutex. Thus, we cannot call
- * calculate_alloc_pointer() which takes extent buffer
- * locks to avoid deadlock.
- */
-
/* Zone capacity is always zone size in emulation */
cache->zone_capacity = cache->length;
- if (new) {
- cache->alloc_offset = 0;
- goto out;
- }
- ret = calculate_alloc_pointer(cache, &last_alloc);
- if (ret || map->num_stripes == num_conventional) {
- if (!ret)
- cache->alloc_offset = last_alloc;
- else
- btrfs_err(fs_info,
+ ret = calculate_alloc_pointer(cache, &last_alloc, new);
+ if (ret) {
+ btrfs_err(fs_info,
"zoned: failed to determine allocation offset of bg %llu",
- cache->start);
+ cache->start);
+ goto out;
+ } else if (map->num_stripes == num_conventional) {
+ cache->alloc_offset = last_alloc;
+ cache->zone_is_active = 1;
goto out;
}
}
@@ -1495,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- if (cache->zone_is_active) {
- btrfs_get_block_group(cache);
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
- }
-
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1526,10 +1528,16 @@ out:
ret = -EIO;
}
- if (!ret)
+ if (!ret) {
cache->meta_write_pointer = cache->alloc_offset + cache->start;
-
- if (ret) {
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+ } else {
kfree(cache->physical_map);
cache->physical_map = NULL;
}
@@ -1910,10 +1918,44 @@ out_unlock:
return ret;
}
+static void wait_eb_writebacks(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ const u64 end = block_group->start + block_group->length;
+ struct radix_tree_iter iter;
+ struct extent_buffer *eb;
+ void __rcu **slot;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
+ block_group->start >> fs_info->sectorsize_bits) {
+ eb = radix_tree_deref_slot(slot);
+ if (!eb)
+ continue;
+ if (radix_tree_deref_retry(eb)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ if (eb->start < block_group->start)
+ continue;
+ if (eb->start >= end)
+ break;
+
+ slot = radix_tree_iter_resume(slot, &iter);
+ rcu_read_unlock();
+ wait_on_extent_buffer_writeback(eb);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
+ const bool is_metadata = (block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
int ret = 0;
int i;
@@ -1924,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
}
/* Check if we have unwritten allocated space */
- if ((block_group->flags &
- (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+ if (is_metadata &&
block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
spin_unlock(&block_group->lock);
return -EAGAIN;
@@ -1950,6 +1991,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* No need to wait for NOCOW writers. Zoned mode does not allow that */
btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
block_group->length);
+ /* Wait for extent buffers to be written. */
+ if (is_metadata)
+ wait_eb_writebacks(block_group);
spin_lock(&block_group->lock);
@@ -2007,8 +2051,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* For active_bg_list */
btrfs_put_block_group(block_group);
- clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
- wake_up_all(&fs_info->zone_finish_wait);
+ clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
return 0;
}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 81f4c15936d0..5b4a7a32bdc5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 38
-#define CIFS_VERSION "2.38"
+#define SMB3_PRODUCT_BUILD 39
+#define CIFS_VERSION "2.39"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a0a06b6f252b..7ae6f2c08153 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -702,9 +702,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
int length = 0;
int total_read;
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
-
for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
@@ -760,7 +757,7 @@ int
cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
@@ -770,15 +767,13 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
ssize_t
cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
/*
* iov_iter_discard already sets smb_msg.type and count and iov_offset
* and cifs_readv_from_socket sets msg_control and msg_controllen
* so little to initialize in struct msghdr
*/
- smb_msg.msg_name = NULL;
- smb_msg.msg_namelen = 0;
iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
return cifs_readv_from_socket(server, &smb_msg);
@@ -788,7 +783,7 @@ int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
unsigned int page_offset, unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
@@ -2350,7 +2345,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
ses = tcon->ses;
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (--tcon->tc_count > 0) {
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return;
}
@@ -2359,6 +2356,7 @@ cifs_put_tcon(struct cifs_tcon *tcon)
WARN_ON(tcon->tc_count < 0);
list_del_init(&tcon->tcon_list);
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
/* cancel polling of interfaces */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa738adc031f..6f38b134a346 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3575,6 +3575,9 @@ static ssize_t __cifs_writev(
ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
+
+ cifs_revalidate_mapping(file->f_inode);
return __cifs_writev(iocb, from, true);
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c2fe035e573b..9a2753e21170 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -194,10 +194,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
*sent = 0;
- smb_msg->msg_name = (struct sockaddr *) &server->dstaddr;
- smb_msg->msg_namelen = sizeof(struct sockaddr);
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
if (server->noblocksnd)
smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
else
@@ -309,7 +305,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
__be32 rfc1002_marker;
if (cifs_rdma_enabled(server)) {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dcf0b8b4e93..232cfdf095ae 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -745,6 +745,28 @@ void debugfs_remove(struct dentry *dentry)
EXPORT_SYMBOL_GPL(debugfs_remove);
/**
+ * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
+ * @name: a pointer to a string containing the name of the item to look up.
+ * @parent: a pointer to the parent dentry of the item.
+ *
+ * This is the equlivant of doing something like
+ * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
+ * handled for the directory being looked up.
+ */
+void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_lookup(name, parent);
+ if (!dentry)
+ return;
+
+ debugfs_remove(dentry);
+ dput(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);
+
+/**
* debugfs_rename - rename a file/directory in the debugfs filesystem
* @old_dir: a pointer to the parent dentry for the renamed object. This
* should be a directory dentry.
diff --git a/fs/exec.c b/fs/exec.c
index 9a5ca7b82bfc..d046dbb9cbd0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,7 +65,6 @@
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
-#include <linux/time_namespace.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -979,12 +978,10 @@ static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
- bool vfork;
int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
- vfork = !!tsk->vfork_done;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
if (old_mm)
@@ -1029,10 +1026,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
-
- if (vfork)
- timens_on_fork(tsk->nsproxy, tsk);
-
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index ee0b7cf51157..41ae4cce1f42 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -270,8 +270,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct buffer_head *bh;
- sector_t blknr, last_blknr;
- int i;
+ sector_t blknr, last_blknr, i;
blknr = exfat_cluster_to_sector(sbi, clu);
last_blknr = blknr + sbi->sect_per_clus;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 27c720d71b4e..898dd95bc7a7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -606,6 +606,31 @@ static inline gfp_t nfs_io_gfp_mask(void)
return GFP_KERNEL;
}
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static inline int nfs_should_remove_suid(const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ /*
+ * sgid without any exec bits is just a mandatory locking mark; leave
+ * it alone. If some exec bits are set, it's a real sgid; kill it.
+ */
+ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+ kill |= ATTR_KILL_SGID;
+
+ if (unlikely(kill && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 068c45b3bc1a..6dab9e408372 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -78,10 +78,15 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
status = nfs4_call_sync(server->client, server, msg,
&args.seq_args, &res.seq_res, 0);
- if (status == 0)
+ if (status == 0) {
+ if (nfs_should_remove_suid(inode)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ spin_unlock(&inode->i_lock);
+ }
status = nfs_post_op_update_inode_force_wcc(inode,
res.falloc_fattr);
-
+ }
if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE])
trace_nfs4_fallocate(inode, &args, status);
else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82944e14fcea..ee66ffdb985e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1051,22 +1051,31 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
if (ctx->bsize)
sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
- if (server->nfs_client->rpc_ops->version != 2) {
- /* The VFS shouldn't apply the umask to mode bits. We will do
- * so ourselves when necessary.
+ switch (server->nfs_client->rpc_ops->version) {
+ case 2:
+ sb->s_time_gran = 1000;
+ sb->s_time_min = 0;
+ sb->s_time_max = U32_MAX;
+ break;
+ case 3:
+ /*
+ * The VFS shouldn't apply the umask to mode bits.
+ * We will do so ourselves when necessary.
*/
sb->s_flags |= SB_POSIXACL;
sb->s_time_gran = 1;
- sb->s_export_op = &nfs_export_ops;
- } else
- sb->s_time_gran = 1000;
-
- if (server->nfs_client->rpc_ops->version != 4) {
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
- } else {
+ sb->s_export_op = &nfs_export_ops;
+ break;
+ case 4:
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ sb->s_export_op = &nfs_export_ops;
+ break;
}
sb->s_magic = NFS_SUPER_MAGIC;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1843fa235d9b..f41d24b54fd1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1496,31 +1496,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-/*
- * Special version of should_remove_suid() that ignores capabilities.
- */
-static int nfs_should_remove_suid(const struct inode *inode)
-{
- umode_t mode = inode->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-
static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
struct nfs_fattr *fattr)
{
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9f486b788ed0..fc17b0ac8729 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -300,6 +300,10 @@ commit_metadata(struct svc_fh *fhp)
static void
nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
{
+ /* Ignore mode updates on symlinks */
+ if (S_ISLNK(inode->i_mode))
+ iap->ia_valid &= ~ATTR_MODE;
+
/* sanitize the mode change */
if (iap->ia_valid & ATTR_MODE) {
iap->ia_mode &= S_IALLUGO;
@@ -353,7 +357,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int accmode = NFSD_MAY_SATTR;
umode_t ftype = 0;
__be32 err;
- int host_err;
+ int host_err = 0;
bool get_write_count;
bool size_change = (iap->ia_valid & ATTR_SIZE);
@@ -391,13 +395,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
inode = d_inode(dentry);
- /* Ignore any mode updates on symlinks */
- if (S_ISLNK(inode->i_mode))
- iap->ia_valid &= ~ATTR_MODE;
-
- if (!iap->ia_valid)
- return 0;
-
nfsd_sanitize_attrs(inode, iap);
if (check_guard && guardtime != inode->i_ctime.tv_sec)
@@ -448,8 +445,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_unlock;
}
- iap->ia_valid |= ATTR_CTIME;
- host_err = notify_change(&init_user_ns, dentry, iap, NULL);
+ if (iap->ia_valid) {
+ iap->ia_valid |= ATTR_CTIME;
+ host_err = notify_change(&init_user_ns, dentry, iap, NULL);
+ }
out_unlock:
if (attr->na_seclabel && attr->na_seclabel->len)
@@ -846,10 +845,14 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct svc_rqst *rqstp = sd->u.data;
-
- svc_rqst_replace_page(rqstp, buf->page);
- if (rqstp->rq_res.page_len == 0)
- rqstp->rq_res.page_base = buf->offset;
+ struct page *page = buf->page; // may be a compound one
+ unsigned offset = buf->offset;
+
+ page += offset / PAGE_SIZE;
+ for (int i = sd->len; i > 0; i -= PAGE_SIZE)
+ svc_rqst_replace_page(rqstp, page++);
+ if (rqstp->rq_res.page_len == 0) // first call
+ rqstp->rq_res.page_base = offset % PAGE_SIZE;
rqstp->rq_res.page_len += sd->len;
return sd->len;
}
diff --git a/fs/open.c b/fs/open.c
index 8a813fa5ca56..cf7e5c350a54 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -716,6 +716,8 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
fs_userns = i_user_ns(inode);
retry_deleg:
+ newattrs.ia_vfsuid = INVALID_VFSUID;
+ newattrs.ia_vfsgid = INVALID_VFSGID;
newattrs.ia_valid = ATTR_CTIME;
if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
return -EINVAL;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 81d26abf486f..da85b3979195 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -141,6 +141,8 @@ struct tracefs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -241,6 +243,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = TRACEFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -275,24 +278,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
* but traditionally tracefs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int tracefs_apply_options(struct super_block *sb)
+static int tracefs_apply_options(struct super_block *sb, bool remount)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct tracefs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
+
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- inode->i_uid = opts->uid;
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
+ if (!remount || opts->opts & BIT(Opt_gid)) {
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
+ }
return 0;
}
@@ -307,7 +322,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, true);
fail:
return err;
@@ -359,7 +374,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &tracefs_super_operations;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, false);
return 0;