diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.h | 2 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 43 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 7 | ||||
-rw-r--r-- | fs/btrfs/space-info.c | 2 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 3 | ||||
-rw-r--r-- | fs/btrfs/zoned.c | 139 | ||||
-rw-r--r-- | fs/cifs/cifsfs.h | 4 | ||||
-rw-r--r-- | fs/cifs/connect.c | 14 | ||||
-rw-r--r-- | fs/cifs/file.c | 3 | ||||
-rw-r--r-- | fs/cifs/transport.c | 6 | ||||
-rw-r--r-- | fs/debugfs/inode.c | 22 | ||||
-rw-r--r-- | fs/exec.c | 7 | ||||
-rw-r--r-- | fs/exfat/fatent.c | 3 | ||||
-rw-r--r-- | fs/nfs/internal.h | 25 | ||||
-rw-r--r-- | fs/nfs/nfs42proc.c | 9 | ||||
-rw-r--r-- | fs/nfs/super.c | 27 | ||||
-rw-r--r-- | fs/nfs/write.c | 25 | ||||
-rw-r--r-- | fs/nfsd/vfs.c | 31 | ||||
-rw-r--r-- | fs/open.c | 2 | ||||
-rw-r--r-- | fs/tracefs/inode.c | 31 |
20 files changed, 261 insertions, 144 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ef162dbd4bc..df8c99c99df9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1088,8 +1088,6 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; - /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ - wait_queue_head_t zone_finish_wait; /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 820b1f1e6b67..2633137c3e9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3068,7 +3068,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); - init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; @@ -4476,6 +4475,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + * We must do this before stopping the block group reclaim task, because + * at btrfs_relocate_block_group() we wait for this bit, and after the + * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we + * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will + * return 1. + */ + btrfs_wake_unfinished_drop(fs_info); + + /* * We may have the reclaim task running and relocating a data block group, * in which case it may create delayed iputs. 
So stop it before we park * the cleaner kthread otherwise we can get new delayed iputs after @@ -4493,12 +4503,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); - /* - * If we had UNFINISHED_DROPS we could still be processing them, so - * clear that bit and wake up relocation so it can stop. - */ - btrfs_wake_unfinished_drop(fs_info); - /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -4521,6 +4525,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + /* + * After we parked the cleaner kthread, ordered extents may have + * completed and created new delayed iputs. If one of the async reclaim + * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we + * can hang forever trying to stop it, because if a delayed iput is + * added after it ran btrfs_run_delayed_iputs() and before it called + * btrfs_wait_on_delayed_iputs(), it will hang forever since there is + * no one else to run iputs. + * + * So wait for all ongoing ordered extents to complete and then run + * delayed iputs. This works because once we reach this point no one + * can either create new ordered extents nor create delayed iputs + * through some other means. + * + * Also note that btrfs_wait_ordered_roots() is not safe here, because + * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, + * but the delayed iput for the respective inode is made only when doing + * the final btrfs_put_ordered_extent() (which must happen at + * btrfs_finish_ordered_io() when we are unmounting). + */ + btrfs_flush_workqueue(fs_info->endio_write_workers); + /* Ordered extents for free space inodes. 
*/ + btrfs_flush_workqueue(fs_info->endio_freespace_worker); + btrfs_run_delayed_iputs(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ad250892028d..1372210869b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1644,10 +1644,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, done_offset = end; if (done_offset == start) { - struct btrfs_fs_info *info = inode->root->fs_info; - - wait_var_event(&info->zone_finish_wait, - !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); continue; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d0cbeb7ae81c..435559ba94fa 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); if (flags & BTRFS_BLOCK_GROUP_DATA) - return SZ_1G; + return BTRFS_MAX_DATA_CHUNK_SIZE; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return SZ_32M; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 064ab2a79c80..f63ff91e2883 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5267,6 +5267,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, ctl->stripe_size); } + /* Stripe size should not go beyond 1G. 
*/ + ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); + /* Align to BTRFS_STRIPE_LEN */ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); ctl->chunk_size = ctl->stripe_size * data_stripes; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b150b07ba1a7..73c6929f7be6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * since btrfs adds the pages one by one to a bio, and btrfs cannot * increase the metadata reservation even if it increases the number of * extents, it is safe to stick with the limit. + * + * With the zoned emulation, we can have non-zoned device on the zoned + * mode. In this case, we don't have a valid max zone append size. So, + * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. */ - zone_info->max_zone_append_size = - min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + if (bdev_is_zoned(bdev)) { + zone_info->max_zone_append_size = min_t(u64, + (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + } else { + zone_info->max_zone_append_size = + (u64)bdev_max_segments(bdev) << PAGE_SHIFT; + } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -1178,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) * offset. */ static int calculate_alloc_pointer(struct btrfs_block_group *cache, - u64 *offset_ret) + u64 *offset_ret, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; @@ -1188,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, int ret; u64 length; + /* + * Avoid tree lookups for a new block group, there's no use for it. + * It must always be 0. + * + * Also, we have a lock chain of extent buffer lock -> chunk mutex. 
+ * For a new block group, this function is called from + * btrfs_make_block_group() which is already taking the chunk mutex. + * Thus, we cannot call calculate_alloc_pointer() which takes extent + * buffer locks to avoid deadlock. + */ + if (new) { + *offset_ret = 0; + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1323,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) else num_conventional++; + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); + if (!is_sequential) { alloc_offsets[i] = WP_CONVENTIONAL; continue; @@ -1389,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) __set_bit(i, active); break; } - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); } if (num_sequential > 0) cache->seq_zone = true; if (num_conventional > 0) { - /* - * Avoid calling calculate_alloc_pointer() for new BG. It - * is no use for new BG. It must be always 0. - * - * Also, we have a lock chain of extent buffer lock -> - * chunk mutex. For new BG, this function is called from - * btrfs_make_block_group() which is already taking the - * chunk mutex. Thus, we cannot call - * calculate_alloc_pointer() which takes extent buffer - * locks to avoid deadlock. 
- */ - /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; - if (new) { - cache->alloc_offset = 0; - goto out; - } - ret = calculate_alloc_pointer(cache, &last_alloc); - if (ret || map->num_stripes == num_conventional) { - if (!ret) - cache->alloc_offset = last_alloc; - else - btrfs_err(fs_info, + ret = calculate_alloc_pointer(cache, &last_alloc, new); + if (ret) { + btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", - cache->start); + cache->start); + goto out; + } else if (map->num_stripes == num_conventional) { + cache->alloc_offset = last_alloc; + cache->zone_is_active = 1; goto out; } } @@ -1495,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - if (cache->zone_is_active) { - btrfs_get_block_group(cache); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } - out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1526,10 +1528,16 @@ out: ret = -EIO; } - if (!ret) + if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - - if (ret) { + if (cache->zone_is_active) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + } else { kfree(cache->physical_map); cache->physical_map = NULL; } @@ -1910,10 +1918,44 @@ out_unlock: return ret; } +static void wait_eb_writebacks(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = 
radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; + + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; int i; @@ -1924,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } /* Check if we have unwritten allocated space */ - if ((block_group->flags & - (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + if (is_metadata && block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; @@ -1950,6 +1991,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* No need to wait for NOCOW writers. Zoned mode does not allow that */ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, block_group->length); + /* Wait for extent buffers to be written. 
*/ + if (is_metadata) + wait_eb_writebacks(block_group); spin_lock(&block_group->lock); @@ -2007,8 +2051,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* For active_bg_list */ btrfs_put_block_group(block_group); - clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); - wake_up_all(&fs_info->zone_finish_wait); + clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return 0; } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 81f4c15936d0..5b4a7a32bdc5 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 38 -#define CIFS_VERSION "2.38" +#define SMB3_PRODUCT_BUILD 39 +#define CIFS_VERSION "2.39" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a0a06b6f252b..7ae6f2c08153 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -702,9 +702,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) int length = 0; int total_read; - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; - for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); @@ -760,7 +757,7 @@ int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read) { - struct msghdr smb_msg; + struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); @@ -770,15 +767,13 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) { - struct msghdr smb_msg; + struct msghdr smb_msg = {}; /* * iov_iter_discard already sets smb_msg.type and count and iov_offset * and cifs_readv_from_socket sets msg_control and msg_controllen * so little to initialize in 
struct msghdr */ - smb_msg.msg_name = NULL; - smb_msg.msg_namelen = 0; iov_iter_discard(&smb_msg.msg_iter, READ, to_read); return cifs_readv_from_socket(server, &smb_msg); @@ -788,7 +783,7 @@ int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) { - struct msghdr smb_msg; + struct msghdr smb_msg = {}; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); @@ -2350,7 +2345,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) ses = tcon->ses; cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (--tcon->tc_count > 0) { + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); return; } @@ -2359,6 +2356,7 @@ cifs_put_tcon(struct cifs_tcon *tcon) WARN_ON(tcon->tc_count < 0); list_del_init(&tcon->tcon_list); + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); /* cancel polling of interfaces */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fa738adc031f..6f38b134a346 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3575,6 +3575,9 @@ static ssize_t __cifs_writev( ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; + + cifs_revalidate_mapping(file->f_inode); return __cifs_writev(iocb, from, true); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c2fe035e573b..9a2753e21170 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -194,10 +194,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, *sent = 0; - smb_msg->msg_name = (struct sockaddr *) &server->dstaddr; - smb_msg->msg_namelen = sizeof(struct sockaddr); - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; if (server->noblocksnd) smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; else @@ -309,7 +305,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, 
sigset_t mask, oldmask; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; - struct msghdr smb_msg; + struct msghdr smb_msg = {}; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3dcf0b8b4e93..232cfdf095ae 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -745,6 +745,28 @@ void debugfs_remove(struct dentry *dentry) EXPORT_SYMBOL_GPL(debugfs_remove); /** + * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it + * @name: a pointer to a string containing the name of the item to look up. + * @parent: a pointer to the parent dentry of the item. + * + * This is the equivalent of doing something like + * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting + * handled for the directory being looked up. + */ +void debugfs_lookup_and_remove(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + + dentry = debugfs_lookup(name, parent); + if (!dentry) + return; + + debugfs_remove(dentry); + dput(dentry); +} +EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); + +/** * debugfs_rename - rename a file/directory in the debugfs filesystem * @old_dir: a pointer to the parent dentry for the renamed object. This * should be a directory dentry. 
diff --git a/fs/exec.c b/fs/exec.c index 9a5ca7b82bfc..d046dbb9cbd0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,7 +65,6 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> -#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -979,12 +978,10 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; - bool vfork; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; - vfork = !!tsk->vfork_done; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) @@ -1029,10 +1026,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); - - if (vfork) - timens_on_fork(tsk->nsproxy, tsk); - if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index ee0b7cf51157..41ae4cce1f42 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -270,8 +270,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; - sector_t blknr, last_blknr; - int i; + sector_t blknr, last_blknr, i; blknr = exfat_cluster_to_sector(sbi, clu); last_blknr = blknr + sbi->sect_per_clus; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 27c720d71b4e..898dd95bc7a7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -606,6 +606,31 @@ static inline gfp_t nfs_io_gfp_mask(void) return GFP_KERNEL; } +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static inline int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. 
If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 068c45b3bc1a..6dab9e408372 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -78,10 +78,15 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); - if (status == 0) + if (status == 0) { + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + spin_unlock(&inode->i_lock); + } status = nfs_post_op_update_inode_force_wcc(inode, res.falloc_fattr); - + } if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE]) trace_nfs4_fallocate(inode, &args, status); else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 82944e14fcea..ee66ffdb985e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1051,22 +1051,31 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) if (ctx->bsize) sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits); - if (server->nfs_client->rpc_ops->version != 2) { - /* The VFS shouldn't apply the umask to mode bits. We will do - * so ourselves when necessary. + switch (server->nfs_client->rpc_ops->version) { + case 2: + sb->s_time_gran = 1000; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; + break; + case 3: + /* + * The VFS shouldn't apply the umask to mode bits. + * We will do so ourselves when necessary. 
*/ sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; - sb->s_export_op = &nfs_export_ops; - } else - sb->s_time_gran = 1000; - - if (server->nfs_client->rpc_ops->version != 4) { sb->s_time_min = 0; sb->s_time_max = U32_MAX; - } else { + sb->s_export_op = &nfs_export_ops; + break; + case 4: + sb->s_flags |= SB_POSIXACL; + sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + sb->s_export_op = &nfs_export_ops; + break; } sb->s_magic = NFS_SUPER_MAGIC; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1843fa235d9b..f41d24b54fd1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1496,31 +1496,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Special version of should_remove_suid() that ignores capabilities. - */ -static int nfs_should_remove_suid(const struct inode *inode) -{ - umode_t mode = inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. 
- */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && S_ISREG(mode))) - return kill; - - return 0; -} - static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, struct nfs_fattr *fattr) { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9f486b788ed0..fc17b0ac8729 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -300,6 +300,10 @@ commit_metadata(struct svc_fh *fhp) static void nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { + /* Ignore mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; @@ -353,7 +357,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int accmode = NFSD_MAY_SATTR; umode_t ftype = 0; __be32 err; - int host_err; + int host_err = 0; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); @@ -391,13 +395,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; inode = d_inode(dentry); - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - return 0; - nfsd_sanitize_attrs(inode, iap); if (check_guard && guardtime != inode->i_ctime.tv_sec) @@ -448,8 +445,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_unlock; } - iap->ia_valid |= ATTR_CTIME; - host_err = notify_change(&init_user_ns, dentry, iap, NULL); + if (iap->ia_valid) { + iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(&init_user_ns, dentry, iap, NULL); + } out_unlock: if (attr->na_seclabel && attr->na_seclabel->len) @@ -846,10 +845,14 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - - svc_rqst_replace_page(rqstp, buf->page); - if (rqstp->rq_res.page_len == 0) - rqstp->rq_res.page_base = buf->offset; + struct page *page = buf->page; // may be 
a compound one + unsigned offset = buf->offset; + + page += offset / PAGE_SIZE; + for (int i = sd->len; i > 0; i -= PAGE_SIZE) + svc_rqst_replace_page(rqstp, page++); + if (rqstp->rq_res.page_len == 0) // first call + rqstp->rq_res.page_base = offset % PAGE_SIZE; rqstp->rq_res.page_len += sd->len; return sd->len; } diff --git a/fs/open.c b/fs/open.c index 8a813fa5ca56..cf7e5c350a54 100644 --- a/fs/open.c +++ b/fs/open.c @@ -716,6 +716,8 @@ int chown_common(const struct path *path, uid_t user, gid_t group) fs_userns = i_user_ns(inode); retry_deleg: + newattrs.ia_vfsuid = INVALID_VFSUID; + newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 81d26abf486f..da85b3979195 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -141,6 +141,8 @@ struct tracefs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; + /* Opt_* bitfield. */ + unsigned int opts; }; enum { @@ -241,6 +243,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) kgid_t gid; char *p; + opts->opts = 0; opts->mode = TRACEFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { @@ -275,24 +278,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) * but traditionally tracefs has ignored all mount options */ } + + opts->opts |= BIT(token); } return 0; } -static int tracefs_apply_options(struct super_block *sb) +static int tracefs_apply_options(struct super_block *sb, bool remount) { struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + /* + * On remount, only reset mode/uid/gid if they were provided as mount + * options. 
+ */ + + if (!remount || opts->opts & BIT(Opt_mode)) { + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + } - inode->i_uid = opts->uid; + if (!remount || opts->opts & BIT(Opt_uid)) + inode->i_uid = opts->uid; - /* Set all the group ids to the mount option */ - set_gid(sb->s_root, opts->gid); + if (!remount || opts->opts & BIT(Opt_gid)) { + /* Set all the group ids to the mount option */ + set_gid(sb->s_root, opts->gid); + } return 0; } @@ -307,7 +322,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data) if (err) goto fail; - tracefs_apply_options(sb); + tracefs_apply_options(sb, true); fail: return err; @@ -359,7 +374,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &tracefs_super_operations; - tracefs_apply_options(sb); + tracefs_apply_options(sb, false); return 0; |