author     Linus Torvalds  2024-01-08 10:26:08 -0800
committer  Linus Torvalds  2024-01-08 10:26:08 -0800
commit     c604110e662a54568073a03176402b624e740310 (patch)
tree       3121f7a3e57d9cff898029245ad94048a655c792 /fs
parent     1ab33c03145d0f6c345823fc2da935d9a1a9e9fc (diff)
parent     dd8f87f21dc3da2eaf46e7401173f935b90b13a8 (diff)
Merge tag 'vfs-6.8.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull misc vfs updates from Christian Brauner:
"This contains the usual miscellaneous features, cleanups, and fixes
for vfs and individual fses.

Features:
- Add Jan Kara as VFS reviewer
- Show correct device and inode numbers in proc/<pid>/maps for vma
files on stacked filesystems. This is now easily doable thanks to
the backing file work from the last cycles. This comes with
selftests

Cleanups:
- Remove a redundant might_sleep() from wait_on_inode()
- Initialize pointer with NULL, not 0
- Clarify comment on access_override_creds()
- Rework and simplify eventfd_signal() and eventfd_signal_mask()
helpers
- Process aio completions in batches to avoid needless wakeups
- Completely decouple struct mnt_idmap from namespaces. We now only
keep the actual idmapping around and don't stash references to
namespaces
- Reformat maintainer entries to indicate that a given subsystem
belongs to fs/
- Simplify fput() for files that were never opened
- Get rid of various pointless file helpers
- Rename various file helpers
- Rename struct file members after SLAB_TYPESAFE_BY_RCU switch from
last cycle
- Make relatime_need_update() return bool
- Use GFP_KERNEL instead of GFP_USER when allocating superblocks
- Replace deprecated ida_simple_*() calls with their current ida_*()
counterparts

Fixes:
- Fix comments on user namespace id mapping helpers. They aren't
kernel doc comments so they shouldn't be using /**
- s/Retuns/Returns/g in various places
- Add missing parameter documentation on can_move_mount_beneath()
- Rename i_mapping->private_data to i_mapping->i_private_data
- Fix a false-positive lockdep warning in pipe_write() for watch
queues
- Improve __fget_files_rcu() code generation to improve performance
- Only notify writer that pipe resizing has finished after setting
pipe->max_usage otherwise writers are never notified that the pipe
has been resized and hang
- Fix some kernel docs in hfsplus
- s/passs/pass/g in various places
- Fix kernel docs in ntfs
- Fix kcalloc() arguments order reported by gcc 14
- Fix uninitialized value in reiserfs"
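The eventfd rework called out under Cleanups drops the counter argument from the signalling helpers. A minimal caller-side sketch of that change, taken from the fs/aio.c and fs/eventfd.c hunks further down (illustrative fragment, not a complete driver):

    /* Before this series: callers passed an explicit increment and the
     * helper returned how much was actually added. */
    eventfd_signal(iocb->ki_eventfd, 1);

    /* After: the increment argument is gone.  eventfd_signal_mask() bumps
     * the counter by one unless it already sits at ULLONG_MAX, which
     * poll(2) then reports as EPOLLERR. */
    eventfd_signal(iocb->ki_eventfd);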
* tag 'vfs-6.8.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (36 commits)
reiserfs: fix uninit-value in comp_keys
watch_queue: fix kcalloc() arguments order
ntfs: dir.c: fix kernel-doc function parameter warnings
fs: fix doc comment typo fs tree wide
selftests/overlayfs: verify device and inode numbers in /proc/pid/maps
fs/proc: show correct device and inode numbers in /proc/pid/maps
eventfd: Remove usage of the deprecated ida_simple_xx() API
fs: super: use GFP_KERNEL instead of GFP_USER for super block allocation
fs/hfsplus: wrapper.c: fix kernel-doc warnings
fs: add Jan Kara as reviewer
fs/inode: Make relatime_need_update return bool
pipe: wakeup wr_wait after setting max_usage
file: remove __receive_fd()
file: stop exposing receive_fd_user()
fs: replace f_rcuhead with f_task_work
file: remove pointless wrapper
file: s/close_fd_get_file()/file_close_fd()/g
Improve __fget_files_rcu() code generation (and thus __fget_light())
file: massage cleanup of files that failed to open
fs/pipe: Fix lockdep false-positive in watchqueue pipe_write()
...
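The pipe fix listed above ("pipe: wakeup wr_wait after setting max_usage") is purely an ordering change: the writer wakeup must happen only after pipe->max_usage reflects the resized ring, otherwise a writer woken mid-resize re-checks the old limit and sleeps again with no further wakeup coming. A rough sketch of the intended ordering, with function and field names assumed from fs/pipe.c (the fs/pipe.c hunk itself lies beyond the portion of the diff quoted below):

    /* pipe_resize_ring(), corrected ordering (sketch): */
    pipe->max_usage = nr_slots;             /* publish the new limit first */
    pipe->nr_accounted = nr_slots;
    /* ... swap in the resized ring buffer ... */
    wake_up_interruptible(&pipe->wr_wait);  /* only now wake sleeping writers */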
Diffstat (limited to 'fs')
-rw-r--r--   fs/aio.c              |  85
-rw-r--r--   fs/attr.c             |   2
-rw-r--r--   fs/btrfs/extent_io.c  |  52
-rw-r--r--   fs/btrfs/subpage.c    |   4
-rw-r--r--   fs/buffer.c           | 108
-rw-r--r--   fs/dax.c              |   2
-rw-r--r--   fs/direct-io.c        |   2
-rw-r--r--   fs/eventfd.c          |  46
-rw-r--r--   fs/ext4/inode.c       |   4
-rw-r--r--   fs/file.c             |  97
-rw-r--r--   fs/file_table.c       |  22
-rw-r--r--   fs/gfs2/glock.c       |   2
-rw-r--r--   fs/gfs2/ops_fstype.c  |   2
-rw-r--r--   fs/hfsplus/wrapper.c  |   5
-rw-r--r--   fs/hugetlbfs/inode.c  |   4
-rw-r--r--   fs/inode.c            |  22
-rw-r--r--   fs/internal.h         |   3
-rw-r--r--   fs/mnt_idmapping.c    | 159
-rw-r--r--   fs/namei.c            |  31
-rw-r--r--   fs/namespace.c        |   3
-rw-r--r--   fs/nfs/write.c        |  12
-rw-r--r--   fs/nilfs2/inode.c     |   4
-rw-r--r--   fs/ntfs/aops.c        |  10
-rw-r--r--   fs/ntfs/dir.c         |   3
-rw-r--r--   fs/open.c             |   5
-rw-r--r--   fs/pipe.c             |  24
-rw-r--r--   fs/posix_acl.c        |   4
-rw-r--r--   fs/proc/task_mmu.c    |   3
-rw-r--r--   fs/reiserfs/stree.c   |   2
-rw-r--r--   fs/stat.c             |   2
-rw-r--r--   fs/super.c            |   2
31 files changed, 408 insertions, 318 deletions
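Most of the churn in the hunks that follow is the mechanical rename of the address_space private fields: private_lock, private_list and private_data become i_private_lock, i_private_list and i_private_data. Every user follows the same pattern, as in this condensed fragment drawn from the fs/aio.c and fs/buffer.c hunks below:

    spin_lock(&mapping->i_private_lock);        /* was ->private_lock */
    mapping->i_private_data = ctx;              /* was ->private_data */
    list_move_tail(&bh->b_assoc_buffers,
                   &mapping->i_private_list);   /* was ->private_list */
    spin_unlock(&mapping->i_private_lock);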
@@ -266,7 +266,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; - inode->i_mapping->private_data = ctx; + inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", @@ -316,10 +316,10 @@ static void put_aio_ring_file(struct kioctx *ctx) /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; - spin_lock(&i_mapping->private_lock); - i_mapping->private_data = NULL; + spin_lock(&i_mapping->i_private_lock); + i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&i_mapping->private_lock); + spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } @@ -422,9 +422,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, rc = 0; - /* mapping->private_lock here protects against the kioctx teardown. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; + /* mapping->i_private_lock here protects against the kioctx teardown. */ + spin_lock(&mapping->i_private_lock); + ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; @@ -476,7 +476,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, out_unlock: mutex_unlock(&ctx->ring_lock); out: - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return rc; } #else @@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } +struct aio_waiter { + struct wait_queue_entry w; + size_t min_nr; +}; + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; - unsigned tail, pos, head; + unsigned tail, pos, head, avail; unsigned long flags; /* @@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); + + avail = tail > head + ? tail - head + : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1166,7 +1175,7 @@ static void aio_complete(struct aio_kiocb *iocb) * from IRQ context. */ if (iocb->ki_eventfd) - eventfd_signal(iocb->ki_eventfd, 1); + eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test @@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->wait)) { + struct aio_waiter *curr, *next; + unsigned long flags; + + spin_lock_irqsave(&ctx->wait.lock, flags); + list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) + if (avail >= curr->min_nr) { + list_del_init_careful(&curr->w.entry); + wake_up_process(curr->w.private); + } + spin_unlock_irqrestore(&ctx->wait.lock, flags); + } } static inline void iocb_put(struct aio_kiocb *iocb) @@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { - long ret = 0; + struct hrtimer_sleeper t; + struct aio_waiter w; + long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. 
@@ -1306,12 +1327,38 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - if (until == 0) - aio_read_events(ctx, min_nr, nr, event, &ret); - else - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), - until); + aio_read_events(ctx, min_nr, nr, event, &ret); + if (until == 0 || ret < 0 || ret >= min_nr) + return ret; + + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + if (until != KTIME_MAX) { + hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); + } + + init_wait(&w.w); + + while (1) { + unsigned long nr_got = ret; + + w.min_nr = min_nr - ret; + + ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); + if (!ret2 && !t.task) + ret2 = -ETIME; + + if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) + break; + + if (nr_got == ret) + schedule(); + } + + finish_wait(&ctx->wait, &w.w); + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + return ret; } diff --git a/fs/attr.c b/fs/attr.c index bdf5deb06ea9..5a13f0c8495f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -157,7 +157,7 @@ static bool chgrp_ok(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f724c54fc8e..b6ff6f320198 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -875,7 +875,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * will not race with any other ebs. */ if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) @@ -1741,16 +1741,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. 
*/ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); bit_start++; continue; } @@ -1764,7 +1764,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1816,9 +1816,9 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -1829,16 +1829,16 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) * crashing the machine for something we can survive anyway. */ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -3062,7 +3062,7 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (PagePrivate(page)) { subpage = (struct btrfs_subpage *)page->private; @@ -3085,14 +3085,14 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * be done under the i_private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3116,7 +3116,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag detach_page_private(page); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3139,7 +3139,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ @@ -3520,7 +3520,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. + * allocate memory with i_private_lock nor page lock hold. 
* * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3541,10 +3541,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, goto free_eb; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); unlock_page(p); put_page(p); mark_extent_buffer_accessed(exists, p); @@ -3564,7 +3564,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * Thus needs no special handling in error path. */ btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; @@ -4569,12 +4569,12 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared page private, as if we have * released all ebs in the page, the page private should be cleared now. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return ret; } @@ -4590,9 +4590,9 @@ int try_release_extent_buffer(struct page *page) * We need to make sure nobody is changing page->private, as we rely on * page->private as the pointer to extent buffer. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 1; } @@ -4607,10 +4607,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1b999c6e4193..2347cf15278b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -200,7 +200,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; atomic_inc(&subpage->eb_refs); @@ -215,7 +215,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; ASSERT(atomic_read(&subpage->eb_refs)); diff --git a/fs/buffer.c b/fs/buffer.c index 967f34b70aa8..5ffc44ab4854 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync); * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's - * private_lock. + * i_private_lock. 
* - * Hack idea: for the blockdev mapping, private_lock contention + * Hack idea: for the blockdev mapping, i_private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. + * succeeds, there is no need to take i_private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -204,7 +204,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->private_lock); + spin_lock(&bd_mapping->i_private_lock); head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << bd_inode->i_blkbits); } out_unlock: - spin_unlock(&bd_mapping->private_lock); + spin_unlock(&bd_mapping->i_private_lock); folio_put(folio); out: return ret; @@ -467,25 +467,25 @@ EXPORT_SYMBOL(mark_buffer_async_write); * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->private_list. + * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for private_list is via the private_lock in the address_space + * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, - * mapping->private_lock does *not* protect mapping->private_list! In fact, - * mapping->private_list will always be protected by the backing blockdev's - * ->private_lock. + * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, + * mapping->i_private_list will always be protected by the backing blockdev's + * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's - * ->private_list must be from the same address_space: the blockdev's. + * ->i_private_list must be from the same address_space: the blockdev's. * - * address_spaces which do not place buffers at ->private_list via these - * utility functions are free to use private_lock and private_list for - * whatever they want. The only requirement is that list_empty(private_list) + * address_spaces which do not place buffers at ->i_private_list via these + * utility functions are free to use i_private_lock and i_private_list for + * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). 
The @@ -508,7 +508,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); */ /* - * The buffer's backing address_space's private_lock must be held + * The buffer's backing address_space's i_private_lock must be held */ static void __remove_assoc_queue(struct buffer_head *bh) { @@ -519,7 +519,7 @@ static void __remove_assoc_queue(struct buffer_head *bh) int inode_has_buffers(struct inode *inode) { - return !list_empty(&inode->i_data.private_list); + return !list_empty(&inode->i_data.i_private_list); } /* @@ -561,7 +561,7 @@ repeat: * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * - * Starts I/O against the buffers at mapping->private_list, and waits upon + * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). @@ -570,13 +570,13 @@ repeat: */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->private_data; + struct address_space *buffer_mapping = mapping->i_private_data; - if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0; - return fsync_buffers_list(&buffer_mapping->private_lock, - &mapping->private_list); + return fsync_buffers_list(&buffer_mapping->i_private_lock, + &mapping->i_private_list); } EXPORT_SYMBOL(sync_mapping_buffers); @@ -673,17 +673,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); - if (!mapping->private_data) { - mapping->private_data = buffer_mapping; + if (!mapping->i_private_data) { + mapping->i_private_data = buffer_mapping; } else { - BUG_ON(mapping->private_data != buffer_mapping); + BUG_ON(mapping->i_private_data != buffer_mapping); } if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_move_tail(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); @@ -706,7 +706,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * We use private_lock to lock against try_to_free_buffers while using the + * We use i_private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. 
* @@ -718,7 +718,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) struct buffer_head *head; bool newly_dirty; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -734,7 +734,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) */ folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); @@ -827,7 +827,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; } spin_unlock(lock); @@ -851,7 +851,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * - * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which * assumes that all the buffers are against the blockdev. Not true * for reiserfs. */ @@ -859,13 +859,13 @@ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(invalidate_inode_buffers); @@ -882,10 +882,10 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { @@ -894,7 +894,7 @@ int remove_inode_buffers(struct inode *inode) } __remove_assoc_queue(bh); } - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } return ret; } @@ -1064,11 +1064,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ - spin_lock(&inode->i_mapping->private_lock); + spin_lock(&inode->i_mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, (sector_t)index << sizebits, size); - spin_unlock(&inode->i_mapping->private_lock); + spin_unlock(&inode->i_mapping->i_private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; failed: @@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. 
It takes bh->b_folio->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh) if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } __brelse(bh); } @@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * block_dirty_folio() via private_lock. try_to_free_buffers + * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. */ struct buffer_head *create_empty_buffers(struct folio *folio, @@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh); tail->b_this_page = head; - spin_lock(&folio->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { @@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh != head); } folio_attach_private(folio, head); - spin_unlock(&folio->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return head; } @@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) if (!folio_buffers(folio)) continue; /* - * We use folio lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ @@ -2883,7 +2883,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the folio or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's i_private_lock. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio @@ -2894,7 +2894,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with - * private_lock. + * i_private_lock. * * try_to_free_buffers() is non-blocking. */ @@ -2946,7 +2946,7 @@ bool try_to_free_buffers(struct folio *folio) goto out; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* @@ -2959,13 +2959,13 @@ bool try_to_free_buffers(struct folio *folio) * the folio's buffers clean. We discover that here and clean * the folio also. * - * private_lock must be held over this entire operation in order + * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. 
*/ if (ret) folio_cancel_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; @@ -1128,7 +1128,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; - void *saddr = 0; + void *saddr = NULL; int ret = 0; if (!zero_edge) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 20533266ade6..60456263a338 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1114,7 +1114,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, loff_t offset = iocb->ki_pos; const loff_t end = offset + count; struct dio *dio; - struct dio_submit sdio = { 0, }; + struct dio_submit sdio = { NULL, }; struct buffer_head map_bh = { 0, }; struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); diff --git a/fs/eventfd.c b/fs/eventfd.c index 33a918f9566c..ad8186d47ba7 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,7 +43,17 @@ struct eventfd_ctx { int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) +/** + * eventfd_signal_mask - Increment the event counter + * @ctx: [in] Pointer to the eventfd context. + * @mask: [in] poll mask + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + */ +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { unsigned long flags; @@ -56,45 +66,23 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) * safe context. */ if (WARN_ON_ONCE(current->in_eventfd)) - return 0; + return; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; - if (ULLONG_MAX - ctx->count < n) - n = ULLONG_MAX - ctx->count; - ctx->count += n; + if (ctx->count < ULLONG_MAX) + ctx->count++; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); - - return n; -} - -/** - * eventfd_signal - Adds @n to the eventfd counter. - * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. 
- */ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) -{ - return eventfd_signal_mask(ctx, n, 0); } -EXPORT_SYMBOL_GPL(eventfd_signal); +EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) - ida_simple_remove(&eventfd_ida, ctx->id); + ida_free(&eventfd_ida, ctx->id); kfree(ctx); } @@ -407,7 +395,7 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); + ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f8722..0558c8c986d4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1261,7 +1261,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, @@ -3213,7 +3213,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? */ - if (!list_empty(&inode->i_mapping->private_list)) + if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } diff --git a/fs/file.c b/fs/file.c index 5fb0b146e79e..3b683b9101d8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,19 +629,23 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); /** - * pick_file - return file associatd with fd + * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * + * Doesn't take a separate reference count. + * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ -static struct file *pick_file(struct files_struct *files, unsigned fd) +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; + lockdep_assert_held(&files->file_lock); + if (fd >= fdt->max_fds) return NULL; @@ -660,7 +664,7 @@ int close_fd(unsigned fd) struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; @@ -707,7 +711,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); @@ -795,26 +799,21 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) return 0; } -/* - * See close_fd_get_file() below, this variant assumes current->files->file_lock - * is held. - */ -struct file *__close_fd_get_file(unsigned int fd) -{ - return pick_file(current->files, fd); -} - -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file. +/** + * file_close_fd - return file associated with fd + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. 
+ * + * Returns: The file associated with @fd (NULL if @fd is not open) */ -struct file *close_fd_get_file(unsigned int fd) +struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; @@ -959,31 +958,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; + unsigned long nospec_mask; - if (unlikely(fd >= fdt->max_fds)) - return NULL; - - fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + /* Mask is a 0 for invalid fd's, ~0 for valid ones */ + nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* - * Ok, we have a file pointer. However, because we do - * this all locklessly under RCU, we may be racing with - * that file being closed. - * - * Such a race can take two forms: - * - * (a) the file ref already went down to zero and the - * file hasn't been reused yet or the file count - * isn't zero but the file has already been reused. + * fdentry points to the 'fd' offset, or fdt->fd[0]. + * Loading from fdt->fd[0] is always safe, because the + * array always exists. */ - file = __get_file_rcu(fdentry); + fdentry = fdt->fd + (fd & nospec_mask); + + /* Do the load, then mask any invalid result */ + file = rcu_dereference_raw(*fdentry); + file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; - if (unlikely(IS_ERR(file))) + /* + * Ok, we have a file pointer that was valid at + * some point, but it might have become stale since. + * + * We need to confirm it by incrementing the refcount + * and then check the lookup again. + * + * atomic_long_inc_not_zero() gives us a full memory + * barrier. We only really need an 'acquire' one to + * protect the loads below, but we don't have that. + */ + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) continue; /* + * Such a race can take two forms: + * + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. + * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes @@ -991,7 +1004,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { + if (unlikely(file != rcu_dereference_raw(*fdentry)) || + unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } @@ -1128,13 +1142,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). 
*/ - if (atomic_read_acquire(&files->count) == 1) { + if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask); + file = __fget_files(files, fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; @@ -1282,7 +1296,7 @@ out_unlock: } /** - * __receive_fd() - Install received file into file descriptor table + * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1296,7 +1310,7 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1321,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) __receive_sock(file); return new_fd; } +EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { @@ -1336,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } -int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} -EXPORT_SYMBOL_GPL(receive_fd); - static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/fs/file_table.c b/fs/file_table.c index de4a2915bfd4..3ba764d73fc9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -75,18 +75,6 @@ static inline void file_free(struct file *f) } } -void release_empty_file(struct file *f) -{ - WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); - if (atomic_long_dec_and_test(&f->f_count)) { - security_file_free(f); - put_cred(f->f_cred); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - kmem_cache_free(filp_cachep, f); - } -} - /* * Return the total number of open files in the system */ @@ -419,7 +407,7 @@ static void delayed_fput(struct work_struct *unused) static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_rcuhead)); + __fput(container_of(work, struct file, f_task_work)); } /* @@ -445,9 +433,13 @@ void fput(struct file *file) if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_rcuhead, ____fput); - if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d6bf1f8c25dc..d8b619ed2f1e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1213,7 +1213,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108c5d26839..00ce89bdf32c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -117,7 +117,7 @@ static struct gfs2_sbd 
*init_sbd(struct super_block *sb) mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0b791adf02e5..b0cb70400996 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,8 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @op: direction of I/O - * @op_flags: request op flags + * @opf: request op flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads @@ -43,6 +42,8 @@ struct hfsplus_wd { * that starts at the rounded-down address. As long as the data was * read using hfsplus_submit_bio() and the same buffer is used things * will work correctly. + * + * Returns: %0 on success else -errno code */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f757d4f7ad98..05609ab15cbc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -686,7 +686,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. */ - resv_map = (struct resv_map *)(&inode->i_data)->private_data; + resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); @@ -1000,7 +1000,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); - inode->i_mapping->private_data = resv_map; + inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: diff --git a/fs/inode.c b/fs/inode.c index f238d987dec9..6cdb017f45c6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -209,7 +209,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, @@ -398,8 +398,8 @@ static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); + INIT_LIST_HEAD(&mapping->i_private_list); + spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } @@ -620,7 +620,7 @@ void clear_inode(struct inode *inode) * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); - BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); @@ -1836,37 +1836,37 @@ EXPORT_SYMBOL(bmap); * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. 
*/ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, +static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) - return 1; + return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) - return 1; + return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) - return 1; + return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) - return 1; + return true; /* * Good, we can skip the atime update: */ - return 0; + return false; } /** @@ -2404,7 +2404,7 @@ EXPORT_SYMBOL(inode_init_owner); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) diff --git a/fs/internal.h b/fs/internal.h index 58e43341aebf..a7469ddba9b6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -94,7 +94,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); -void release_empty_file(struct file *f); static inline void file_put_write_access(struct file *file) { @@ -180,7 +179,7 @@ extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); -extern struct file *__close_fd_get_file(unsigned int fd); +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 57d1dedf3f8f..64c5205e2b5e 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -9,8 +9,16 @@ #include "internal.h" +/* + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, + * never from raw values. These are just internal helpers. + */ +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } + struct mnt_idmap { - struct user_namespace *owner; + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; refcount_t count; }; @@ -20,25 +28,11 @@ struct mnt_idmap { * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /** - * check_fsmapping - check whether an mount idmapping is allowed - * @idmap: idmap of the relevent mount - * @sb: super block of the filesystem - * - * Return: true if @idmap is allowed, false if not. 
- */ -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb) -{ - return idmap->owner != sb->s_user_ns; -} - -/** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * @@ -53,26 +47,6 @@ static inline bool initial_idmapping(const struct user_namespace *ns) } /** - * no_idmapping - check whether we can skip remapping a kuid/gid - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * - * This function can be used to check whether a remapping between two - * idmappings is required. - * An idmapped mount is a mount that has an idmapping attached to it that - * is different from the filsystem's idmapping and the initial idmapping. - * If the initial mapping is used or the idmapping of the mount and the - * filesystem are identical no remapping is required. - * - * Return: true if remapping can be skipped, false if not. - */ -static inline bool no_idmapping(const struct user_namespace *mnt_userns, - const struct user_namespace *fs_userns) -{ - return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; -} - -/** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping @@ -81,8 +55,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kuid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() @@ -94,13 +68,12 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, - struct user_namespace *fs_userns, - kuid_t kuid) + struct user_namespace *fs_userns, + kuid_t kuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); @@ -108,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); + return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); @@ -121,8 +94,8 @@ EXPORT_SYMBOL_GPL(make_vfsuid); * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kgid unchanged. 
* If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() @@ -136,9 +109,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); @@ -146,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); + return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); @@ -165,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) @@ -193,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) @@ -228,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid) #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); +static int copy_mnt_idmap(struct uid_gid_map *map_from, + struct uid_gid_map *map_to) +{ + struct uid_gid_extent *forward, *reverse; + u32 nr_extents = READ_ONCE(map_from->nr_extents); + /* Pairs with smp_wmb() when writing the idmapping. */ + smp_rmb(); + + /* + * Don't blindly copy @map_to into @map_from if nr_extents is + * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we + * read @nr_extents someone could have written an idmapping and + * then we might end up with inconsistent data. So just don't do + * anything at all. + */ + if (nr_extents == 0) + return 0; + + /* + * Here we know that nr_extents is greater than zero which means + * a map has been written. Since idmappings can't be changed + * once they have been written we know that we can safely copy + * from @map_to into @map_from. + */ + + if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + *map_to = *map_from; + return 0; + } + + forward = kmemdup(map_from->forward, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!forward) + return -ENOMEM; + + reverse = kmemdup(map_from->reverse, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!reverse) { + kfree(forward); + return -ENOMEM; + } + + /* + * The idmapping isn't exposed anywhere so we don't need to care + * about ordering between extent pointers and @nr_extents + * initialization. 
+ */ + map_to->forward = forward; + map_to->reverse = reverse; + map_to->nr_extents = nr_extents; + return 0; +} + +static void free_mnt_idmap(struct mnt_idmap *idmap) +{ + if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->uid_map.forward); + kfree(idmap->uid_map.reverse); + } + if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->gid_map.forward); + kfree(idmap->gid_map.reverse); + } + kfree(idmap); +} + struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; + int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); - idmap->owner = get_user_ns(mnt_userns); refcount_set(&idmap->count, 1); + ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); + if (!ret) + ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); + if (ret) { + free_mnt_idmap(idmap); + idmap = ERR_PTR(ret); + } return idmap; } @@ -267,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); */ void mnt_idmap_put(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) + free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b4..faae721e4d63 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(putname); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -334,7 +334,7 @@ static int check_acl(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -395,7 +395,7 @@ static int acl_permission_check(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -2467,7 +2467,7 @@ static int handle_lookup_down(struct nameidata *nd) return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); @@ -2522,7 +2522,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, return retval; } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. 
*/ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { @@ -3158,7 +3158,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -3646,7 +3646,7 @@ static int do_open(struct nameidata *nd, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, @@ -3785,10 +3785,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - if (unlikely(file->f_mode & FMODE_OPENED)) - fput(file); - else - release_empty_file(file); + fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -3954,7 +3951,7 @@ EXPORT_SYMBOL(user_path_create); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -4080,7 +4077,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -4161,7 +4158,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) @@ -4290,7 +4287,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) @@ -4443,7 +4440,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. 
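The kernel-doc paragraph being corrected throughout fs/namei.c describes a calling convention rather than an implementation detail: callers that operate through a mount pass that mount's idmapping, callers that work on the raw inode pass &nop_mnt_idmap. A kernel-context sketch of both cases follows; it is not a standalone program, the example_* wrappers are hypothetical, and the usual lookup and inode-locking steps are elided.

/* Idmapped-mount aware: derive the idmapping from the vfsmount. */
static int example_create(const struct path *parent, struct dentry *dentry,
			  umode_t mode)
{
	struct mnt_idmap *idmap = mnt_idmap(parent->mnt);

	return vfs_create(idmap, d_inode(parent->dentry), dentry, mode, true);
}

/* Permission check on the raw, unmapped inode. */
static int example_check_raw(struct inode *inode)
{
	return generic_permission(&nop_mnt_idmap, inode, MAY_READ);
}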
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) @@ -4535,7 +4532,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, diff --git a/fs/namespace.c b/fs/namespace.c index fbf0e596fcd3..78366f114515 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3026,6 +3026,7 @@ static inline bool path_overmounted(const struct path *path) * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount + * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. @@ -4288,7 +4289,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) + if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b664caea8b4e..7248705faef4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) if (!folio_test_private(folio)) return NULL; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return req; } @@ -769,13 +769,13 @@ static void nfs_inode_add_request(struct nfs_page *req) * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); folio_set_private(folio); folio->private = req; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. 
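The private_lock to i_private_lock rename that starts here is mechanical, but the pattern it preserves is worth spelling out: per-folio private data is only attached and detached under the owning address_space's i_private_lock. Below is a kernel-context sketch mirroring the NFS hunks above; example_attach() and example_detach() are hypothetical, and a real filesystem would also manage reference counts on whatever it stores in folio->private.

static void example_attach(struct address_space *mapping, struct folio *folio,
			   void *data)
{
	spin_lock(&mapping->i_private_lock);
	if (!folio_test_swapcache(folio)) {
		folio_set_private(folio);
		folio->private = data;
	}
	spin_unlock(&mapping->i_private_lock);
}

static void example_detach(struct address_space *mapping, struct folio *folio)
{
	spin_lock(&mapping->i_private_lock);
	if (folio_test_private(folio)) {
		folio->private = NULL;
		folio_clear_private(folio);
	}
	spin_unlock(&mapping->i_private_lock);
}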
@@ -796,13 +796,13 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(folio && !folio_test_swapcache(folio))) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index f861f3a0bf5c..2ead36dfa2a3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -214,7 +214,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, /* * The page may not be locked, eg if called from try_to_unmap_one() */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -230,7 +230,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, } else if (ret) { nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 71e31e789b29..548f3b51aa5f 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = { * * If the page does not have buffers, we create them and set them uptodate. * The page may not be locked which is why we need to handle the buffers under - * the mapping->private_lock. Once the buffers are marked dirty we no longer + * the mapping->i_private_lock. Once the buffers are marked dirty we no longer * need the lock since try_to_free_buffers() does not free dirty buffers. */ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { @@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; @@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { break; set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 4596c90e7b7c..629723a8d712 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1462,7 +1462,8 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) /** * ntfs_dir_fsync - sync a directory to disk * @filp: directory to be synced - * @dentry: dentry describing the directory to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a directory to disk. 
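The ntfs_dir_fsync() change just above is about keeping kernel-doc in sync with the prototype: every parameter in the signature gets a matching @name line, and stale entries such as the old @dentry one are removed so scripts/kernel-doc stops warning. For illustration, a well-formed block for a hypothetical fsync-style helper with the same parameter shape:

/**
 * example_fsync - sync a range of a file to disk (illustrative only)
 * @filp:     file to be synced
 * @start:    byte offset of the start of the range to sync
 * @end:      byte offset of the end of the range (inclusive)
 * @datasync: if non-zero, only flush user data, not metadata
 *
 * Returns 0 on success or a negative errno.
 */
int example_fsync(struct file *filp, loff_t start, loff_t end, int datasync);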
Used for fsync, fdatasync, and diff --git a/fs/open.c b/fs/open.c index 3494a9cd8046..954d8fcbb635 100644 --- a/fs/open.c +++ b/fs/open.c @@ -442,7 +442,8 @@ static const struct cred *access_override_creds(void) * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous - * cred accesses will keep things non-RCY. + * cred accesses will keep things non-racy to avoid RCU + * freeing. */ override_cred->non_rcu = 1; @@ -1574,7 +1575,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) int retval; struct file *file; - file = close_fd_get_file(fd); + file = file_close_fd(fd); if (!file) return -EBADF; diff --git a/fs/pipe.c b/fs/pipe.c index 804a7d789452..8d9286a1f2e8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -446,6 +446,18 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) bool was_empty = false; bool wake_next_writer = false; + /* + * Reject writing to watch queue pipes before the point where we lock + * the pipe. + * Otherwise, lockdep would be unhappy if the caller already has another + * pipe locked. + * If we had to support locking a normal pipe and a notification pipe at + * the same time, we could set up lockdep annotations for that, but + * since we don't actually need that, it's simpler to just bail here. + */ + if (pipe_has_watch_queue(pipe)) + return -EXDEV; + /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; @@ -458,11 +470,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (pipe_has_watch_queue(pipe)) { - ret = -EXDEV; - goto out; - } - /* * If it wasn't empty we try to merge new data into * the last buffer. @@ -1317,6 +1324,11 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) pipe->tail = tail; pipe->head = head; + if (!pipe_has_watch_queue(pipe)) { + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; + } + spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ @@ -1368,8 +1380,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) if (ret < 0) goto out_revert_acct; - pipe->max_usage = nr_slots; - pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: diff --git a/fs/posix_acl.c b/fs/posix_acl.c index a05fe94970ce..e1af20893ebe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(__posix_acl_chmod); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, @@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. 
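The pipe hunks above encode two small rules: reject watch-queue pipes with -EXDEV before any pipe lock is taken, so lockdep never sees the nesting, and publish the state that sleeping writers test (max_usage, nr_accounted) before those writers can be woken, which is what the resize hang was about. Here is a userspace model of the wakeup-ordering half, using pthreads; the names are invented and this is not the kernel's pipe code (compile with -pthread).

#include <pthread.h>
#include <stddef.h>

struct model_ring {
	pthread_mutex_t lock;
	pthread_cond_t wr_wait;
	size_t used;
	size_t capacity;
};

void model_resize(struct model_ring *ring, size_t new_capacity)
{
	pthread_mutex_lock(&ring->lock);
	ring->capacity = new_capacity;          /* publish the new limit first */
	pthread_cond_broadcast(&ring->wr_wait); /* then wake blocked writers */
	pthread_mutex_unlock(&ring->lock);
}

void model_write_one(struct model_ring *ring)
{
	pthread_mutex_lock(&ring->lock);
	while (ring->used >= ring->capacity)    /* re-check after every wakeup */
		pthread_cond_wait(&ring->wr_wait, &ring->lock);
	ring->used++;
	pthread_mutex_unlock(&ring->lock);
}

If the resize path woke writers before updating the capacity, a writer could observe the old limit, go back to sleep, and never be woken again; moving the pipe->max_usage and pipe->nr_accounted updates into pipe_resize_ring(), ahead of the wakeup done by its caller, is the kernel-side version of the same fix.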
*/ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 435b61054b5b..1801e409a061 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const char *name = NULL; if (file) { - struct inode *inode = file_inode(vma->vm_file); + const struct inode *inode = file_user_inode(vma->vm_file); + dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 2138ee7d271d..5faf702f8d15 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, INITIALIZE_PATH(path); int item_len = 0; int tb_init = 0; - struct cpu_key cpu_key; + struct cpu_key cpu_key = {}; int retval; int quota_cut_bytes = 0; diff --git a/fs/stat.c b/fs/stat.c index f721d26ec3f7..8a8f7e97a742 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -41,7 +41,7 @@ * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) diff --git a/fs/super.c b/fs/super.c index 076392396e72..6fe482371633 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,7 @@ static void destroy_unused_super(struct super_block *s) static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { - struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); + struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; |
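The reiserfs change here is the standard cure for "uninit-value" warnings: zero-initialize an on-stack aggregate that later code only partially fills in, so any field read before being set holds zero rather than stack garbage. A trivial standalone illustration follows; the struct is invented.

#include <stdio.h>

struct example_key {
	int version;
	unsigned long objectid;
	unsigned long offset;
};

int main(void)
{
	/* Every field of @zeroed starts at zero. The empty-brace form used by
	 * the kernel is a long-standing GCC/Clang extension, standardized in
	 * C23; strictly portable C can spell it { 0 }. */
	struct example_key zeroed = {};

	printf("%d %lu %lu\n", zeroed.version, zeroed.objectid, zeroed.offset);
	return 0;
}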