diff options
author | Linus Torvalds | 2019-11-30 10:53:02 -0800 |
---|---|---|
committer | Linus Torvalds | 2019-11-30 10:53:02 -0800 |
commit | 50b8b3f85a01543fb82d3bb9bfe7d06659522c70 (patch) | |
tree | 178bcf210025fec174a9a6fa9a094283f3fcc3ad /fs/ext4 | |
parent | f112a2fd1f5999c6029551f901952392d900cf99 (diff) | |
parent | dfdeeb41fb08fbe11d3cfefba9c0fcd00c95a36d (diff) |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
"This merge window saw the the following new featuers added to ext4:
- Direct I/O via iomap (required the iomap-for-next branch from
Darrick as a prereq).
- Support for using dioread-nolock where the block size < page size.
- Support for encryption for file systems where the block size < page
size.
- Rework of journal credits handling so a revoke-heavy workload will
not cause the journal to run out of space.
- Replace bit-spinlocks with spinlocks in jbd2
Also included were some bug fixes and cleanups, mostly to clean up
corner cases from fuzzed file systems and error path handling"
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (59 commits)
ext4: work around deleting a file with i_nlink == 0 safely
ext4: add more paranoia checking in ext4_expand_extra_isize handling
jbd2: make jbd2_handle_buffer_credits() handle reserved handles
ext4: fix a bug in ext4_wait_for_tail_page_commit
ext4: bio_alloc with __GFP_DIRECT_RECLAIM never fails
ext4: code cleanup for get_next_id
ext4: fix leak of quota reservations
ext4: remove unused variable warning in parse_options()
ext4: Enable encryption for subpage-sized blocks
fs/buffer.c: support fscrypt in block_read_full_page()
ext4: Add error handling for io_end_vec struct allocation
jbd2: Fine tune estimate of necessary descriptor blocks
jbd2: Provide trace event for handle restarts
ext4: Reserve revoke credits for freed blocks
jbd2: Make credit checking more strict
jbd2: Rename h_buffer_credits to h_total_credits
jbd2: Reserve space for revoke descriptor blocks
jbd2: Drop jbd2_space_needed()
jbd2: Account descriptor blocks into t_outstanding_credits
jbd2: Factor out common parts of stopping and restarting a handle
...
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/ext4.h | 22 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.c | 32 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 106 | ||||
-rw-r--r-- | fs/ext4/extents.c | 149 | ||||
-rw-r--r-- | fs/ext4/file.c | 412 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 72 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 7 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 125 | ||||
-rw-r--r-- | fs/ext4/inode.c | 926 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 103 | ||||
-rw-r--r-- | fs/ext4/namei.c | 50 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 167 | ||||
-rw-r--r-- | fs/ext4/readpage.c | 6 | ||||
-rw-r--r-- | fs/ext4/resize.c | 46 | ||||
-rw-r--r-- | fs/ext4/super.c | 57 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 94 |
16 files changed, 1227 insertions, 1147 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b3a2cc7c0252..f8578caba40d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -198,6 +198,12 @@ struct ext4_system_blocks { */ #define EXT4_IO_END_UNWRITTEN 0x0001 +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. @@ -211,8 +217,7 @@ typedef struct ext4_io_end { * bios covering the extent */ unsigned int flag; /* unwritten or not */ atomic_t count; /* reference counter */ - loff_t offset; /* offset in the file */ - ssize_t size; /* size of the extent */ + struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { @@ -1579,7 +1584,6 @@ enum { EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ - EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ @@ -2562,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, @@ -2606,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); @@ -3266,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -3298,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); + /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -3324,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc, bool keep_towrite); +extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); @@ -3381,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7c70b08d104c..d3b8cdea5df7 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb) } handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks) + int type, int blocks, int rsv_blocks, + int revoke_creds) { journal_t *journal; int err; - trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); @@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, journal = EXT4_SB(sb)->s_journal; if (!journal) return ext4_get_nojournal(); - return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, - type, line); + return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, + GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return ext4_get_nojournal(); sb = handle->h_journal->j_private; - trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, - _RET_IP_); + trace_ext4_journal_start_reserved(sb, + jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); @@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return handle; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (jbd2_handle_buffer_credits(handle) >= check_cred && + handle->h_revoke_credits >= revoke_cred) + return 0; + extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); + revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); + return ext4_journal_extend(handle, extend_cred, revoke_cred); +} + static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, @@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, @@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), + err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index ef8fcf7d0d3b..a6b9b66dbfad 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks); + int type, int blocks, int rsv_blocks, + int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle) return 0; } -static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, + int blocks) { - if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) - return 0; - return 1; + /* Freeing each metadata block can result in freeing one cluster */ + return blocks * EXT4_SB(sb)->s_cluster_ratio; +} + +static inline int ext4_trans_default_revoke_credits(struct super_block *sb) +{ + return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ - __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits((inode)->i_sb)) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ + ext4_trans_default_revoke_credits((inode)->i_sb)) -#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ - __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) +#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ + (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, - int blocks, int rsv_blocks) + int blocks, int rsv_blocks, + int revoke_creds) { return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, - rsv_blocks); + rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ @@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void) return journal_current_handle(); } -static inline int ext4_journal_extend(handle_t *handle, int nblocks) +static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_extend(handle, nblocks); + return jbd2_journal_extend(handle, nblocks, revoke); return 0; } -static inline int ext4_journal_restart(handle_t *handle, int nblocks) +static inline int ext4_journal_restart(handle_t *handle, int nblocks, + int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_restart(handle, nblocks); + return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred); + + +/* + * Ensure @handle has at least @check_creds credits available. If not, + * transaction will be extended or restarted to contain at least @extend_cred + * credits. Before restarting transaction @fn is executed to allow for cleanup + * before the transaction is restarted. + * + * The return value is < 0 in case of error, 0 in case the handle has enough + * credits or transaction extension succeeded, 1 in case transaction had to be + * restarted. + */ +#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ + revoke_cred, fn) \ +({ \ + __label__ __ensure_end; \ + int err = __ext4_journal_ensure_credits((handle), (check_cred), \ + (extend_cred), (revoke_cred)); \ + \ + if (err <= 0) \ + goto __ensure_end; \ + err = (fn); \ + if (err < 0) \ + goto __ensure_end; \ + err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ + if (err == 0) \ + err = 1; \ +__ensure_end: \ + err; \ +}) + +/* + * Ensure given handle has at least requested amount of credits available, + * possibly restarting transaction if needed. We also make sure the transaction + * has space for at least ext4_trans_default_revoke_credits(sb) revoke records + * as freeing one or two blocks is very common pattern and requesting this is + * very cheap. + */ +static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, + int revoke_creds) +{ + return ext4_journal_ensure_credits_fn(handle, credits, credits, + revoke_creds, 0); +} + static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) @@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { @@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode) return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } +static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) +{ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 0; + if (!ext4_should_journal_data(inode)) + return 0; + /* + * Data blocks in one extent are contiguous, just account for partial + * clusters at extent boundaries + */ + return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); +} + /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fb0f99dc8c22..0e8708b77da6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle, static int ext4_find_delayed_extent(struct inode *inode, struct extent_status *newes); -static int ext4_ext_truncate_extend_restart(handle_t *handle, - struct inode *inode, - int needed) +static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { - int err; - - if (!ext4_handle_valid(handle)) - return 0; - if (handle->h_buffer_credits >= needed) - return 0; /* - * If we need to extend the journal get a few extra blocks - * while we're at it for efficiency's sake. + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. */ - needed += 3; - err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); - if (err <= 0) - return err; - err = ext4_truncate_restart_trans(handle, inode, needed); - if (err == 0) - err = -EAGAIN; + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} - return err; +/* + * Make sure 'handle' has at least 'check_cred' credits. If not, restart + * transaction with 'restart_cred' credits. The function drops i_data_sem + * when restarting transaction and gets it after transaction is restarted. + * + * The function returns 0 on success, 1 if transaction had to be restarted, + * and < 0 in case of fatal error. + */ +int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred) +{ + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, + revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + return ret; } /* @@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; - /* - * The check for IO to unwritten extent is somewhat racy as we - * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after - * dropping i_data_sem. But reserved blocks should save us in that - * case. - */ + if (ext4_ext_is_unwritten(ex1) && - (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || - atomic_read(&EXT4_I(inode)->i_unwritten) || - (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) + ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ - if (ext4_journal_extend(handle, 2)) + if (ext4_journal_extend(handle, 2, + ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* @@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; - int depth = ext_depth(inode), credits; + int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; @@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - if (err) + /* + * We may end up freeing some index blocks and data from the + * punched range. Note that partial clusters are accounted for + * by ext4_free_data_revoke_credits(). + */ + revoke_credits = + ext4_free_metadata_revoke_credits(inode->i_sb, + ext_depth(inode)) + + ext4_free_data_revoke_credits(inode, b - a + 1); + + err = ext4_datasem_ensure_credits(handle, inode, credits, + credits, revoke_credits); + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); + handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, + depth + 1, + ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int ret2 = 0; struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - /* - * This is somewhat ugly but the idea is clear: When transaction is - * reserved, everything goes into it. Otherwise we rather start several - * smaller transactions for conversion of each extent separately. - */ - if (handle) { - handle = ext4_journal_start_reserved(handle, - EXT4_HT_EXT_CONVERT); - if (IS_ERR(handle)) - return PTR_ERR(handle); - credits = 0; - } else { + if (!handle) { /* * credits to insert 1 extent into extent tree */ @@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) break; } - if (!credits) - ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } +int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) +{ + int ret, err = 0; + struct ext4_io_end_vec *io_end_vec; + + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + + list_for_each_entry(io_end_vec, &io_end->list_vec, list) { + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end_vec->offset, + io_end_vec->size); + if (ret) + break; + } + + if (handle) + err = ext4_journal_stop(handle); + + return ret < 0 ? ret : err; +} + /* * If newes is not existing extent (newes->ec_pblk equals zero) find * delayed extent at start of newes and update newes accordingly and @@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode, * descriptor) for each block group; assume two block * groups */ - if (handle->h_buffer_credits < 7) { - credits = ext4_writepage_trans_blocks(inode); - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - /* EAGAIN is success */ - if (err && err != -EAGAIN) - return err; - } + credits = ext4_writepage_trans_blocks(inode); + err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0); + if (err < 0) + return err; err = ext4_ext_get_access(handle, inode, path); return err; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8d2bbcc2d813..6a7293a5cda2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -29,10 +29,58 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> +#include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "truncate.h" + +static bool ext4_dio_supported(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) + return false; + if (fsverity_active(inode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + return true; +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. + */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX - if (IS_DAX(file_inode(iocb->ki_filp))) + if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); -} - /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they @@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return iov_iter_count(from); } -#ifdef CONFIG_FS_DAX -static ssize_t -ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - inode_lock(inode); - } + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; - ret = file_remove_privs(iocb->ki_filp); - if (ret) - goto out; - ret = file_update_time(iocb->ki_filp); - if (ret) - goto out; - ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + current->backing_dev_info = NULL; + out: inode_unlock(inode); - if (ret > 0) + if (likely(ret > 0)) { + iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); + } + return ret; } -#endif -static ssize_t -ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, size_t count) { + handle_t *handle; + bool truncate = false; + u8 blkbits = inode->i_blkbits; + ext4_lblk_t written_blk, end_blk; + + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But, the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. + */ + WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); + if (offset + count <= EXT4_I(inode)->i_disksize) { + /* + * We need to ensure that the inode is removed from the orphan + * list if it has been added prematurely, due to writeback of + * delalloc blocks. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } + + return written; + } + + if (written < 0) + goto truncate; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + goto truncate; + } + + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + written_blk = ALIGN(offset + written, 1 << blkbits); + end_blk = ALIGN(offset + count, 1 << blkbits); + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + + /* + * Remove the inode from the orphan list if it has been extended and + * everything went OK. + */ + if (!truncate && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + if (truncate) { +truncate: + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return written; +} + +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t offset = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - int o_direct = iocb->ki_flags & IOCB_DIRECT; - int unaligned_aio = 0; - int overwrite = 0; + + if (error) + return error; + + if (size && flags & IOMAP_DIO_UNWRITTEN) + return ext4_convert_unwritten_extents(NULL, inode, + offset, size); + + return 0; +} + +static const struct iomap_dio_ops ext4_dio_write_ops = { + .end_io = ext4_dio_write_end_io, +}; + +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + struct inode *inode = file_inode(iocb->ki_filp); + bool extend = false, overwrite = false, unaligned_aio = false; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock(inode); + /* + * Fallback to buffered I/O if the inode does not support + * direct I/O. + */ + return ext4_buffered_write_iter(iocb, from); + } + + ret = ext4_write_checks(iocb, from); + if (ret <= 0) { + inode_unlock(inode); + return ret; + } + + /* + * Unaligned asynchronous direct I/O must be serialized among each + * other as the zeroing of partial blocks of two competing unaligned + * asynchronous direct I/O writes can result in data corruption. + */ + offset = iocb->ki_pos; + count = iov_iter_count(from); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { + unaligned_aio = true; + inode_dio_wait(inode); + } + + /* + * Determine whether the I/O will overwrite allocated and initialized + * blocks. If so, check to see whether it is possible to take the + * dioread_nolock path. + */ + if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && + ext4_should_dioread_nolock(inode)) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } + + ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_aio || extend); + + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); + +out: + if (overwrite) + inode_unlock_shared(inode); + else + inode_unlock(inode); + + if (ret >= 0 && iov_iter_count(from)) { + ssize_t err; + loff_t endbyte; + + offset = iocb->ki_pos; + err = ext4_buffered_write_iter(iocb, from); + if (err < 0) + return err; + + /* + * We need to ensure that the pages within the page cache for + * the range covered by this I/O are written to disk and + * invalidated. This is in attempt to preserve the expected + * direct I/O semantics in the case we fallback to buffered I/O + * to complete off the I/O request. + */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + + return ret; +} #ifdef CONFIG_FS_DAX - if (IS_DAX(inode)) - return ext4_dax_write_iter(iocb, from); -#endif +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) @@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; - /* - * Unaligned direct AIO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned AIOs can result in data - * corruption. - */ - if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && - ext4_unaligned_aio(inode, from, iocb->ki_pos)) { - unaligned_aio = 1; - ext4_unwritten_wait(inode); - } + offset = iocb->ki_pos; + count = iov_iter_count(from); - iocb->private = &overwrite; - /* Check whether we do a DIO overwrite or not */ - if (o_direct && !unaligned_aio) { - if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { - if (ext4_should_dioread_nolock(inode)) - overwrite = 1; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); goto out; } - } - ret = __generic_file_write_iter(iocb, from); - /* - * Unaligned direct AIO must be the only IO in flight. Otherwise - * overlapping aligned IO after unaligned might result in data - * corruption. - */ - if (ret == -EIOCBQUEUED && unaligned_aio) - ext4_unwritten_wait(inode); - inode_unlock(inode); + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } - if (ret > 0) - ret = generic_write_sync(iocb, ret); + extend = true; + ext4_journal_stop(handle); + } - return ret; + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } +#endif + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); + + return ext4_buffered_write_iter(iocb, from); +} #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, @@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); - offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); - offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5508baa11bb6..e10206e7f4bb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode) return ret; } +static int ext4_fsync_nojournal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + int ret, err; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY_ALL)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (!ret) + ret = err; + + if (!ret) + ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) + *needs_barrier = true; + + return ret; +} + +static int ext4_fsync_journal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + *needs_barrier = true; + + return jbd2_complete_transaction(journal, commit_tid); +} + /* * akpm: A new design for ext4_sync_file(). * @@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode) * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. */ - int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0, err; - tid_t commit_tid; bool needs_barrier = false; + struct inode *inode = file->f_mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ret = -EROFS; goto out; } - if (!journal) { - ret = __generic_file_fsync(file, start, end, datasync); - if (!ret) - ret = ext4_sync_parent(inode); - if (test_opt(inode->i_sb, BARRIER)) - goto issue_flush; - goto out; - } - ret = file_write_and_wait_range(file, start, end); if (ret) return ret; + /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { + if (!sbi->s_journal) + ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); + else if (ext4_should_journal_data(inode)) ret = ext4_force_commit(inode->i_sb); - goto out; - } + else + ret = ext4_fsync_journal(inode, datasync, &needs_barrier); - commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 764ff4c56233..dc333e8e51e8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. - */ dquot_initialize(inode); dquot_free_inode(inode); - dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -927,7 +922,7 @@ repeat_in_this_group: BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, handle_type, nblocks, - 0); + 0, 0); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36699a131168..3a4ab70fe9e0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle, for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); - } else + } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); + /* Simplify error cleanup... */ + branch[i+1].bh = NULL; + } if (err) { i--; goto failed; @@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle, } return 0; failed: + if (i == indirect_blks) { + /* Free data blocks */ + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + ar->len, 0); + i--; + } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect - * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and - * buffer at branch[0].bh is indirect block / inode already - * existing before ext4_alloc_branch() was called. + * blocks. Buffer for new_blocks[i] is at branch[i+1].bh + * (buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called). Also + * because blocks are freshly allocated, we don't need to + * revoke them which is why we don't set + * EXT4_FREE_BLOCKS_METADATA. */ - if (i > 0 && i != indirect_blks && branch[i].bh) - ext4_forget(handle, 1, ar->inode, branch[i].bh, - branch[i].bh->b_blocknr); - ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], - (i == indirect_blks) ? ar->len : 1, 0); + ext4_free_blocks(handle, ar->inode, branch[i+1].bh, + new_blocks[i], 1, + branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } @@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } +static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int *dropped) +{ + int err; + + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + return err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + return err; + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} + /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. + * extend fails, we restart transaction. */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +static int ext4_ind_truncate_ensure_credits(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + int revoke_creds) { - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_blocks_for_truncate(inode), revoke_creds, + ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + if (ret <= 0) + return ret; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ret = ext4_journal_get_write_access(handle, bh); + if (unlikely(ret)) + return ret; + } + return 0; } /* @@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, return 1; } - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } + err = ext4_ind_truncate_ensure_credits(handle, inode, bh, + ext4_free_data_revoke_credits(inode, count)); + if (err < 0) + goto out_err; for (p = first; p < last; p++) *p = 0; @@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ if (ext4_handle_is_aborted(handle)) return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } + if (ext4_ind_truncate_ensure_credits(handle, inode, + NULL, + ext4_free_metadata_revoke_credits( + inode->i_sb, 1)) < 0) + return; /* * The forget flag here is critical because if diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1d880ae90c1f..28f28de0c1b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -164,39 +164,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, - int nblocks) -{ - int ret; - - /* - * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this - * moment, get_block can be called only for blocks inside i_size since - * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. - */ - BUG_ON(EXT4_JOURNAL(inode) == NULL); - jbd_debug(2, "restarting handle %p\n", handle); - up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, nblocks); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - return ret; -} - -/* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; - int extra_credits = 3; + /* + * Credits for final inode cleanup and freeing: + * sb + inode (ext4_orphan_del()), block bitmap, group descriptor + * (xattr block freeing), bitmap, group descriptor (inode freeing) + */ + int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode) if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + /* + * Block bitmap, group descriptor, and inode are accounted in both + * ext4_blocks_for_truncate() and extra_credits. So subtract 3. + */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+extra_credits); + ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -827,136 +810,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, #define DIO_MAX_BLOCKS 4096 /* - * Get blocks function for the cases that need to start a transaction - - * generally difference cases of direct IO and DAX IO. It also handles retries - * in case of ENOSPC. - */ -static int ext4_get_block_trans(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) -{ - int dio_credits; - handle_t *handle; - int retries = 0; - int ret; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) - bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; - dio_credits = ext4_chunk_trans_blocks(inode, - bh_result->b_size >> inode->i_blkbits); -retry: - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = _ext4_get_block(inode, iblock, bh_result, flags); - ext4_journal_stop(handle); - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; -} - -/* Get block function for DIO reads and writes to inodes without extents */ -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - if (!create) - return _ext4_get_block(inode, iblock, bh, 0); - return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); -} - -/* - * Get block function for AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete. - */ -static int ext4_dio_get_block_unwritten_async(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * When doing DIO using unwritten extents, we need io_end to convert - * unwritten extents to written on IO completion. We allocate io_end - * once we spot unwritten extent and store it in b_private. Generic - * DIO code keeps b_private set and furthermore passes the value to - * our completion callback in 'private' argument. - */ - if (!ret && buffer_unwritten(bh_result)) { - if (!bh_result->b_private) { - ext4_io_end_t *io_end; - - io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!io_end) - return -ENOMEM; - bh_result->b_private = io_end; - ext4_set_io_unwritten_flag(inode, io_end); - } - set_buffer_defer_completion(bh_result); - } - - return ret; -} - -/* - * Get block function for non-AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete by ext4_direct_IO_write(). - */ -static int ext4_dio_get_block_unwritten_sync(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * Mark inode as having pending DIO writes to unwritten extents. - * ext4_direct_IO_write() checks this flag and converts extents to - * written. - */ - if (!ret && buffer_unwritten(bh_result)) - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - - return ret; -} - -static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); - - return ret; -} - - -/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -2341,6 +2194,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* + * mpage_process_page - update page buffers corresponding to changed extent and + * may submit fully mapped page for IO + * + * @mpd - description of extent to map, on return next extent to map + * @m_lblk - logical block mapping. + * @m_pblk - corresponding physical mapping. + * @map_bh - determines on return whether this page requires any further + * mapping or not. + * Scan given page buffers corresponding to changed extent and update buffer + * state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits. + * If the given page is not fully mapped, we update @map to the next extent in + * the given page that needs mapping & return @map_bh as true. + */ +static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, + ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, + bool *map_bh) +{ + struct buffer_head *head, *bh; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + ext4_lblk_t lblk = *m_lblk; + ext4_fsblk_t pblock = *m_pblk; + int err = 0; + int blkbits = mpd->inode->i_blkbits; + ssize_t io_end_size = 0; + struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); + + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + io_end_vec->size += io_end_size; + io_end_size = 0; + + err = mpage_process_page_bufs(mpd, head, bh, lblk); + if (err > 0) + err = 0; + if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) { + err = PTR_ERR(io_end_vec); + goto out; + } + io_end_vec->offset = mpd->map.m_lblk << blkbits; + } + *map_bh = true; + goto out; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + io_end_size += (1 << blkbits); + } while (lblk++, (bh = bh->b_this_page) != head); + + io_end_vec->size += io_end_size; + io_end_size = 0; + *map_bh = false; +out: + *m_lblk = lblk; + *m_pblk = pblock; + return err; +} + +/* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * @@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct pagevec pvec; int nr_pages, i; struct inode *inode = mpd->inode; - struct buffer_head *head, *bh; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; - sector_t pblock; + ext4_fsblk_t pblock; int err; + bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; @@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bh = head = page_buffers(page); - do { - if (lblk < mpd->map.m_lblk) - continue; - if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { - /* - * Buffer after end of mapped extent. - * Find next buffer in the page to map. - */ - mpd->map.m_len = 0; - mpd->map.m_flags = 0; - /* - * FIXME: If dioread_nolock supports - * blocksize < pagesize, we need to make - * sure we add size mapped so far to - * io_end->size as the following call - * can submit the page for IO. - */ - err = mpage_process_page_bufs(mpd, head, - bh, lblk); - pagevec_release(&pvec); - if (err > 0) - err = 0; - return err; - } - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock++; - } - clear_buffer_unwritten(bh); - } while (lblk++, (bh = bh->b_this_page) != head); - + err = mpage_process_page(mpd, page, &lblk, &pblock, + &map_bh); /* - * FIXME: This is going to break if dioread_nolock - * supports blocksize < pagesize as we will try to - * convert potentially unmapped parts of inode. + * If map_bh is true, means page may require further bh + * mapping, or maybe the page was submitted for IO. + * So we return to call further extent mapping. */ - mpd->io_submit.io_end->size += PAGE_SIZE; + if (err < 0 || map_bh == true) + goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); - if (err < 0) { - pagevec_release(&pvec); - return err; - } + if (err < 0) + goto out; } pagevec_release(&pvec); } @@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; +out: + pagevec_release(&pvec); + return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) @@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle, int err; loff_t disksize; int progress = 0; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + struct ext4_io_end_vec *io_end_vec; - mpd->io_submit.io_end->offset = - ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) + return PTR_ERR(io_end_vec); + io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) return inode->i_state & I_DIRTY_DATASYNC; } -static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap, struct iomap *srcmap) +static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, + struct ext4_map_blocks *map, loff_t offset, + loff_t length) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int blkbits = inode->i_blkbits; - unsigned long first_block, last_block; - struct ext4_map_blocks map; - bool delalloc = false; - int ret; - - if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) - return -EINVAL; - first_block = offset >> blkbits; - last_block = min_t(loff_t, (offset + length - 1) >> blkbits, - EXT4_MAX_LOGICAL_BLOCK); - - if (flags & IOMAP_REPORT) { - if (ext4_has_inline_data(inode)) { - ret = ext4_inline_data_iomap(inode, iomap); - if (ret != -EAGAIN) { - if (ret == 0 && offset >= iomap->length) - ret = -ENOENT; - return ret; - } - } - } else { - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; - } - - map.m_lblk = first_block; - map.m_len = last_block - first_block + 1; - - if (flags & IOMAP_REPORT) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - - if (ret == 0) { - ext4_lblk_t end = map.m_lblk + map.m_len - 1; - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map.m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) { - /* entire range is a hole */ - } else if (es.es_lblk > map.m_lblk) { - /* range starts with a hole */ - map.m_len = es.es_lblk - map.m_lblk; - } else { - ext4_lblk_t offs = 0; - - if (es.es_lblk < map.m_lblk) - offs = map.m_lblk - es.es_lblk; - map.m_lblk = es.es_lblk + offs; - map.m_len = es.es_len - offs; - delalloc = true; - } - } - } else if (flags & IOMAP_WRITE) { - int dio_credits; - handle_t *handle; - int retries = 0; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); -retry: - /* - * Either we allocate blocks and then we don't get unwritten - * extent so we have reserved enough credits, or the blocks - * are already allocated and unwritten and in that case - * extent conversion fits in the credits as well. - */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) { - ext4_journal_stop(handle); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; - } - - /* - * If we added blocks beyond i_size, we need to make sure they - * will get truncated if we crash before updating i_size in - * ext4_iomap_end(). For faults we don't need to do that (and - * even cannot because for orphan list operations inode_lock is - * required) - if we happen to instantiate block beyond i_size, - * it is because we race with truncate which has already added - * the inode to the orphan list. - */ - if (!(flags & IOMAP_FAULT) && first_block + map.m_len > - (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { - int err; - - err = ext4_orphan_add(handle, inode); - if (err < 0) { - ext4_journal_stop(handle); - return err; - } - } - ext4_journal_stop(handle); - } else { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - } + u8 blkbits = inode->i_blkbits; + /* + * Writes that span EOF might trigger an I/O size update on completion, + * so consider them to be dirty for the purpose of O_DSYNC, even if + * there is no other metadata changes being made or are pending. + */ iomap->flags = 0; - if (ext4_inode_datasync_dirty(inode)) + if (ext4_inode_datasync_dirty(inode) || + offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; + + if (map->m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = sbi->s_daxdev; - iomap->offset = (u64)first_block << blkbits; - iomap->length = (u64)map.m_len << blkbits; + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + iomap->offset = (u64) map->m_lblk << blkbits; + iomap->length = (u64) map->m_len << blkbits; - if (ret == 0) { - iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; + /* + * Flags passed to ext4_map_blocks() for direct I/O writes can result + * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits + * set. In order for any allocated unwritten extents to be converted + * into written extents correctly within the ->end_io() handler, we + * need to ensure that the iomap->type is set appropriately. Hence, the + * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has + * been set first. + */ + if (map->m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + iomap->addr = (u64) map->m_pblk << blkbits; + } else if (map->m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = (u64) map->m_pblk << blkbits; } else { - if (map.m_flags & EXT4_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { - iomap->type = IOMAP_UNWRITTEN; - } else { - WARN_ON_ONCE(1); - return -EIO; - } - iomap->addr = (u64)map.m_pblk << blkbits; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; } - - if (map.m_flags & EXT4_MAP_NEW) - iomap->flags |= IOMAP_F_NEW; - - return 0; } -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) +static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, + unsigned int flags) { - int ret = 0; handle_t *handle; - int blkbits = inode->i_blkbits; - bool truncate = false; + u8 blkbits = inode->i_blkbits; + int ret, dio_credits, m_flags = 0, retries = 0; - if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) - return 0; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto orphan_del; - } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); /* - * We may need to truncate allocated but not written blocks beyond EOF. + * Trim the mapping request to the maximum value that we can map at + * once for direct I/O. */ - if (iomap->offset + iomap->length > - ALIGN(inode->i_size, 1 << blkbits)) { - ext4_lblk_t written_blk, end_blk; + if (map->m_len > DIO_MAX_BLOCKS) + map->m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); - written_blk = (offset + written) >> blkbits; - end_blk = (offset + length) >> blkbits; - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - } +retry: /* - * Remove inode from orphan list if we were extending a inode and - * everything went fine. + * Either we allocate blocks and then don't get an unwritten extent, so + * in that case we have reserved enough credits. Or, the blocks are + * already allocated and unwritten. In that case, the extent conversion + * fits into the credits as well. */ - if (!truncate && inode->i_nlink && - !list_empty(&EXT4_I(inode)->i_orphan)) - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - if (truncate) { - ext4_truncate_failed_write(inode); -orphan_del: - /* - * If truncate failed early the inode might still be on the - * orphan list; we need to make sure the inode is removed from - * the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - return ret; -} - -const struct iomap_ops ext4_iomap_ops = { - .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, -}; - -static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) -{ - ext4_io_end_t *io_end = private; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); - /* if not async direct IO just return */ - if (!io_end) - return 0; + /* + * DAX and direct I/O are the only two operations that are currently + * supported with IOMAP_WRITE. + */ + WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); + if (IS_DAX(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + /* + * We use i_size instead of i_disksize here because delalloc writeback + * can complete at any point during the I/O and subsequently push the + * i_disksize out to i_size. This could be beyond where direct I/O is + * happening and thus expose allocated blocks to direct I/O reads. + */ + else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - io_end, io_end->inode->i_ino, iocb, offset, size); + ret = ext4_map_blocks(handle, inode, map, m_flags); /* - * Error during AIO DIO. We cannot convert unwritten extents as the - * data was not written. Just clear the unwritten flag and drop io_end. + * We cannot fill holes in indirect tree based inodes as that could + * expose stale data in the case of a crash. Use the magic error code + * to fallback to buffered I/O. */ - if (size <= 0) { - ext4_clear_io_unwritten_flag(io_end); - size = 0; - } - io_end->offset = offset; - io_end->size = size; - ext4_put_io_end(io_end); + if (!m_flags && !ret) + ret = -ENOTBLK; - return 0; + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + return ret; } -/* - * Handling of direct IO writes. - * - * For ext4 extent files, ext4 will do direct-io write even to holes, - * preallocated extents, and those write extend the file, no need to - * fall back to buffered IO. - * - * For holes, we fallocate those blocks, mark them as unwritten - * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as unwritten. - * - * The unwritten extents will be converted to written when DIO is completed. - * For async direct IO, since the IO may still pending when return, we - * set up an end_io call back function, which will do the conversion - * when async direct IO completed. - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - */ -static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) + +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - ssize_t ret; - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(iter); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - int orphan = 0; - handle_t *handle; + int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; - if (final_size > inode->i_size || final_size > ei->i_disksize) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ext4_update_i_disksize(inode, inode->i_size); - ext4_journal_stop(handle); - } + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; - BUG_ON(iocb->private == NULL); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; /* - * Make all waiters for direct IO properly wait also for extent - * conversion. This also disallows race between truncate() and - * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + * Calculate the first and last logical blocks respectively. */ - inode_dio_begin(inode); + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + if (flags & IOMAP_WRITE) + ret = ext4_iomap_alloc(inode, &map, flags); + else + ret = ext4_map_blocks(NULL, inode, &map, 0); + + if (ret < 0) + return ret; - if (overwrite) - inode_unlock(inode); + ext4_set_iomap(inode, iomap, &map, offset, length); + return 0; +} + +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ /* - * For extent mapped files we could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as unwritten to prevent - * parallel buffered read to expose the stale data before DIO complete - * the data IO. - * - * As to previously fallocated extents, ext4 get_block will just simply - * mark the buffer mapped but still keep the extents unwritten. - * - * For non AIO case, we will convert those unwritten extents to written - * after return back from blockdev_direct_IO. That way we save us from - * allocating io_end structure and also the overhead of offloading - * the extent convertion to a workqueue. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. + * Check to see whether an error occurred while writing out the data to + * the allocated blocks. If so, return the magic error code so that we + * fallback to buffered I/O and attempt to complete the remainder of + * the I/O. Any blocks that may have been allocated in preparation for + * the direct I/O will be reused during buffered I/O. */ - iocb->private = NULL; - if (overwrite) - get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, i_blocksize(inode)) >= inode->i_size) { - get_block_func = ext4_dio_get_block; - dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else if (is_sync_kiocb(iocb)) { - get_block_func = ext4_dio_get_block_unwritten_sync; - dio_flags = DIO_LOCKING; - } else { - get_block_func = ext4_dio_get_block_unwritten_async; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - get_block_func, ext4_end_io_dio, NULL, - dio_flags); + if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + return -ENOTBLK; - if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(NULL, inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + return 0; +} - inode_dio_end(inode); - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) - inode_lock(inode); +const struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; - if (ret < 0 && final_size > inode->i_size) - ext4_truncate_failed_write(inode); +static bool ext4_iomap_is_delalloc(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct extent_status es; + ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - /* Handle extending of i_size after direct IO write */ - if (orphan) { - int err; + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map->m_lblk, end, &es); - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* - * We wrote the data but cannot extend - * i_size. Bail out. In async io case, we do - * not return error here because we have - * already submmitted the corresponding - * bio. Returning error here makes the caller - * think that this IO is done and failed - * resulting in race with bio's completion - * handler. - */ - if (!ret) - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); + if (!es.es_len || es.es_lblk > end) + return false; - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size || end > ei->i_disksize) { - ext4_update_i_disksize(inode, end); - if (end > inode->i_size) - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; + if (es.es_lblk > map->m_lblk) { + map->m_len = es.es_lblk - map->m_lblk; + return false; } -out: - return ret; -} -static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - ssize_t ret; + offset = map->m_lblk - es.es_lblk; + map->m_len = es.es_len - offset; - /* - * Shared inode_lock is enough for us - it protects against concurrent - * writes & truncates and since we take care of writing back page cache, - * we are protected against page writeback as well. - */ - inode_lock_shared(inode); - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, NULL, NULL, 0); -out_unlock: - inode_unlock_shared(inode); - return ret; + return true; } -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; + int ret; + bool delalloc = false; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; -#ifdef CONFIG_FS_ENCRYPTION - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return 0; -#endif - if (fsverity_active(inode)) - return 0; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } /* - * If we are doing data journalling we don't support O_DIRECT + * Calculate the first and last logical block respectively. */ - if (ext4_should_journal_data(inode)) - return 0; + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - /* Let buffer I/O handle the inline data case. */ - if (ext4_has_inline_data(inode)) - return 0; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) + delalloc = ext4_iomap_is_delalloc(inode, &map); - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == READ) - ret = ext4_direct_IO_read(iocb, iter); - else - ret = ext4_direct_IO_write(iocb, iter); - trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; + ext4_set_iomap(inode, iomap, &map, offset, length); + if (delalloc && iomap->type == IOMAP_HOLE) + iomap->type = IOMAP_DELALLOC; + + return 0; } +const struct iomap_ops ext4_iomap_report_ops = { + .iomap_begin = ext4_iomap_begin_report, +}; + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do @@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * All buffers in the last page remain valid? Then there's nothing to - * do. We do the check mainly to optimize the common PAGE_SIZE == - * blocksize case + * If the page is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidatepage() may + * strip all buffers from the page but keep the page dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * buffers). Also we don't need to wait for any commit if all buffers in + * the page remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. */ - if (offset > PAGE_SIZE - i_blocksize(inode)) + if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { page = find_lock_page(inode->i_mapping, @@ -5915,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); int error; + /* this was checked at iget time, but double check for good measure */ + if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + return -EFSCORRUPTED; + } + if ((new_extra_isize < ei->i_extra_isize) || + (new_extra_isize < 4) || + (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) + return -EINVAL; /* Should never happen */ + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5968,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if (ext4_handle_valid(handle) && - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + if (ext4_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b1e4d359f73b..89725fa42573 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode, needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); - /* - * Make sure the credit we accumalated is not really high - */ - if (needed && ext4_handle_has_enough_credits(handle, - EXT4_RESERVE_TRANS_BLOCKS)) { - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } else if (needed) { - retval = ext4_journal_extend(handle, needed); - if (retval) { - /* - * IF not able to extend the journal restart the journal - */ - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } - } + retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); + if (retval < 0) + goto err_out; retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); @@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, } -static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) -{ - int retval = 0, needed; - - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - /* - * We are freeing a blocks. During this we touch - * superblock, group descriptor and block bitmap. - * So allocate a credit of 3. We may update - * quota (user and group). - */ - needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - if (ext4_journal_extend(handle, needed) != 0) - retval = ext4_journal_restart(handle, needed); - - return retval; -} - static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; + struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + int err; - bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) { + put_bh(bh); + return err; + } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) + return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * One credit accounted for writing the * i_data field of the original inode */ - retval = ext4_journal_extend(handle, 1); - if (retval) { - retval = ext4_journal_restart(handle, 1); - if (retval) - goto err_out; - } + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; @@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); - if (retval) - break; + if (retval) { + put_bh(bh); + return retval; + } } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - return retval; + return 0; } /* @@ -574,9 +554,9 @@ err_out: } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ - if (ext4_journal_extend(handle, 1) != 0) - ext4_journal_restart(handle, 1); - + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto out_stop; /* * Mark the tmp_inode as of size zero */ @@ -594,6 +574,7 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); +out_stop: ext4_journal_stop(handle); out: unlock_new_inode(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a427d2031a8d..a856997d87b5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode) } +/* + * Add non-directory inode to a directory. On success, the inode reference is + * consumed by dentry is instantiation. This is also indicated by clearing of + * *inodep pointer. On failure, the caller is responsible for dropping the + * inode reference in the safe context. + */ static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) + struct dentry *dentry, struct inode **inodep) { + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); d_instantiate_new(dentry, inode); + *inodep = NULL; return 0; } drop_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); - iput(inode); return err; } @@ -2592,12 +2603,12 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2624,12 +2635,12 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2779,10 +2790,12 @@ retry: if (err) { out_clear_inode: clear_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); iput(inode); - goto out_stop; + goto out_retry; } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); @@ -2796,6 +2809,7 @@ out_clear_inode: out_stop: if (handle) ext4_journal_stop(handle); +out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (inode->i_nlink == 0) { - ext4_warning_inode(inode, "Deleting file '%.*s' with no links", - dentry->d_name.len, dentry->d_name.name); - set_nlink(inode, 1); - } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - drop_nlink(inode); + if (inode->i_nlink == 0) + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); + else + drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); @@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir, inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - + err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); + if (inode) + iput(inode); goto out_free_encrypted_link; err_drop_inode: diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 12ceadef32c5..24aeedb8fc75 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -31,18 +31,56 @@ #include "acl.h" static struct kmem_cache *io_end_cachep; +static struct kmem_cache *io_end_vec_cachep; int __init ext4_init_pageio(void) { io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); if (io_end_cachep == NULL) return -ENOMEM; + + io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); + if (io_end_vec_cachep == NULL) { + kmem_cache_destroy(io_end_cachep); + return -ENOMEM; + } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_end_vec_cachep); +} + +struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec; + + io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); + if (!io_end_vec) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&io_end_vec->list); + list_add_tail(&io_end_vec->list, &io_end->list_vec); + return io_end_vec; +} + +static void ext4_free_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec, *tmp; + + if (list_empty(&io_end->list_vec)) + return; + list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { + list_del(&io_end_vec->list); + kmem_cache_free(io_end_vec_cachep, io_end_vec); + } +} + +struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) +{ + BUG_ON(list_empty(&io_end->list_vec)); + return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); } /* @@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } + ext4_free_io_end_vec(io_end); kmem_cache_free(io_end_cachep, io_end); } @@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). */ -static int ext4_end_io(ext4_io_end_t *io) +static int ext4_end_io_end(ext4_io_end_t *io_end) { - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - handle_t *handle = io->handle; + struct inode *inode = io_end->inode; + handle_t *handle = io_end->handle; int ret = 0; - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); + io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + io_end->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); + "(inode %lu, error %d)", inode->i_ino, ret); } - ext4_clear_io_unwritten_flag(io); - ext4_release_io_end(io); + ext4_clear_io_unwritten_flag(io_end); + ext4_release_io_end(io_end); return ret; } @@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; + ext4_io_end_t *io_end, *io_end0, *io_end1; if (list_empty(head)) return; ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); - list_for_each_entry(io, head, list) { - cur = &io->list; + list_for_each_entry(io_end, head, list) { + cur = &io_end->list; before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); + io_end0 = container_of(before, ext4_io_end_t, list); after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); + io_end1 = container_of(after, ext4_io_end_t, list); ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); + io_end, inode->i_ino, io_end0, io_end1); } #endif } @@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) static int ext4_do_flush_completed_IO(struct inode *inode, struct list_head *head) { - ext4_io_end_t *io; + ext4_io_end_t *io_end; struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); @@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { - io = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); - list_del_init(&io->list); + io_end = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io_end->list); - err = ext4_end_io(io); + err = ext4_end_io_end(io_end); if (unlikely(!ret && err)) ret = err; } @@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); - if (io) { - io->inode = inode; - INIT_LIST_HEAD(&io->list); - atomic_set(&io->count, 1); + ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); + + if (io_end) { + io_end->inode = inode; + INIT_LIST_HEAD(&io_end->list); + INIT_LIST_HEAD(&io_end->list_vec); + atomic_set(&io_end->count, 1); } - return io; + return io_end; } void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (atomic_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || + list_empty(&io_end->list_vec)) { ext4_release_io_end(io_end); return; } @@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->handle, - io_end->inode, io_end->offset, - io_end->size); + err = ext4_convert_unwritten_io_end_vec(io_end->handle, + io_end); io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } @@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio) struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " - "(offset %llu size %ld starting block %llu)", + "starting block %llu)", bio->bi_status, inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); mapping_set_error(inode->i_mapping, @@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io, io->io_end = NULL; } -static int io_submit_init_bio(struct ext4_io_submit *io, - struct buffer_head *bh) +static void io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { struct bio *bio; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). + */ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - if (!bio) - return -ENOMEM; bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; @@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io, io->io_bio = bio; io->io_next_block = bh->b_blocknr; wbc_init_bio(io->io_wbc, bio); - return 0; } -static int io_submit_add_bh(struct ext4_io_submit *io, - struct inode *inode, - struct page *page, - struct buffer_head *bh) +static void io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct page *page, + struct buffer_head *bh) { int ret; @@ -388,9 +425,7 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init_bio(io, bh); - if (ret) - return ret; + io_submit_init_bio(io, bh); io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); @@ -398,7 +433,6 @@ submit_and_retry: goto submit_and_retry; wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; - return 0; } int ext4_bio_write_page(struct ext4_io_submit *io, @@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } - bounce_page = NULL; - goto out; + + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); + goto unlock; } } @@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh); - if (ret) { - /* - * We only get here on ENOMEM. Not much else - * we can do but mark the page as dirty, and - * better luck next time. - */ - break; - } + io_submit_add_bh(io, inode, + bounce_page ? bounce_page : page, bh); nr_submitted++; clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - /* Error stopped previous loop? Clean up buffers... */ - if (ret) { - out: - fscrypt_free_bounce_page(bounce_page); - printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); - do { - clear_buffer_async_write(bh); - bh = bh->b_this_page; - } while (bh != head); - } +unlock: unlock_page(page); /* Nothing submitted - we have to end page writeback */ if (!nr_submitted) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a30b203fa461..fef7755300c3 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio == NULL) { struct bio_post_read_ctx *ctx; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). + */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) - goto set_error_page; ctx = get_bio_post_read_ctx(inode, bio, page->index); if (IS_ERR(ctx)) { bio_put(bio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c0e9aef376a7..a8c0f2b5b6e1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, return bh; } -/* - * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh) +static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) { - int err; - - if (ext4_handle_has_enough_credits(handle, thresh)) - return 0; - - err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); - if (err) - return err; - } - - return 0; + return ext4_journal_ensure_credits_fn(handle, credits, + EXT4_MAX_TRANS_DATA, 0, 0); } /* @@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, continue; } - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); @@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; gdb = sb_getblk(sb, block); @@ -602,8 +584,8 @@ handle_bb: /* Initialize block bitmap of the @group */ block = group_data[i].block_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; bh = bclean(handle, sb, block); @@ -631,8 +613,8 @@ handle_ib: /* Initialize inode bitmap of the @group */ block = group_data[i].inode_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; /* Mark unused entries in inode bitmap used */ bh = bclean(handle, sb, block); @@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ - if (ext4_handle_valid(handle) && - handle->h_buffer_credits == 0 && - ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && - (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) break; if (meta_bg == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b3cbf8622eab..c54019979e80 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1388,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -1406,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = { .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, - .get_next_id = ext4_get_next_id, + .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2065,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -2119,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb, } } #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - return 0; - } - } return 1; } @@ -3569,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + unsigned def_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && - sbi->s_want_extra_isize == 0) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = 0; + return; + } + if (sbi->s_want_extra_isize < 4) { + sbi->s_want_extra_isize = def_extra_isize; if (ext4_has_feature_extra_isize(sb)) { if (sbi->s_want_extra_isize < le16_to_cpu(es->s_want_extra_isize)) @@ -3587,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) } } /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if ((sbi->s_want_extra_isize > sbi->s_inode_size) || + (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size)) { + sbi->s_want_extra_isize = def_extra_isize; ext4_msg(sb, KERN_INFO, "required extra inode space not available"); } @@ -4453,13 +4445,6 @@ no_journal: } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && - (blocksize != PAGE_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Unsupported blocksize for fs encryption"); - goto failed_mount_wq; - } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; @@ -6033,18 +6018,6 @@ out: } return len; } - -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid) -{ - const struct quota_format_ops *ops; - - if (!sb_has_quota_loaded(sb, qid->type)) - return -ESRCH; - ops = sb_dqopt(sb)->ops[qid->type]; - if (!ops || !ops->get_next_id) - return -ENOSYS; - return dquot_get_next_id(sb, qid); -} #endif static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 491f9ee4040e..8966a5439a22 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, return credits; } -static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, - int credits, struct buffer_head *bh, - bool dirty, bool block_csum) -{ - int error; - - if (!ext4_handle_valid(handle)) - return 0; - - if (handle->h_buffer_credits >= credits) - return 0; - - error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); - if (!error) - return 0; - if (error < 0) { - ext4_warning(inode->i_sb, "Extend journal (error %d)", error); - return error; - } - - if (bh && dirty) { - if (block_csum) - ext4_xattr_block_csum_set(inode, bh); - error = ext4_handle_dirty_metadata(handle, NULL, bh); - if (error) { - ext4_warning(inode->i_sb, "Handle metadata (error %d)", - error); - return error; - } - } - - error = ext4_journal_restart(handle, credits); - if (error) { - ext4_warning(inode->i_sb, "Restart journal (error %d)", error); - return error; - } - - if (bh) { - error = ext4_journal_get_write_access(handle, bh); - if (error) { - ext4_warning(inode->i_sb, - "Get write access failed (error %d)", - error); - return error; - } - } - return 0; -} - static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { @@ -1149,6 +1100,24 @@ cleanup: return saved_err; } +static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, bool block_csum, bool dirty) +{ + int error; + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + return 0; +} + static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, @@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } - err = ext4_xattr_ensure_credits(handle, parent, credits, bh, - dirty, block_csum); - if (err) { + err = ext4_journal_ensure_credits_fn(handle, credits, credits, + ext4_free_metadata_revoke_credits(parent->i_sb, 1), + ext4_xattr_restart_fn(handle, parent, bh, block_csum, + dirty)); + if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } + if (err > 0) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_warning_inode(ea_inode, + "Re-get write access err=%d", + err); + continue; + } + } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { @@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, flags & XATTR_CREATE); brelse(bh); - if (!ext4_handle_has_enough_credits(handle, credits)) { + if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } @@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct inode *ea_inode; int error; - error = ext4_xattr_ensure_credits(handle, inode, extra_credits, - NULL /* bh */, - false /* dirty */, - false /* block_csum */); - if (error) { + error = ext4_journal_ensure_credits(handle, extra_credits, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } |