diff options
author | Linus Torvalds | 2020-06-02 19:21:40 -0700 |
---|---|---|
committer | Linus Torvalds | 2020-06-02 19:21:40 -0700 |
commit | 16d91548d1057691979de4686693f0ff92f46000 (patch) | |
tree | 0ab23001a138badebf7702416753c44e277e0ff7 /fs/xfs/xfs_inode.c | |
parent | d9afbb3509900a953f5cf90bc57e793ee80c1108 (diff) | |
parent | 6dcde60efd946e38fac8d276a6ca47492103e856 (diff) |
Merge tag 'xfs-5.8-merge-8' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Darrick Wong:
"Most of the changes this cycle are refactoring of existing code in
preparation for things landing in the future.
We also fixed various problems and deficiencies in the quota
implementation, and (I hope) the last of the stale read vectors by
forcing write allocations to go through the unwritten state until the
write completes.
Summary:
- Various cleanups to remove dead code, unnecessary conditionals,
asserts, etc.
- Fix a linker warning caused by xfs stuffing '-g' into CFLAGS
redundantly.
- Tighten up our dmesg logging to ensure that everything is prefixed
with 'XFS' for easier grepping.
- Kill a bunch of typedefs.
- Refactor the deferred ops code to reduce indirect function calls.
- Increase type-safety with the deferred ops code.
- Make the DAX mount options a tri-state.
- Fix some error handling problems in the inode flush code and clean
up other inode flush warts.
- Refactor log recovery so that each log item recovery functions now
live with the other log item processing code.
- Fix some SPDX forms.
- Fix quota counter corruption if the fs crashes after running
quotacheck but before any dquots get logged.
- Don't fail metadata verification on zero-entry attr leaf blocks,
since they're just part of the disk format now due to a historic
lack of log atomicity.
- Don't allow SWAPEXT between files with different [ugp]id when
quotas are enabled.
- Refactor inode fork reading and verification to run directly from
the inode-from-disk function. This means that we now actually
guarantee that _iget'ted inodes are totally verified and ready to
go.
- Move the incore inode fork format and extent counts to the ifork
structure.
- Scalability improvements by reducing cacheline pingponging in
struct xfs_mount.
- More scalability improvements by removing m_active_trans from the
hot path.
- Fix inode counter update sanity checking to run /only/ on debug
kernels.
- Fix longstanding inconsistency in what error code we return when a
program hits project quota limits (ENOSPC).
- Fix group quota returning the wrong error code when a program hits
group quota limits.
- Fix per-type quota limits and grace periods for group and project
quotas so that they actually work.
- Allow extension of individual grace periods.
- Refactor the non-reclaim inode radix tree walking code to remove a
bunch of stupid little functions and straighten out the
inconsistent naming schemes.
- Fix a bug in speculative preallocation where we measured a new
allocation based on the last extent mapping in the file instead of
looking farther for the last contiguous space allocation.
- Force delalloc writes to unwritten extents. This closes a stale
disk contents exposure vector if the system goes down before the
write completes.
- More lockdep whackamole"
* tag 'xfs-5.8-merge-8' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (129 commits)
xfs: more lockdep whackamole with kmem_alloc*
xfs: force writes to delalloc regions to unwritten
xfs: refactor xfs_iomap_prealloc_size
xfs: measure all contiguous previous extents for prealloc size
xfs: don't fail unwritten extent conversion on writeback due to edquot
xfs: rearrange xfs_inode_walk_ag parameters
xfs: straighten out all the naming around incore inode tree walks
xfs: move xfs_inode_ag_iterator to be closer to the perag walking code
xfs: use bool for done in xfs_inode_ag_walk
xfs: fix inode ag walk predicate function return values
xfs: refactor eofb matching into a single helper
xfs: remove __xfs_icache_free_eofblocks
xfs: remove flags argument from xfs_inode_ag_walk
xfs: remove xfs_inode_ag_iterator_flags
xfs: remove unused xfs_inode_ag_iterator function
xfs: replace open-coded XFS_ICI_NO_TAG
xfs: move eofblocks conversion function to xfs_ioctl.c
xfs: allow individual quota grace period extension
xfs: per-type quota timers and warn limits
xfs: switch xfs_get_defquota to take explicit type
...
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- | fs/xfs/xfs_inode.c | 263 |
1 files changed, 89 insertions, 174 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d1772786af29..64f5f9a440ae 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -112,7 +112,7 @@ xfs_ilock_data_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -125,7 +125,8 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + if (ip->i_afp && + ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -825,7 +826,7 @@ xfs_ialloc( inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; - ip->i_d.di_nextents = 0; + ip->i_df.if_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_time(inode); @@ -851,7 +852,7 @@ xfs_ialloc( case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_format = XFS_DINODE_FMT_DEV; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -907,7 +908,7 @@ xfs_ialloc( } /* FALLTHROUGH */ case S_IFLNK: - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; @@ -915,11 +916,6 @@ xfs_ialloc( default: ASSERT(0); } - /* - * Attribute fork settings for new inode. - */ - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_anextents = 0; /* * Log the new values stuffed into the inode. @@ -1686,7 +1682,7 @@ xfs_inactive_truncate( if (error) goto error_trans_cancel; - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); error = xfs_trans_commit(tp); if (error) @@ -1836,7 +1832,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; error = xfs_qm_dqattach(ip); @@ -1862,7 +1858,6 @@ xfs_inactive( } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_forkoff == 0); /* @@ -2172,7 +2167,7 @@ xfs_iunlink_update_inode( ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); if (error) return error; @@ -2302,7 +2297,7 @@ xfs_iunlink_map_ino( return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); if (error) { xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); @@ -2602,7 +2597,7 @@ xfs_ifree_cluster( xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; struct xfs_log_item *lip; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -2662,7 +2657,7 @@ xfs_ifree_cluster( */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; + iip = (struct xfs_inode_log_item *)lip; ASSERT(iip->ili_logged == 1); lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, @@ -2712,24 +2707,6 @@ xfs_ifree_cluster( } /* - * Free any local-format buffers sitting around before we reset to - * extents format. - */ -static inline void -xfs_ifree_local_data( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return; - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); -} - -/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2749,8 +2726,7 @@ xfs_ifree( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_nextents == 0); - ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); @@ -2765,16 +2741,23 @@ xfs_ifree( if (error) return error; - xfs_ifree_local_data(ip, XFS_DATA_FORK); - xfs_ifree_local_data(ip, XFS_ATTR_FORK); + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ip->i_df.if_u1.if_data); + ip->i_df.if_u1.if_data = NULL; + ip->i_df.if_bytes = 0; + } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); @@ -3496,6 +3479,7 @@ xfs_iflush_cluster( struct xfs_inode **cilist; struct xfs_inode *cip; struct xfs_ino_geometry *igeo = M_IGEO(mp); + int error = 0; int nr_found; int clcount = 0; int i; @@ -3588,11 +3572,10 @@ xfs_iflush_cluster( * re-check that it's dirty before flushing. */ if (!xfs_inode_clean(cip)) { - int error; error = xfs_iflush_int(cip, bp); if (error) { xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto cluster_corrupt_out; + goto out_free; } clcount++; } else { @@ -3611,37 +3594,7 @@ out_free: kmem_free(cilist); out_put: xfs_perag_put(pag); - return 0; - - -cluster_corrupt_out: - /* - * Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - rcu_read_unlock(); - - /* - * We'll always have an inode attached to the buffer for completion - * process by the time we are called from xfs_iflush(). Hence we have - * always need to do IO completion processing to abort the inodes - * attached to the buffer. handle them just like the shutdown case in - * xfs_buf_submit(). - */ - ASSERT(bp->b_iodone); - bp->b_flags |= XBF_ASYNC; - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(cip, false); - kmem_free(cilist); - xfs_perag_put(pag); - return -EFSCORRUPTED; + return error; } /* @@ -3667,8 +3620,8 @@ xfs_iflush( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); *bpp = NULL; @@ -3688,42 +3641,20 @@ xfs_iflush( } /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an empty AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - goto abort_out; - } - - /* * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, we - * simply want to return with the inode still dirty. + * operation here, so we may get an EAGAIN error. In that case, return + * leaving the inode dirty. * * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode, so we treat it the same as failing - * xfs_iflush_int(). + * and we cannot flush the inode. Abort the flush and shut down. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, - 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } if (error) - goto corrupt_out; - - /* - * First flush out the inode that xfs_iflush was called with. - */ - error = xfs_iflush_int(ip, bp); - if (error) - goto corrupt_out; + goto abort; /* * If the buffer is pinned then push on the log now so we won't @@ -3733,61 +3664,32 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: try to gather other inodes into this write + * Flush the provided inode then attempt to gather others from the + * cluster into the write. * - * Note: Any error during clustering will result in the filesystem - * being shut down and completion callbacks run on the cluster buffer. - * As we have already flushed and attached this inode to the buffer, - * it has already been aborted and released by xfs_iflush_cluster() and - * so we have no further error handling to do here. + * Note: Once we attempt to flush an inode, we must run buffer + * completion callbacks on any failure. If this fails, simulate an I/O + * failure on the buffer and shut down. */ - error = xfs_iflush_cluster(ip, bp); - if (error) - return error; + error = xfs_iflush_int(ip, bp); + if (!error) + error = xfs_iflush_cluster(ip, bp); + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + goto shutdown; + } *bpp = bp; return 0; -corrupt_out: - if (bp) - xfs_buf_relse(bp); +abort: + xfs_iflush_abort(ip); +shutdown: xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -abort_out: - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(ip, false); return error; } -/* - * If there are inline format data / attr forks attached to this inode, - * make sure they're not corrupt. - */ -bool -xfs_inode_verify_forks( - struct xfs_inode *ip) -{ - struct xfs_ifork *ifp; - xfs_failaddr_t fa; - - fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); - return false; - } - - fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); - return false; - } - return true; -} - STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -3796,61 +3698,68 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); + /* + * We don't flush the inode if any of the following checks fail, but we + * do still update the log item and attach to the backing buffer as if + * the flush happened. This is a formality to facilitate predictable + * error handling as the caller will shutdown and fail the buffer. + */ + error = -EFSCORRUPTED; if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); - goto corrupt_out; + goto flush_out; } if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, ip); - goto corrupt_out; + goto flush_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); - goto corrupt_out; + goto flush_out; } /* @@ -3865,9 +3774,16 @@ xfs_iflush_int( if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; - /* Check the inline fork data before we write out. */ - if (!xfs_inode_verify_forks(ip)) - goto corrupt_out; + /* + * If there are inline format data / attr forks attached to this inode, + * make sure they are not corrupt. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_data(ip)) + goto flush_out; + if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_attr(ip)) + goto flush_out; /* * Copy the dirty parts of the inode into the on-disk inode. We always @@ -3910,6 +3826,8 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ + error = 0; +flush_out: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3919,10 +3837,10 @@ xfs_iflush_int( &iip->ili_item.li_lsn); /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. + * Attach the inode item callback to the buffer whether the flush + * succeeded or not. If not, the caller will shut down and fail I/O + * completion on the buffer to remove the inode from the AIL and release + * the flush lock. */ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); @@ -3931,10 +3849,7 @@ xfs_iflush_int( ASSERT(!list_empty(&bp->b_li_list)); ASSERT(bp->b_iodone != NULL); - return 0; - -corrupt_out: - return -EFSCORRUPTED; + return error; } /* Release an inode. */ |