diff options
104 files changed, 4689 insertions, 2675 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 04611a1068b4..b056cfc6398e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -102,6 +102,7 @@ xfs-y += xfs_log.o \ xfs_buf_item_recover.o \ xfs_dquot_item_recover.o \ xfs_extfree_item.o \ + xfs_attr_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b52ed339727f..d3f2886fdc08 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2511,7 +2511,7 @@ __xfs_free_extent_later( ASSERT(bno != NULLFSBLOCK); ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); agno = XFS_FSB_TO_AGNO(mp, bno); agbno = XFS_FSB_TO_AGBNO(mp, bno); @@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist( xfs_agblock_t bno; __be32 *agfl_bno; int error; - int logflags; + uint32_t logflags; struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag; @@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist( */ void xfs_alloc_log_agf( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields) /* mask of fields to be logged (XFS_AGF_...) */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte offset */ int last; /* last byte offset */ @@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist( struct xfs_perag *pag; __be32 *blockp; int error; - int logflags; + uint32_t logflags; __be32 *agfl_bno; int startoff; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d4c057b764f9..84ca09b2223f 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -121,7 +121,7 @@ void xfs_alloc_log_agf( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields);/* mask of fields to be logged (XFS_AGF_...) */ + uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* * Interface for inode allocation to force the pag data to be initialized. diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 23523b802539..14ae0826bc15 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -24,6 +24,11 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_attr_item.h" +#include "xfs_log.h" + +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; /* * xfs_attr.c @@ -53,26 +58,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac); +static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); -STATIC int xfs_attr_fillstate(xfs_da_state_t *state); -STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp); -STATIC int xfs_attr_node_removename(struct xfs_da_args *args, - struct xfs_da_state *state); int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip) || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0)) + if (!XFS_IFORK_Q(ip)) + return 0; + if (!ip->i_afp) + return 0; + if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0) return 0; return 1; } @@ -97,6 +98,123 @@ xfs_attr_is_leaf( return imap.br_startoff == 0 && imap.br_blockcount == 1; } +/* + * XXX (dchinner): name path state saving and refilling is an optimisation to + * avoid needing to look up name entries after rolling transactions removing + * remote xattr blocks between the name entry lookup and name entry removal. + * This optimisation got sidelined when combining the set and remove state + * machines, but the code has been left in place because it is worthwhile to + * restore the optimisation once the combined state machine paths have settled. + * + * This comment is a public service announcement to remind Future Dave that he + * still needs to restore this code to working order. + */ +#if 0 +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +static int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +static int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} +#else +static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; } +#endif + /*======================================================================== * Overall external interface routines. *========================================================================*/ @@ -166,7 +284,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -STATIC int +int xfs_attr_calc_size( struct xfs_da_args *args, int *local) @@ -199,6 +317,33 @@ xfs_attr_calc_size( return nblks; } +/* Initialize transaction reservation for attr operations */ +void +xfs_init_attr_trans( + struct xfs_da_args *args, + struct xfs_trans_res *tres, + unsigned int *total) +{ + struct xfs_mount *mp = args->dp->i_mount; + + if (args->value) { + tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; + *total = args->total; + } else { + *tres = M_RES(mp)->tr_attrrm; + *total = XFS_ATTRRM_SPACE_RES(mp); + } +} + +/* + * Add an attr to a shortform fork. If there is no space, + * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC. + * to use. + */ STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, @@ -230,411 +375,470 @@ xfs_attr_try_sf_addname( return error; } -/* - * Check to see if the attr should be upgraded from non-existent or shortform to - * single-leaf-block attribute list. - */ -static inline bool -xfs_attr_is_shortform( - struct xfs_inode *ip) +static int +xfs_attr_sf_addname( + struct xfs_attr_item *attr) { - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + int error = 0; + + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + ASSERT(!error || error == -EEXIST); + attr->xattri_dela_state = XFS_DAS_DONE; + goto out; + } + + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a concurrent AIL + * push cannot grab the half-baked leaf buffer and run into problems + * with the write verifier. + */ + xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); + attr->xattri_dela_state = XFS_DAS_LEAF_ADD; +out: + trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); + return error; } /* - * Checks to see if a delayed attribute transaction should be rolled. If so, - * transaction is finished or rolled as needed. + * Handle the state change on completion of a multi-state attr operation. + * + * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first + * modification in a attr replace operation and we still have to do the second + * state, indicated by @replace_state. + * + * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on + * completion of the second half of the attr replace operation we correctly + * signal that it is done. */ -STATIC int -xfs_attr_trans_roll( - struct xfs_delattr_context *dac) +static enum xfs_delattr_state +xfs_attr_complete_op( + struct xfs_attr_item *attr, + enum xfs_delattr_state replace_state) { - struct xfs_da_args *args = dac->da_args; - int error; + struct xfs_da_args *args = attr->xattri_da_args; + bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + args->op_flags &= ~XFS_DA_OP_REPLACE; + if (do_replace) { + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; + } + return XFS_DAS_DONE; +} + +static int +xfs_attr_leaf_addname( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + ASSERT(xfs_attr_is_leaf(args->dp)); + + /* + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. The held buffer is no longer valid + * after this call, regardless of the result. + */ + error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); + attr->xattri_leaf_bp = NULL; + + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; - if (dac->flags & XFS_DAC_DEFER_FINISH) { /* - * The caller wants us to finish all the deferred ops so that we - * avoid pinning the log tail with a large number of deferred - * ops. + * We're not in leaf format anymore, so roll the transaction and + * retry the add to the newly allocated node block. */ - dac->flags &= ~XFS_DAC_DEFER_FINISH; - error = xfs_defer_finish(&args->trans); - } else - error = xfs_trans_roll_inode(&args->trans, args->dp); + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + goto out; + } + if (error) + return error; + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. + */ + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_LEAF_REPLACE); +out: + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; } /* - * Set the attribute specified in @args. + * Add an entry to a node format attr tree. + * + * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell + * the difference between leaf + remote attr blocks and a node format tree, + * so we may still end up having to convert from leaf to node format here. */ -int -xfs_attr_set_args( - struct xfs_da_args *args) +static int +xfs_attr_node_addname( + struct xfs_attr_item *attr) { - struct xfs_buf *leaf_bp = NULL; - int error = 0; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_da_args *args = attr->xattri_da_args; + int error; - do { - error = xfs_attr_set_iter(&dac, &leaf_bp); - if (error != -EAGAIN) - break; + ASSERT(!attr->xattri_leaf_bp); + + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) { - if (leaf_bp) - xfs_trans_brelse(args->trans, leaf_bp); + error = xfs_attr_node_try_addname(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) return error; - } - } while (true); + /* + * No state change, we really are in node form now + * but we need the transaction rolled to continue. + */ + goto out; + } + if (error) + return error; + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_NODE_REPLACE); +out: + trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); return error; } -STATIC int -xfs_attr_sf_addname( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static int +xfs_attr_rmtval_alloc( + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; + struct xfs_da_args *args = attr->xattri_da_args; int error = 0; /* - * Try to add the attr to the attribute list in the inode. + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. */ - error = xfs_attr_try_sf_addname(dp, args); + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); + if (error) + return error; + /* Roll the transaction only if there is more to allocate. */ + if (attr->xattri_blkcnt > 0) + goto out; + } - /* Should only be 0, -EEXIST or -ENOSPC */ - if (error != -ENOSPC) + error = xfs_attr_rmtval_set_value(args); + if (error) return error; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + ++attr->xattri_dela_state); /* - * It won't fit in the shortform, transform to a leaf block. GROT: - * another possible req'mt for a double-split btree op. + * If we are not doing a rename, we've finished the operation but still + * have to clear the incomplete flag protecting the new attr from + * exposing partially initialised state if we crash during creation. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); - if (error) - return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + error = xfs_attr3_leaf_clearflag(args); +out: + trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +static int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. */ - xfs_trans_bhold(args->trans, *leaf_bp); + error = xfs_attr_fillstate(state); + if (error) + return error; /* - * We're still in XFS_DAS_UNINIT state here. We've converted - * the attr fork to leaf format and will restart with the leaf - * add. + * Mark the attribute as INCOMPLETE */ - trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); - dac->flags |= XFS_DAC_DEFER_FINISH; - return -EAGAIN; + return xfs_attr3_leaf_setflag(args); } /* - * Set the attribute specified in @args. - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - * returned. + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. */ -int -xfs_attr_set_iter( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static +int xfs_attr_node_removename_setup( + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = NULL; - int forkoff, error = 0; - - /* State machine switch */ - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - /* - * If the fork is shortform, attempt to add the attr. If there - * is no space, this converts to leaf format and returns - * -EAGAIN with the leaf buffer held across the roll. The caller - * will deal with a transaction roll error, but otherwise - * release the hold once we return with a clean transaction. - */ - if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(dac, leaf_bp); - if (*leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, *leaf_bp); - *leaf_bp = NULL; - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state **state = &attr->xattri_da_state; + int error; - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, *leaf_bp); - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the - * transaction once more. The goal here is to - * call node_addname with the inode and - * transaction in the same state (inode locked - * and joined, transaction clean) no matter how - * we got to this step. - * - * At this point, we are still in - * XFS_DAS_UNINIT, but when we come back, we'll - * be a node, so we'll fall down into the node - * handling code below - */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } else if (error) { - return error; - } + error = xfs_attr_node_hasname(args, state); + if (error != -EEXIST) + goto out; + error = 0; - dac->dela_state = XFS_DAS_FOUND_LBLK; - } else { - error = xfs_attr_node_addname_find_attr(dac); - if (error) - return error; + ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); + ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_node_addname(dac); - if (error) - return error; + error = xfs_attr_leaf_mark_incomplete(args, *state); + if (error) + goto out; + if (args->rmtblkno > 0) + error = xfs_attr_rmtval_invalidate(args); +out: + if (error) + xfs_da_state_free(*state); - dac->dela_state = XFS_DAS_FOUND_NBLK; - } - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FOUND_LBLK: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ + return error; +} - /* Open coded xfs_attr_rmtval_set without trans handling */ - if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { - dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; - } - } +/* + * Remove the original attr we have just replaced. This is dependent on the + * original lookup and insert placing the old attr in args->blkno/args->index + * and the new attr in args->blkno2/args->index2. + */ +static int +xfs_attr_leaf_remove_attr( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff; + int error; - /* - * Repeat allocating remote blocks for the attr value until - * blkcnt drops to zero. - */ - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return(dac->dela_state, - args->dp); - return -EAGAIN; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; + xfs_attr3_leaf_remove(bp, args); - /* - * If this is not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - return error; - } + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in - * series. - */ - dac->dela_state = XFS_DAS_FLIP_LFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FLIP_LFLAG: - /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; + return error; +} - fallthrough; - case XFS_DAS_RM_LBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_LBLK; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - if (error) - return error; +/* + * Shrink an attribute from leaf to shortform. Used by the node format remove + * path when the node format collapses to a single block and so we have to check + * if it can be collapsed further. + */ +static int +xfs_attr_leaf_shrink( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp; + int forkoff; + int error; - dac->dela_state = XFS_DAS_RD_LEAF; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } + if (!xfs_attr_is_leaf(dp)) + return 0; - fallthrough; - case XFS_DAS_RD_LEAF: - /* - * This is the last step for leaf format. Read the block with - * the old attr, remove the old attr, check for shortform - * conversion and return. - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; - xfs_attr3_leaf_remove(bp, args); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else { + xfs_trans_brelse(args->trans, bp); + } - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ + return error; +} - return error; +/* + * Run the attribute operation specified in @attr. + * + * This routine is meant to function as a delayed operation and will set the + * state to XFS_DAS_DONE when the operation is complete. Calling functions will + * need to handle this, and recall the function until either an error or + * XFS_DAS_DONE is detected. + */ +int +xfs_attr_set_iter( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; - case XFS_DAS_FOUND_NBLK: - /* - * Find space for remote blocks and fall into the allocation - * state. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; + /* State machine switch */ +next_state: + switch (attr->xattri_dela_state) { + case XFS_DAS_UNINIT: + ASSERT(0); + return -EFSCORRUPTED; + case XFS_DAS_SF_ADD: + return xfs_attr_sf_addname(attr); + case XFS_DAS_LEAF_ADD: + return xfs_attr_leaf_addname(attr); + case XFS_DAS_NODE_ADD: + return xfs_attr_node_addname(attr); + + case XFS_DAS_SF_REMOVE: + error = xfs_attr_sf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_LEAF_REMOVE: + error = xfs_attr_leaf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_NODE_REMOVE: + error = xfs_attr_node_removename_setup(attr); + if (error == -ENOATTR && + (args->op_flags & XFS_DA_OP_RECOVERY)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + error = 0; + break; } + if (error) + return error; + attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; + if (args->rmtblkno == 0) + attr->xattri_dela_state++; + break; + case XFS_DAS_LEAF_SET_RMT: + case XFS_DAS_NODE_SET_RMT: + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; + attr->xattri_dela_state++; fallthrough; - case XFS_DAS_ALLOC_NODE: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - dac->dela_state = XFS_DAS_ALLOC_NODE; - if (args->rmtblkno > 0) { - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - } - /* - * If this was not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - goto out; - } + case XFS_DAS_LEAF_ALLOC_RMT: + case XFS_DAS_NODE_ALLOC_RMT: + error = xfs_attr_rmtval_alloc(attr); + if (error) + return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + break; + goto next_state; + case XFS_DAS_LEAF_REPLACE: + case XFS_DAS_NODE_REPLACE: /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * We must "flip" the incomplete flags on the "new" and "old" + * attribute/value pairs so that one disappears and one appears + * atomically. */ error = xfs_attr3_leaf_flipflags(args); if (error) - goto out; + return error; /* - * Commit the flag value change and start the next trans in - * series + * We must commit the flag value change now to make it atomic + * and then we can start the next trans in series at REMOVE_OLD. */ - dac->dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + break; - case XFS_DAS_FLIP_NFLAG: + case XFS_DAS_LEAF_REMOVE_OLD: + case XFS_DAS_NODE_REMOVE_OLD: /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). + * If we have a remote attr, start the process of removing it + * by invalidating any cached buffers. + * + * If we don't have a remote attr, we skip the remote block + * removal state altogether with a second state increment. */ xfs_attr_restore_rmt_blk(args); - - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - fallthrough; - case XFS_DAS_RM_NBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - + error = xfs_attr_rmtval_invalidate(args); if (error) return error; + } else { + attr->xattri_dela_state++; + } + + attr->xattri_dela_state++; + goto next_state; - dac->dela_state = XFS_DAS_CLR_FLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + case XFS_DAS_LEAF_REMOVE_RMT: + case XFS_DAS_NODE_REMOVE_RMT: + error = xfs_attr_rmtval_remove(attr); + if (error == -EAGAIN) { + error = 0; + break; } + if (error) + return error; - fallthrough; - case XFS_DAS_CLR_FLAG: /* - * The last state for node format. Look up the old attr and - * remove it. + * We've finished removing the remote attr blocks, so commit the + * transaction and move on to removing the attr name from the + * leaf/node block. Removing the attr might require a full + * transaction reservation for btree block freeing, so we + * can't do that in the same transaction where we removed the + * remote attr blocks. */ - error = xfs_attr_node_addname_clear_incomplete(dac); + attr->xattri_dela_state++; + break; + + case XFS_DAS_LEAF_REMOVE_ATTR: + error = xfs_attr_leaf_remove_attr(attr); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + + case XFS_DAS_NODE_REMOVE_ATTR: + error = xfs_attr_node_remove_attr(attr); + if (!error) + error = xfs_attr_leaf_shrink(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); break; default: ASSERT(0); break; } -out: + + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return error; } @@ -668,30 +872,79 @@ xfs_attr_lookup( return xfs_attr_node_hasname(args, NULL); } -/* - * Remove the attribute specified in @args. - */ -int -xfs_attr_remove_args( +static int +xfs_attr_item_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_item **attr) /* new xfs_attr_item */ +{ + + struct xfs_attr_item *new; + + new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); + new->xattri_op_flags = op_flags; + new->xattri_da_args = args; + + *attr = new; + return 0; +} + +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_add( struct xfs_da_args *args) { - int error; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_attr_item *new; + int error = 0; - do { - error = xfs_attr_remove_iter(&dac); - if (error != -EAGAIN) - break; + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) - return error; + new->xattri_dela_state = xfs_attr_init_add_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - } while (true); + return 0; +} - return error; +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_replace( + struct xfs_da_args *args) +{ + struct xfs_attr_item *new; + int error = 0; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_replace_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); + + return 0; +} + +/* Removes an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_remove( + struct xfs_da_args *args) +{ + + struct xfs_attr_item *new; + int error; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_remove_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); + + return 0; } /* @@ -709,6 +962,7 @@ xfs_attr_set( int error, local; int rmt_blks = 0; unsigned int total; + int delayed = xfs_has_larp(mp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; @@ -730,8 +984,6 @@ xfs_attr_set( if (args->value) { XFS_STATS_INC(mp, xs_attr_set); - - args->op_flags |= XFS_DA_OP_ADDNAME; args->total = xfs_attr_calc_size(args, &local); /* @@ -748,61 +1000,68 @@ xfs_attr_set( return error; } - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - total = args->total; - if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); - - tres = M_RES(mp)->tr_attrrm; - total = XFS_ATTRRM_SPACE_RES(mp); rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } + if (delayed) { + error = xfs_attr_use_log_assist(mp); + if (error) + return error; + } + /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ + xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) - return error; + goto drop_incompat; if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(args->trans, dp, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } error = xfs_attr_lookup(args); - if (args->value) { - if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) - goto out_trans_cancel; - if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_trans_cancel; - if (error != -ENOATTR && error != -EEXIST) + switch (error) { + case -EEXIST: + /* if no value, we are performing a remove operation */ + if (!args->value) { + error = xfs_attr_defer_remove(args); + break; + } + /* Pure create fails if the attr already exists */ + if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - error = xfs_attr_set_args(args); - if (error) - goto out_trans_cancel; - /* shortform attribute has already been committed */ - if (!args->trans) - goto out_unlock; - } else { - if (error != -EEXIST) + error = xfs_attr_defer_replace(args); + break; + case -ENOATTR: + /* Can't remove what isn't there. */ + if (!args->value) goto out_trans_cancel; - error = xfs_attr_remove_args(args); - if (error) + /* Pure replace fails if no existing attr to replace. */ + if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; + + error = xfs_attr_defer_add(args); + break; + default: + goto out_trans_cancel; } + if (error) + goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -821,6 +1080,9 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); +drop_incompat: + if (delayed) + xlog_drop_incompat_feat(mp->m_log); return error; out_trans_cancel: @@ -829,6 +1091,40 @@ out_trans_cancel: goto out_unlock; } +int __init +xfs_attri_init_cache(void) +{ + xfs_attri_cache = kmem_cache_create("xfs_attri", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + + return xfs_attri_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attri_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attri_cache); + xfs_attri_cache = NULL; +} + +int __init +xfs_attrd_init_cache(void) +{ + xfs_attrd_cache = kmem_cache_create("xfs_attrd", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + + return xfs_attrd_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attrd_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attrd_cache); + xfs_attrd_cache = NULL; +} + /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -845,28 +1141,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) * Add a name to the shortform attribute list structure * This is the external routine. */ -STATIC int -xfs_attr_shortform_addname(xfs_da_args_t *args) +static int +xfs_attr_shortform_addname( + struct xfs_da_args *args) { - int newsize, forkoff, retval; + int newsize, forkoff; + int error; trace_xfs_attr_sf_addname(args); - retval = xfs_attr_shortform_lookup(args); - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - return retval; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) - return retval; - retval = xfs_attr_sf_removename(args); - if (retval) - return retval; + error = xfs_attr_shortform_lookup(args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + return error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + return error; + + error = xfs_attr_sf_removename(args); + if (error) + return error; + /* - * Since we have removed the old attr, clear ATTR_REPLACE so - * that the leaf format add routine won't trip over the attr - * not being around. + * Since we have removed the old attr, clear XFS_DA_OP_REPLACE + * so that the new attr doesn't fit in shortform format, the + * leaf format add routine won't trip over the attr not being + * around. */ - args->attr_flags &= ~XATTR_REPLACE; + args->op_flags &= ~XFS_DA_OP_REPLACE; + break; + case 0: + break; + default: + return error; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -889,8 +1198,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ -/* Store info about a remote block */ -STATIC void +/* Save the current remote block info and clear the current pointers. */ +static void xfs_attr_save_rmt_blk( struct xfs_da_args *args) { @@ -899,10 +1208,13 @@ xfs_attr_save_rmt_blk( args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* Set stored info about a remote block */ -STATIC void +static void xfs_attr_restore_rmt_blk( struct xfs_da_args *args) { @@ -928,45 +1240,54 @@ xfs_attr_leaf_try_add( struct xfs_da_args *args, struct xfs_buf *bp) { - int retval; + int error; /* - * Look up the given attribute in the leaf block. Figure out if - * the given flags produce an error or call for an atomic rename. + * If the caller provided a buffer to us, it is locked and held in + * the transaction because it just did a shortform to leaf conversion. + * Hence we don't need to read it again. Otherwise read in the leaf + * buffer. */ - retval = xfs_attr_leaf_hasname(args, &bp); - if (retval != -ENOATTR && retval != -EEXIST) - return retval; - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_brelse; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + if (bp) { + xfs_trans_bhold_release(args->trans, bp); + } else { + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + } + + /* + * Look up the xattr name to set the insertion point for the new xattr. + */ + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto out_brelse; trace_xfs_attr_leaf_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - xfs_attr_save_rmt_blk(args); - /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - /* - * Add the attribute to the leaf block - */ return xfs_attr3_leaf_add(bp, args); out_brelse: xfs_trans_brelse(args->trans, bp); - return retval; + return error; } /* @@ -1012,9 +1333,10 @@ xfs_attr_leaf_removename( dp = args->dp; error = xfs_attr_leaf_hasname(args, &bp); - if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; return error; } else if (error != -EEXIST) return error; @@ -1098,46 +1420,45 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - int retval; + struct xfs_da_args *args = attr->xattri_da_args; + int error; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - retval = xfs_attr_node_hasname(args, &dac->da_state); - if (retval != -ENOATTR && retval != -EEXIST) - goto error; - - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto error; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + error = xfs_attr_node_hasname(args, &attr->xattri_da_state); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto error; - trace_xfs_attr_node_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - xfs_attr_save_rmt_blk(args); + trace_xfs_attr_node_replace(args); /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto error; } return 0; error: - if (dac->da_state) - xfs_da_state_free(dac->da_state); - return retval; + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + return error; } /* @@ -1146,21 +1467,13 @@ error: * This will involve walking down the Btree, and may involve splitting * leaf nodes and even splitting intermediate nodes up to and including * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - * - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - *returned. */ -STATIC int -xfs_attr_node_addname( - struct xfs_delattr_context *dac) +static int +xfs_attr_node_try_addname( + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; @@ -1175,25 +1488,9 @@ xfs_attr_node_addname( /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* - * have been a b-tree. + * have been a b-tree. Let the caller deal with this. */ - xfs_da_state_free(state); - state = NULL; - error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - - /* - * Now that we have converted the leaf to a node, we can - * roll the transaction, and try xfs_attr3_leaf_add - * again on re-entry. No need to set dela_state to do - * this. dela_state is still unset by this function at - * this point. - */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_node_addname_return( - dac->dela_state, args->dp); - return -EAGAIN; + goto out; } /* @@ -1205,7 +1502,6 @@ xfs_attr_node_addname( error = xfs_da3_split(state); if (error) goto out; - dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. @@ -1214,24 +1510,42 @@ xfs_attr_node_addname( } out: - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } +static int +xfs_attr_node_removename( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *blk; + int retval; -STATIC int -xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac) + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + return retval; +} + +static int +xfs_attr_node_remove_attr( + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = NULL; int retval = 0; int error = 0; /* - * Re-find the "old" attribute entry after any split ops. The INCOMPLETE - * flag means that we will find the "old" attr, not the "new" one. + * The attr we are removing has already been marked incomplete, so + * we need to set the filter appropriately to re-find the "old" + * attribute entry after any split ops. */ args->attr_filter |= XFS_ATTR_INCOMPLETE; state = xfs_da_state_alloc(args); @@ -1261,362 +1575,6 @@ out: } /* - * Shrink an attribute from leaf to shortform - */ -STATIC int -xfs_attr_node_shrink( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_inode *dp = args->dp; - int error, forkoff; - struct xfs_buf *bp; - - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - } else - xfs_trans_brelse(args->trans, bp); - - return error; -} - -/* - * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers - * for later deletion of the entry. - */ -STATIC int -xfs_attr_leaf_mark_incomplete( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - int error; - - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - return error; - - /* - * Mark the attribute as INCOMPLETE - */ - return xfs_attr3_leaf_setflag(args); -} - -/* - * Initial setup for xfs_attr_node_removename. Make sure the attr is there and - * the blocks are valid. Attr keys with remote blocks will be marked - * incomplete. - */ -STATIC -int xfs_attr_node_removename_setup( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state **state = &dac->da_state; - int error; - - error = xfs_attr_node_hasname(args, state); - if (error != -EEXIST) - goto out; - error = 0; - - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == - XFS_ATTR_LEAF_MAGIC); - - if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, *state); - if (error) - goto out; - - error = xfs_attr_rmtval_invalidate(args); - } -out: - if (error) - xfs_da_state_free(*state); - - return error; -} - -STATIC int -xfs_attr_node_removename( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_da_state_blk *blk; - int retval; - - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[state->path.active-1]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); - - return retval; -} - -/* - * Remove the attribute specified in @args. - * - * This will involve walking down the Btree, and may involve joining - * leaf nodes and even joining intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * This routine is meant to function as either an in-line or delayed operation, - * and may return -EAGAIN when the transaction needs to be rolled. Calling - * functions will need to handle this, and call the function until a - * successful error code is returned. - */ -int -xfs_attr_remove_iter( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; - int retval, error = 0; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_node_removename(args); - - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - - /* - * Shortform or leaf formats don't require transaction rolls and - * thus state transitions. Call the right helper and return. - */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_removename(args); - - if (xfs_attr_is_leaf(dp)) - return xfs_attr_leaf_removename(args); - - /* - * Node format may require transaction rolls. Set up the - * state context and fall into the state machine. - */ - if (!dac->da_state) { - error = xfs_attr_node_removename_setup(dac); - if (error) - return error; - state = dac->da_state; - } - - fallthrough; - case XFS_DAS_RMTBLK: - dac->dela_state = XFS_DAS_RMTBLK; - - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a - * deadlock. - */ - if (args->rmtblkno > 0) { - /* - * May return -EAGAIN. Roll and repeat until all remote - * blocks are removed. - */ - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) { - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return error; - } else if (error) { - goto out; - } - - /* - * Refill the state structure with buffers (the prior - * calls released our buffers) and close out this - * transaction before proceeding. - */ - ASSERT(args->rmtblkno == 0); - error = xfs_attr_refillstate(state); - if (error) - goto out; - dac->dela_state = XFS_DAS_RM_NAME; - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_NAME: - /* - * If we came here fresh from a transaction roll, reattach all - * the buffers to the current transaction. - */ - if (dac->dela_state == XFS_DAS_RM_NAME) { - error = xfs_attr_refillstate(state); - if (error) - goto out; - } - - retval = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. If so, roll - * the transacton and fall into the shrink state. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - - dac->flags |= XFS_DAC_DEFER_FINISH; - dac->dela_state = XFS_DAS_RM_SHRINK; - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_SHRINK: - /* - * If the result is small enough, push it all into the inode. - * This is our final state so it's safe to return a dirty - * transaction. - */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); - ASSERT(error != -EAGAIN); - break; - default: - ASSERT(0); - error = -EINVAL; - goto out; - } -out: - if (state) - xfs_da_state_free(state); - return error; -} - -/* - * Fill in the disk block numbers in the state structure for the buffers - * that are attached to the state structure. - * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commits have released these buffers. - */ -STATIC int -xfs_attr_fillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level; - - trace_xfs_attr_fillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". - */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - return 0; -} - -/* - * Reattach the buffers to the state structure based on the disk block - * numbers stored in the state structure. - * This is done after some set of transaction commits have released those - * buffers from our grip. - */ -STATIC int -xfs_attr_refillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level, error; - - trace_xfs_attr_refillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } - - /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". - */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } - - return 0; -} - -/* * Retrieve the attribute data from a node attribute list. * * This routine gets called for any attribute fork that has more than one diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 5e71f719bdd5..1af7abe29eef 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -28,6 +28,15 @@ struct xfs_attr_list_context; */ #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ +static inline bool xfs_has_larp(struct xfs_mount *mp) +{ +#ifdef DEBUG + return xfs_globals.larp; +#else + return false; +#endif +} + /* * Kernel-internal version of the attrlist cursor. */ @@ -425,7 +434,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_delattr_context.da_state + * Enum values for xfs_attr_item.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. A return code of -EAGAIN signals the @@ -434,46 +443,105 @@ struct xfs_attr_list_context { * to where it was and resume executing where it left off. */ enum xfs_delattr_state { - XFS_DAS_UNINIT = 0, /* No state has been set yet */ - XFS_DAS_RMTBLK, /* Removing remote blks */ - XFS_DAS_RM_NAME, /* Remove attr name */ - XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ - XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ - XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ - XFS_DAS_RD_LEAF, /* Read in the new leaf */ - XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ - XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ - XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ - XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + + /* + * Initial sequence states. The replace setup code relies on the + * ADD and REMOVE states for a specific format to be sequential so + * that we can transform the initial operation to be performed + * according to the xfs_has_larp() state easily. + */ + XFS_DAS_SF_ADD, /* Initial sf add state */ + XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */ + + XFS_DAS_LEAF_ADD, /* Initial leaf add state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */ + + XFS_DAS_NODE_ADD, /* Initial node add state */ + XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */ + + /* Leaf state set/replace/remove sequence */ + XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ + XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ + XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ + XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ + + /* Node state sequence, must match leaf state above */ + XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ + XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ + XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */ + + XFS_DAS_DONE, /* finished operation */ }; +#define XFS_DAS_STRINGS \ + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ + { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ + { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ + { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ + { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ + { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ + { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } + /* - * Defines for xfs_delattr_context.flags + * Defines for xfs_attr_item.xattri_flags */ -#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ +#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/ /* * Context used for keeping track of delayed attribute operations */ -struct xfs_delattr_context { - struct xfs_da_args *da_args; +struct xfs_attr_item { + struct xfs_da_args *xattri_da_args; + + /* + * Used by xfs_attr_set to hold a leaf buffer across a transaction roll + */ + struct xfs_buf *xattri_leaf_bp; /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; + struct xfs_bmbt_irec xattri_map; + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *da_state; + struct xfs_da_state *xattri_da_state; /* Used to keep track of current state of delayed operation */ - unsigned int flags; - enum xfs_delattr_state dela_state; + unsigned int xattri_flags; + enum xfs_delattr_state xattri_dela_state; + + /* + * Attr operation being performed - XFS_ATTR_OP_FLAGS_* + */ + unsigned int xattri_op_flags; + + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; }; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -489,11 +557,81 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_remove_iter(struct xfs_delattr_context *dac); +int xfs_attr_set_iter(struct xfs_attr_item *attr); +int xfs_attr_remove_iter(struct xfs_attr_item *attr); bool xfs_attr_namecheck(const void *name, size_t length); -void xfs_delattr_context_init(struct xfs_delattr_context *dac, - struct xfs_da_args *args); +int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); + +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + +int __init xfs_attri_init_cache(void); +void xfs_attri_destroy_cache(void); +int __init xfs_attrd_init_cache(void); +void xfs_attrd_destroy_cache(void); + +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0); +} + +static inline enum xfs_delattr_state +xfs_attr_init_add_state(struct xfs_da_args *args) +{ + /* + * When called from the completion of a attr remove to determine the + * next state, the attribute fork may be null. This can occur only occur + * on a pure remove, but we grab the next state before we check if a + * replace operation is being performed. If we are called from any other + * context, i_afp is guaranteed to exist. Hence if the attr fork is + * null, we were called from a pure remove operation and so we are done. + */ + if (!args->dp->i_afp) + return XFS_DAS_DONE; + + args->op_flags |= XFS_DA_OP_ADDNAME; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_ADD; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_ADD; + return XFS_DAS_NODE_ADD; +} + +static inline enum xfs_delattr_state +xfs_attr_init_remove_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_REMOVE; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_REMOVE; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_REMOVE; + return XFS_DAS_NODE_REMOVE; +} + +/* + * If we are logging the attributes, then we have to start with removal of the + * old attribute so that there is always consistent state that we can recover + * from if the system goes down part way through. We always log the new attr + * value, so even when we remove the attr first we still have the information in + * the log to finish the replace operation atomically. + */ +static inline enum xfs_delattr_state +xfs_attr_init_replace_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; + if (xfs_has_larp(args->dp->i_mount)) + return xfs_attr_init_remove_state(args); + return xfs_attr_init_add_state(args); +} #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 014daa8c542d..15a990409463 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -28,6 +28,7 @@ #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_errortag.h" /* @@ -310,6 +311,15 @@ xfs_attr3_leaf_verify( return fa; /* + * Empty leaf blocks should never occur; they imply the existence of a + * software bug that needs fixing. xfs_repair also flags them as a + * corruption that needs fixing, so we should never let these go to + * disk. + */ + if (ichdr.count == 0) + return __this_address; + + /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -445,6 +455,14 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ +/* + * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE + * flag on disk - if there's an incomplete attr then recovery needs to tear it + * down. If there's no incomplete attr, then recovery needs to tear that attr + * down to replace it with the attr that has been logged. In this case, the + * INCOMPLETE flag will not be set in attr->attr_filter, but rather + * XFS_DA_OP_RECOVERY will be set in args->op_flags. + */ static bool xfs_attr_match( struct xfs_da_args *args, @@ -452,14 +470,18 @@ xfs_attr_match( unsigned char *name, int flags) { + if (args->namelen != namelen) return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* - * If we are looking for incomplete entries, show only those, else only - * show complete entries. - */ + + /* Recovery ignores the INCOMPLETE flag. */ + if ((args->op_flags & XFS_DA_OP_RECOVERY) && + args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return true; + + /* All remaining matches need to be filtered by INCOMPLETE state. */ if (args->attr_filter != (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) return false; @@ -798,6 +820,14 @@ xfs_attr_sf_removename( sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); + + /* + * If we are recovering an operation, finding nothing to + * remove is not an error - it just means there was nothing + * to clean up. + */ + if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) + return 0; if (error != -EEXIST) return error; size = xfs_attr_sf_entsize(sfe); @@ -818,7 +848,7 @@ xfs_attr_sf_removename( totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & XFS_DA_OP_ADDNAME)) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(xfs_has_attr2(dp->i_mount)); - ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_remove(dp, args->trans); + /* + * Don't remove the attr fork if this operation is the first + * part of a attr replace operations. We're going to add a new + * attr immediately, so we need to keep the attr fork around in + * this case. + */ + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + ASSERT(xfs_has_attr2(dp->i_mount)); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + } goto out; } @@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + error = -EIO; + goto out; + } + error = xfs_da_grow_inode(args, &blkno); if (error) goto out; @@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work( entry->flags = args->attr_filter; if (tmp) entry->flags |= XFS_ATTR_LOCAL; - if (args->op_flags & XFS_DA_OP_RENAME) { - entry->flags |= XFS_ATTR_INCOMPLETE; + if (args->op_flags & XFS_DA_OP_REPLACE) { + if (!xfs_has_larp(mp)) + entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 83b95be9ded8..4250159ecced 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,14 +568,14 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_bmbt_irec *map = &attr->xattri_map; int error; - dac->lblkno = 0; - dac->blkcnt = 0; + attr->xattri_lblkno = 0; + attr->xattri_blkcnt = 0; args->rmtblkcnt = 0; args->rmtblkno = 0; memset(map, 0, sizeof(struct xfs_bmbt_irec)); @@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space( if (error) return error; - dac->blkcnt = args->rmtblkcnt; - dac->lblkno = args->rmtblkno; + attr->xattri_blkcnt = args->rmtblkcnt; + attr->xattri_lblkno = args->rmtblkno; return 0; } @@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_bmbt_irec *map = &attr->xattri_map; int nmap; int error; nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, - dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + error = xfs_bmapi_write(args->trans, dp, + (xfs_fileoff_t)attr->xattri_lblkno, + attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total, map, &nmap); if (error) return error; @@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk( (map->br_startblock != HOLESTARTBLOCK)); /* roll attribute extent map forwards */ - dac->lblkno += map->br_blockcount; - dac->blkcnt -= map->br_blockcount; + attr->xattri_lblkno += map->br_blockcount; + attr->xattri_blkcnt -= map->br_blockcount; return 0; } @@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; int error, done; /* @@ -695,8 +696,8 @@ xfs_attr_rmtval_remove( * the parent */ if (!done) { - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); + trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d72eff30ca18..62b398edec3f 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_attr_item *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); -int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74198dd82b03..6833110d1bd4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels( xfs_mount_t *mp, /* file system mount structure */ int whichfork) /* data or attr fork */ { + uint64_t maxblocks; /* max blocks at this level */ + xfs_extnum_t maxleafents; /* max leaf entries possible */ int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum number of - * leaf entries, is controlled by the size of the on-disk extent count, - * either a signed 32-bit number for the data fork, or a signed 16-bit - * number for the attr fork. + * The maximum number of extents in a fork, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count. * * Note that we can no longer assume that if we are in ATTR1 that the * fork offset of all the inodes will be @@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels( * ATTR2 we have to assume the worst case scenario of a minimum size * available. */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; + maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; + else sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); @@ -468,7 +466,7 @@ error0: if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -485,7 +483,7 @@ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) @@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ @@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; @@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; /* @@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, whichfork); @@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay( */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* @@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, - int flags) + uint32_t flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); @@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align( /* * For large extent hint sizes, the aligned extent might be larger than - * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls - * the length back under MAXEXTLEN. The outer allocation loops handle - * short allocation just fine, so it is safe to do this. We only want to - * do it when we are forced to, though, because it means more allocation - * operations are required. + * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so + * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer + * allocation loops handle short allocation just fine, so it is safe to + * do this. We only want to do it when we are forced to, though, because + * it means more allocation operations are required. */ - while (align_alen > MAXEXTLEN) + while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; - ASSERT(align_alen <= MAXEXTLEN); + ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation @@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - /* see MAXEXTLEN handling above */ + /* see XFS_BMBT_MAX_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || - align_alen + extsz > MAXEXTLEN); + align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG @@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map( xfs_fileoff_t obno, xfs_fileoff_t end, int n, - int flags) + uint32_t flags) { if ((flags & XFS_BMAPI_ENTIRE) || got->br_startoff + got->br_blockcount <= obno) { @@ -3811,7 +3809,7 @@ xfs_bmapi_update_map( xfs_fileoff_t obno, xfs_fileoff_t end, int *n, - int flags) + uint32_t flags) { xfs_bmbt_irec_t *mval = *map; @@ -3864,7 +3862,7 @@ xfs_bmapi_read( xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); @@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc( * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ - alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) @@ -4104,7 +4102,7 @@ xfs_bmapi_allocate( if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) bma->prev.br_startoff = NULLFILEOFF; } else { - bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); if (!bma->eof) bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); @@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags) + uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); @@ -4312,7 +4310,7 @@ xfs_bmapi_write( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ + uint32_t flags, /* XFS_BMAPI_... */ xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ @@ -4424,8 +4422,8 @@ xfs_bmapi_write( * xfs_extlen_t and therefore 32 bits. Hence we have to * check for 32-bit overflows and handle them here. */ - if (len > (xfs_filblks_t)MAXEXTLEN) - bma.length = MAXEXTLEN; + if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) + bma.length = XFS_MAX_BMBT_EXTLEN; else bma.length = len; @@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_may_overflow(ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc( bma.ip = ip; bma.wasdel = true; bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, + XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* @@ -4629,7 +4630,7 @@ xfs_bmapi_remap( xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4641,7 +4642,7 @@ xfs_bmapi_remap( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); - ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); @@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay( int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); int error = 0; bool isrt; @@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; - int state = BMAP_COWFORK; + uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); @@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ - int bflags) /* bmapi flags */ + uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ @@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; mp = ip->i_mount; @@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real( * Deleting the middle of the extent. */ - /* - * For directories, -ENOSPC is returned since a directory entry - * remove operation must not fail due to low extent count - * availability. -ENOSPC will be handled by higher layers of XFS - * by letting the corresponding empty Data/Free blocks to linger - * until a future remove operation. Dabtree blocks would be - * swapped with the last block in the leaf space and then the - * new last block will be unmapped. - * - * The above logic also applies to the source directory entry of - * a rename operation. - */ - error = xfs_iext_count_may_overflow(ip, whichfork, 1); - if (error) { - ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && - whichfork == XFS_DATA_FORK); - error = -ENOSPC; - goto done; - } - old = got; got.br_blockcount = del->br_startoff - got.br_startoff; @@ -5281,7 +5262,7 @@ __xfs_bunmapi( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ - int flags, /* misc flags */ + uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ @@ -5299,7 +5280,6 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ - xfs_fileoff_t max_len; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5318,16 +5298,6 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - /* - * Guesstimate how many blocks we can unmap without running the risk of - * blowing out the transaction with a mix of EFIs and reflink - * adjustments. - */ - if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) - max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); - else - max_len = len; - error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -5366,7 +5336,7 @@ __xfs_bunmapi( extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && - (nexts == 0 || extno < nexts) && max_len > 0) { + (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. @@ -5400,14 +5370,6 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - /* How much can we safely unmap? */ - if (max_len < del.br_blockcount) { - del.br_startoff += del.br_blockcount - max_len; - if (!wasdel) - del.br_startblock += del.br_blockcount - max_len; - del.br_blockcount = max_len; - } - if (!isrt) goto delete; @@ -5543,7 +5505,6 @@ delete: if (error) goto error0; - max_len -= del.br_blockcount; end = del.br_startoff - 1; nodelete: /* @@ -5609,7 +5570,7 @@ xfs_bunmapi( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_extnum_t nexts, int *done) { @@ -5641,7 +5602,7 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) return false; return true; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 03d9aaf87413..16db95b11589 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -39,7 +39,7 @@ struct xfs_bmalloca { bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ - int flags; + uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 @@ -47,17 +47,17 @@ struct xfs_bmalloca { /* * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ -#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ +#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ +#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents @@ -65,7 +65,7 @@ struct xfs_bmalloca { * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ -#define XFS_BMAPI_ZERO 0x080 +#define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily @@ -75,16 +75,16 @@ struct xfs_bmalloca { * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ -#define XFS_BMAPI_REMAP 0x100 +#define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ -#define XFS_BMAPI_COWFORK 0x200 +#define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ -#define XFS_BMAPI_NODISCARD 0x1000 +#define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ -#define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w) (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } -static inline int xfs_bmapi_whichfork(int bmapi_flags) +static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; @@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* * Flags for xfs_bmap_add_extent*. */ -#define BMAP_LEFT_CONTIG (1 << 0) -#define BMAP_RIGHT_CONTIG (1 << 1) -#define BMAP_LEFT_FILLING (1 << 2) -#define BMAP_RIGHT_FILLING (1 << 3) -#define BMAP_LEFT_DELAY (1 << 4) -#define BMAP_RIGHT_DELAY (1 << 5) -#define BMAP_LEFT_VALID (1 << 6) -#define BMAP_RIGHT_VALID (1 << 7) -#define BMAP_ATTRFORK (1 << 8) -#define BMAP_COWFORK (1 << 9) +#define BMAP_LEFT_CONTIG (1u << 0) +#define BMAP_RIGHT_CONTIG (1u << 1) +#define BMAP_LEFT_FILLING (1u << 2) +#define BMAP_RIGHT_FILLING (1u << 3) +#define BMAP_LEFT_DELAY (1u << 4) +#define BMAP_RIGHT_DELAY (1u << 5) +#define BMAP_LEFT_VALID (1u << 6) +#define BMAP_RIGHT_VALID (1u << 7) +#define BMAP_ATTRFORK (1u << 8) +#define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ @@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); + int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, @@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -static inline int xfs_bmap_fork_to_state(int whichfork) +static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: @@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags); + uint32_t flags); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 453309fc85f2..2b77d45c215f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -597,7 +597,11 @@ xfs_bmbt_maxrecs( return xfs_bmbt_block_maxrecs(blocklen, leaf); } -/* Compute the max possible height for block mapping btrees. */ +/* + * Calculate the maximum possible height of the btree that the on-disk format + * supports. This is used for sizing structures large enough to support every + * possible configuration of a filesystem that might get mounted. + */ unsigned int xfs_bmbt_maxlevels_ondisk(void) { @@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void) minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; /* One extra level for the inode root. */ - return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1; + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1; } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c1500b238520..2aa300f7461f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -51,6 +51,52 @@ xfs_btree_magic( return magic; } +static xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + xfs_fsblock_t sibling) +{ + if (sibling == NULLFSBLOCK) + return NULL; + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t sibling) +{ + if (sibling == NULLAGBLOCK) + return NULL; + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(mp, agno, sibling)) + return __this_address; + } + return NULL; +} + /* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. @@ -65,6 +111,8 @@ __xfs_btree_check_lblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -83,16 +131,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + be64_to_cpu(block->bb_u.l.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + be64_to_cpu(block->bb_u.l.bb_rightsib)); + return fa; } /* Check a long btree block header. */ @@ -130,6 +178,9 @@ __xfs_btree_check_sblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -146,16 +197,18 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) { + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); + } + + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + be32_to_cpu(block->bb_u.s.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, + agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); + return fa; } /* Check a short btree block header. */ @@ -751,20 +804,20 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - int64_t imask; /* mask for current bit number */ + uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ - for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; @@ -773,7 +826,7 @@ xfs_btree_offsets( /* * Find the highest bit, so the last byte offset. */ - for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; @@ -1456,7 +1509,7 @@ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ + uint32_t fields) /* mask of fields: XFS_BB_... */ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ @@ -4271,6 +4324,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4445,20 +4513,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + be64_to_cpu(block->bb_u.l.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + be64_to_cpu(block->bb_u.l.bb_rightsib)); + return fa; } /** @@ -4499,7 +4568,9 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) @@ -4507,14 +4578,13 @@ xfs_btree_sblock_verify( /* sibling pointer verification */ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return __this_address; - - return NULL; + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + be32_to_cpu(block->bb_u.s.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + be32_to_cpu(block->bb_u.s.bb_rightsib)); + return fa; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 22d9f411fde6..eef27858a013 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. */ -#define XFS_BB_MAGIC (1 << 0) -#define XFS_BB_LEVEL (1 << 1) -#define XFS_BB_NUMRECS (1 << 2) -#define XFS_BB_LEFTSIB (1 << 3) -#define XFS_BB_RIGHTSIB (1 << 4) -#define XFS_BB_BLKNO (1 << 5) -#define XFS_BB_LSN (1 << 6) -#define XFS_BB_UUID (1 << 7) -#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_MAGIC (1u << 0) +#define XFS_BB_LEVEL (1u << 1) +#define XFS_BB_NUMRECS (1u << 2) +#define XFS_BB_LEFTSIB (1u << 3) +#define XFS_BB_RIGHTSIB (1u << 4) +#define XFS_BB_BLKNO (1u << 5) +#define XFS_BB_LSN (1u << 6) +#define XFS_BB_UUID (1u << 7) +#define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 -#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) +#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -345,7 +345,7 @@ xfs_btree_dup_cursor( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ -void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9dc1ecb9713d..aa74f3fdb571 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_errortag.h" /* * xfs_da_btree.c @@ -482,6 +483,9 @@ xfs_da3_split( trace_xfs_da_split(state->args); + if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + return -EIO; + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0faf7d9ac241..ed2303e4d46a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -30,6 +30,7 @@ struct xfs_da_geometry { unsigned int free_hdr_size; /* dir2 free header size */ unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + xfs_extnum_t max_extents; /* Max. extents in corresponding fork */ xfs_dir2_data_aoff_t data_first_offset; size_t data_entry_offset; @@ -76,27 +77,31 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - int op_flags; /* operation flags */ + uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* * Operation flags: */ -#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ -#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ -#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ -#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ -#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ +#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ +#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */ +#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ +#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ +#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ +#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ +#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ - { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_REPLACE, "REPLACE" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" } + { XFS_DA_OP_NOTIME, "NOTIME" }, \ + { XFS_DA_OP_REMOVE, "REMOVE" }, \ + { XFS_DA_OP_RECOVERY, "RECOVERY" } /* * Storage for holding state during Btree searches and split/join ops. @@ -197,7 +202,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, * Utility routines. */ -#define XFS_DABUF_MAP_HOLE_OK (1 << 0) +#define XFS_DABUF_MAP_HOLE_OK (1u << 0) int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5a49caa5c9df..25e2841084e1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) * Directory address space divided into sections, * spaces separated by 32GB. */ +#define XFS_DIR2_MAX_SPACES 3 #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) @@ -688,10 +689,10 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) +#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) /* diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 0805ade2d300..ceb222b4f261 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -22,6 +22,10 @@ #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" +#include "xfs_buf.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -184,9 +188,10 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; -static void +static bool xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, @@ -197,6 +202,7 @@ xfs_defer_create_intent( if (!dfp->dfp_intent) dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + return dfp->dfp_intent != NULL; } /* @@ -204,16 +210,18 @@ xfs_defer_create_intent( * associated extents, then add the entire intake list to the end of * the pending list. */ -STATIC void +static bool xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; + bool ret = false; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { trace_xfs_defer_create_intent(tp->t_mountp, dfp); - xfs_defer_create_intent(tp, dfp, true); + ret |= xfs_defer_create_intent(tp, dfp, true); } + return ret; } /* Abort all the intents that were committed. */ @@ -487,7 +495,7 @@ int xfs_defer_finish_noroll( struct xfs_trans **tp) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); @@ -506,17 +514,20 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. */ - xfs_defer_create_intents(*tp); + bool has_intents = xfs_defer_create_intents(*tp); + list_splice_init(&(*tp)->t_dfops, &dop_pending); - error = xfs_defer_trans_roll(tp); - if (error) - goto out_shutdown; + if (has_intents || dfp) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; - /* Possibly relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + /* Relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + } dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); @@ -774,17 +785,25 @@ xfs_defer_ops_continue( struct xfs_trans *tp, struct xfs_defer_resources *dres) { + unsigned int i; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); - /* Lock and join the captured inode to the new transaction. */ + /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_lock(dfc->dfc_held.dr_bp[i]); + + /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); + dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. */ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); @@ -854,7 +873,12 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - + error = xfs_attri_init_cache(); + if (error) + goto err; + error = xfs_attrd_init_cache(); + if (error) + goto err; return 0; err: xfs_defer_destroy_item_caches(); @@ -865,6 +889,8 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_attri_destroy_cache(); + xfs_attrd_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7bb8a31ad65b..114a3a4930a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -19,6 +19,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, + XFS_DEFER_OPS_TYPE_ATTR, XFS_DEFER_OPS_TYPE_MAX, }; @@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_attr_defer_type; + /* * Deferred operation item relogging limits. diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 5f1e4799e8fa..3cd51fa3837b 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -150,6 +150,8 @@ xfs_da_mount( dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> + mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ @@ -161,6 +163,12 @@ xfs_da_mount( dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + + if (xfs_has_large_extent_counts(mp)) + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + else + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a23a52e643ad..5362908164b0 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -59,7 +59,10 @@ #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 -#define XFS_ERRTAG_MAX 39 +#define XFS_ERRTAG_LARP 39 +#define XFS_ERRTAG_DA_LEAF_SPLIT 40 +#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 +#define XFS_ERRTAG_MAX 42 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -103,5 +106,8 @@ #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 +#define XFS_RANDOM_LARP 1 +#define XFS_RANDOM_DA_LEAF_SPLIT 1 +#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d665c04e69dd..afdfc8108c5f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ XFS_SB_FEAT_INCOMPAT_META_UUID| \ XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ + XFS_SB_FEAT_INCOMPAT_NREXT64) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature( return (sbp->sb_features_incompat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ + (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( @@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } +static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +{ + return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); +} static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) @@ -525,26 +534,26 @@ typedef struct xfs_agf { #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 -#define XFS_AGF_REFCOUNT_ROOT 0x00008000 -#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 -#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_MAGICNUM (1u << 0) +#define XFS_AGF_VERSIONNUM (1u << 1) +#define XFS_AGF_SEQNO (1u << 2) +#define XFS_AGF_LENGTH (1u << 3) +#define XFS_AGF_ROOTS (1u << 4) +#define XFS_AGF_LEVELS (1u << 5) +#define XFS_AGF_FLFIRST (1u << 6) +#define XFS_AGF_FLLAST (1u << 7) +#define XFS_AGF_FLCOUNT (1u << 8) +#define XFS_AGF_FREEBLKS (1u << 9) +#define XFS_AGF_LONGEST (1u << 10) +#define XFS_AGF_BTREEBLKS (1u << 11) +#define XFS_AGF_UUID (1u << 12) +#define XFS_AGF_RMAP_BLOCKS (1u << 13) +#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) +#define XFS_AGF_REFCOUNT_ROOT (1u << 15) +#define XFS_AGF_REFCOUNT_LEVEL (1u << 16) +#define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ @@ -619,22 +628,22 @@ typedef struct xfs_agi { #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_MAGICNUM (1u << 0) +#define XFS_AGI_VERSIONNUM (1u << 1) +#define XFS_AGI_SEQNO (1u << 2) +#define XFS_AGI_LENGTH (1u << 3) +#define XFS_AGI_COUNT (1u << 4) +#define XFS_AGI_ROOT (1u << 5) +#define XFS_AGI_LEVEL (1u << 6) +#define XFS_AGI_FREECOUNT (1u << 7) +#define XFS_AGI_NEWINO (1u << 8) +#define XFS_AGI_DIRINO (1u << 9) +#define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1u << 11) +#define XFS_AGI_FREE_LEVEL (1u << 12) +#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ #define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ @@ -791,16 +800,41 @@ struct xfs_dinode { __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + __be64 di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + __be64 di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + __u8 di_v2_pad[6]; + __be16 di_flushiter; + }; + }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + __be32 di_nextents; + __be16 di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + __be32 di_big_anextents; + __be16 di_nrext64_pad; + } __packed; + } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ @@ -870,6 +904,56 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_UUID, "uuid" } /* + * Max values for extnum and aextnum. + * + * The original on-disk extent counts were held in signed fields, resulting in + * maximum extent counts of 2^31 and 2^15 for the data and attr forks + * respectively. Similarly the maximum extent length is limited to 2^21 blocks + * by the 21-bit wide blockcount field of a BMBT extent record. + * + * The newly introduced data fork extent counter can hold a 64-bit value, + * however the maximum number of extents in a file is also limited to 2^54 + * extents by the 54-bit wide startoff field of a BMBT extent record. + * + * It is further limited by the maximum supported file size of 2^63 + * *bytes*. This leads to a maximum extent count for maximally sized filesystem + * blocks (64kB) of: + * + * 2^63 bytes / 2^16 bytes per block = 2^47 blocks + * + * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence + * 2^48 was chosen as the maximum data fork extent count. + * + * The maximum file size that can be represented by the data fork extent counter + * in the worst case occurs when all extents are 1 block in length and each + * block is 1KB in size. + * + * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and + * with 1KB sized blocks, a file can reach upto, + * 1KB * (2^31) = 2TB + * + * This is much larger than the theoretical maximum size of a directory + * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. + * + * Hence, a directory inode can never overflow its data fork extent counter. + */ +#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) +#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) + +/* + * When we upgrade an inode to the large extent counts, the maximum value by + * which the extent count can increase is bound by the change in size of the + * on-disk field. No upgrade operation should ever be adding more than a few + * tens of extents, so if we get a really large value it is a sign of a code bug + * or corruption. + */ +#define XFS_MAX_EXTCNT_UPGRADE_NR \ + min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ + XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) + +/* * Inode minimum and maximum sizes. */ #define XFS_DINODE_MIN_LOG 8 @@ -918,10 +1002,6 @@ enum xfs_dinode_fmt { ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) /* * For block and character special files the 32bit dev_t is stored at the @@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); } +static inline bool xfs_dinode_has_large_extent_counts( + const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ -#define XFS_DQTYPE_USER 0x01 /* user dquot record */ -#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ -#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ -#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ +#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ +#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ +#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ +#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ @@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) +#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) + /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 505533c43a92..1cfd5bc6520a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ +#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ /* * Minimum and maximum sizes need for growth checks. @@ -377,7 +378,7 @@ struct xfs_bulkstat { uint32_t bs_extsize_blks; /* extent size hint, blocks */ uint32_t bs_nlink; /* number of links */ - uint32_t bs_extents; /* number of extents */ + uint32_t bs_extents; /* 32-bit data fork extent counter */ uint32_t bs_aextents; /* attribute number of extents */ uint16_t bs_version; /* structure version */ uint16_t bs_forkoff; /* inode fork offset in bytes */ @@ -386,8 +387,9 @@ struct xfs_bulkstat { uint16_t bs_checked; /* checked inode metadata */ uint16_t bs_mode; /* type and mode */ uint16_t bs_pad2; /* zeroed */ + uint64_t bs_extents64; /* 64-bit data fork extent counter */ - uint64_t bs_pad[7]; /* zeroed */ + uint64_t bs_pad[6]; /* zeroed */ }; #define XFS_BULKSTAT_VERSION_V1 (1) @@ -459,17 +461,28 @@ struct xfs_bulk_ireq { * Only return results from the specified @agno. If @ino is zero, start * with the first inode of @agno. */ -#define XFS_BULK_IREQ_AGNO (1 << 0) +#define XFS_BULK_IREQ_AGNO (1U << 0) /* * Return bulkstat information for a single inode, where @ino value is a * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* * values below. Not compatible with XFS_BULK_IREQ_AGNO. */ -#define XFS_BULK_IREQ_SPECIAL (1 << 1) +#define XFS_BULK_IREQ_SPECIAL (1U << 1) -#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ - XFS_BULK_IREQ_SPECIAL) +/* + * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign + * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use + * xfs_bulkstat->bs_extents for returning data fork extent count and set + * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and + * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than + * XFS_MAX_EXTCNT_DATA_FORK_OLD. + */ +#define XFS_BULK_IREQ_NREXT64 (1U << 2) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL | \ + XFS_BULK_IREQ_NREXT64) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -699,34 +712,34 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ -#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) +#define XFS_SCRUB_IFLAG_REPAIR (1u << 0) /* o: Metadata object needs repair. */ -#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) +#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1) /* * o: Metadata object could be optimized. It's not corrupt, but * we could improve on it somehow. */ -#define XFS_SCRUB_OFLAG_PREEN (1 << 2) +#define XFS_SCRUB_OFLAG_PREEN (1u << 2) /* o: Cross-referencing failed. */ -#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) +#define XFS_SCRUB_OFLAG_XFAIL (1u << 3) /* o: Metadata object disagrees with cross-referenced metadata. */ -#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) +#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4) /* o: Scan was not complete. */ -#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) +#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5) /* o: Metadata object looked funny but isn't corrupt. */ -#define XFS_SCRUB_OFLAG_WARNING (1 << 6) +#define XFS_SCRUB_OFLAG_WARNING (1u << 6) /* * o: IFLAG_REPAIR was set but metadata object did not need fixing or * optimization and has therefore not been altered. */ -#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b418fe0c0679..bf2f4bc89193 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2414,9 +2414,9 @@ out_drop: */ void xfs_ialloc_log_agi( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* allocation group header buffer */ - int fields) /* bitmask of fields to log */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ @@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry( igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + if (xfs_has_large_extent_counts(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 8b5c2b709022..a7705b6a1fd3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -60,7 +60,7 @@ void xfs_ialloc_log_agi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* allocation group header buffer */ - int fields); /* bitmask of fields to log */ + uint32_t fields); /* bitmask of fields to log */ /* * Read in the allocation group header (inode allocation section) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cae9708c8587..3b1b63f9d886 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -279,6 +279,25 @@ xfs_inode_to_disk_ts( return ts; } +static inline void +xfs_inode_to_disk_iext_counters( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); + /* + * We might be upgrading the inode to use larger extent counters + * than was previously used. Hence zero the unused field. + */ + to->di_nrext64_pad = cpu_to_be16(0); + } else { + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); + } +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -296,7 +315,6 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); @@ -307,8 +325,6 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(ip->i_disk_size); to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = cpu_to_be16(ip->i_diflags); @@ -323,11 +339,14 @@ xfs_inode_to_disk( to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = cpu_to_be16(ip->i_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t @@ -336,20 +355,40 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents; + xfs_extnum_t max_extents; + mode_t mode = be16_to_cpu(dip->di_mode); + uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); + uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); + + di_nextents = xfs_dfork_nextents(dip, whichfork); + + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. + * + * For all types, check that when the size says the should be in extent + * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISDIR(mode) || S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } - switch (XFS_DFORK_FORMAT(dip, whichfork)) { + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* - * no local regular files yet + * No local regular files yet. */ - if (whichfork == XFS_DATA_FORK) { - if (S_ISREG(be16_to_cpu(dip->di_mode))) - return __this_address; - if (be64_to_cpu(dip->di_size) > - XFS_DFORK_SIZE(dip, mp, whichfork)) - return __this_address; - } + if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) + return __this_address; if (di_nextents) return __this_address; break; @@ -358,12 +397,11 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - if (whichfork == XFS_ATTR_FORK) { - if (di_nextents > MAXAEXTNUM) - return __this_address; - } else if (di_nextents > MAXEXTNUM) { + max_extents = xfs_iext_max_nextents( + xfs_dinode_has_large_extent_counts(dip), + whichfork); + if (di_nextents > max_extents) return __this_address; - } break; default: return __this_address; @@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_nrext64( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) { + if (!xfs_has_large_extent_counts(mp)) + return __this_address; + if (dip->di_nrext64_pad != 0) + return __this_address; + } else if (dip->di_version >= 3) { + if (dip->di_v3_pad != 0) + return __this_address; + } + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -407,6 +463,9 @@ xfs_dinode_verify( uint16_t flags; uint64_t flags2; uint64_t di_size; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; @@ -437,10 +496,19 @@ xfs_dinode_verify( if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) + return fa; + + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + /* Fork checks carried over from xfs_iformat_fork */ - if (mode && - be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks)) + if (mode && nextents + naextents > nblocks) + return __this_address; + + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) @@ -497,7 +565,7 @@ xfs_dinode_verify( default: return __this_address; } - if (dip->di_anextents) + if (naextents) return __this_address; } @@ -639,7 +707,7 @@ xfs_inode_validate_extsize( if (extsize_bytes % blocksize_bytes) return __this_address; - if (extsize > MAXEXTLEN) + if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) @@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize( if (cowextsize_bytes % mp->m_sb.sb_blocksize) return __this_address; - if (cowextsize > MAXEXTLEN) + if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (cowextsize > mp->m_sb.sb_agblocks / 2) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9149f4f796fc..1a4cdf550f6d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -36,7 +36,7 @@ xfs_init_local_fork( int64_t size) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int mem_size = size, real_size = 0; + int mem_size = size; bool zero_terminate; /* @@ -50,8 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -105,7 +104,7 @@ xfs_iformat_extents( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; @@ -117,8 +116,8 @@ xfs_iformat_extents( * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); + xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", + ip->i_ino, nex); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); @@ -230,7 +229,7 @@ xfs_iformat_data_fork( * depend on it. */ ip->i_df.if_format = dip->di_format; - ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + ip->i_df.if_nextents = xfs_dfork_data_extents(dip); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -295,14 +294,14 @@ xfs_iformat_attr_fork( struct xfs_inode *ip, struct xfs_dinode *dip) { + xfs_extnum_t naextents = xfs_dfork_attr_extents(dip); int error = 0; /* * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, - be16_to_cpu(dip->di_anextents)); + ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents); switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_LOCAL: @@ -497,12 +496,7 @@ xfs_idata_realloc( return; } - /* - * For inline data, the underlying buffer must be a multiple of 4 bytes - * in size so that it can be logged and stay on word boundaries. - * We enforce that here. - */ - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } @@ -744,7 +738,8 @@ xfs_iext_count_may_overflow( if (whichfork == XFS_COW_FORK) return 0; - max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), + whichfork); if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) max_exts = 10; @@ -755,3 +750,27 @@ xfs_iext_count_may_overflow( return 0; } + +/* + * Upgrade this inode's extent counter fields to be able to handle a potential + * increase in the extent count by nr_to_add. Normally this is the same + * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. + */ +int +xfs_iext_count_upgrade( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint nr_to_add) +{ + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + + if (!xfs_has_large_extent_counts(ip->i_mount) || + xfs_inode_has_large_extent_counts(ip) || + XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + return -EFBIG; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 3d64a3acb0ed..4f68c1f20beb 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -21,9 +21,9 @@ struct xfs_ifork { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ - xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -40,19 +40,6 @@ struct xfs_ifork { #define XFS_IEXT_PUNCH_HOLE_CNT (1) /* - * Directory entry addition can cause the following, - * 1. Data block can be added/removed. - * A new extent can cause extent count to increase by 1. - * 2. Free disk block can be added/removed. - * Same behaviour as described above for Data block. - * 3. Dabtree blocks. - * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new - * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. - */ -#define XFS_IEXT_DIR_MANIP_CNT(mp) \ - ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) - -/* * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to * be added. One extra extent for dabtree in case a local attr is * large enough to cause a double split. It can also cause extent @@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } +static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + case XFS_COW_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_DATA_FORK_LARGE; + return XFS_MAX_EXTCNT_DATA_FORK_SMALL; + + case XFS_ATTR_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + return XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + + default: + ASSERT(0); + return 0; + } +} + +static inline xfs_extnum_t +xfs_dfork_data_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be64_to_cpu(dip->di_big_nextents); + + return be32_to_cpu(dip->di_nextents); +} + +static inline xfs_extnum_t +xfs_dfork_attr_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be32_to_cpu(dip->di_big_anextents); + + return be16_to_cpu(dip->di_anextents); +} + +static inline xfs_extnum_t +xfs_dfork_nextents( + struct xfs_dinode *dip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_dfork_data_extents(dip); + case XFS_ATTR_FORK: + return xfs_dfork_attr_extents(dip); + default: + ASSERT(0); + break; + } + + return 0; +} + struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); @@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); +int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, + uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. */ static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b322db523d65..f7edd1ecf6d9 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr) /* Log Clients */ #define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ @@ -114,7 +113,12 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_MAX 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_MAX 30 + /* * Flags to log operation header @@ -237,6 +241,8 @@ typedef struct xfs_trans_header { #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 +#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ +#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -252,7 +258,9 @@ typedef struct xfs_trans_header { { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ - { XFS_LI_BUD, "XFS_LI_BUD" } + { XFS_LI_BUD, "XFS_LI_BUD" }, \ + { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ + { XFS_LI_ATTRD, "XFS_LI_ATTRD" } /* * Inode Log Item Format definitions. @@ -388,16 +396,41 @@ struct xfs_log_dinode { uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ - uint8_t di_pad[6]; /* unused, zeroed space */ - uint16_t di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + uint64_t di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + uint64_t di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ + uint16_t di_flushiter; /* V2 inode incremented on flush */ + }; + }; xfs_log_timestamp_t di_atime; /* time last accessed */ xfs_log_timestamp_t di_mtime; /* time last modified */ xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + uint32_t di_nextents; + uint16_t di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + uint32_t di_big_anextents; + uint16_t di_nrext64_pad; + } __packed; + } __packed; uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ @@ -869,4 +902,36 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; +/* + * Flags for deferred attribute operations. + * Upper bits are flags, lower byte is type code + */ +#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * This is the structure used to lay out an attr log item in the + * log. + */ +struct xfs_attri_log_format { + uint16_t alfi_type; /* attri log item type */ + uint16_t alfi_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfi_id; /* attri identifier */ + uint64_t alfi_ino; /* the inode for this attr operation */ + uint32_t alfi_op_flags; /* marks the op as a set or remove */ + uint32_t alfi_name_len; /* attr name length */ + uint32_t alfi_value_len; /* attr value length */ + uint32_t alfi_attr_flags;/* attr flags */ +}; + +struct xfs_attrd_log_format { + uint16_t alfd_type; /* attrd log item type */ + uint16_t alfd_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfd_alf_id; /* id of corresponding attri */ +}; + #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index ff69a0000817..32e216255cb0 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops; extern const struct xlog_recover_item_ops xlog_rud_item_ops; extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; +extern const struct xlog_recover_item_ops xlog_attri_item_ops; +extern const struct xlog_recover_item_ops xlog_attrd_item_ops; /* * Macros, structures, prototypes for internal log manager use. diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 67798ff5e14e..9975b93a7412 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -14,6 +14,7 @@ #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" +#include "xfs_trace.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res( } /* + * Compute an alternate set of log reservation sizes for use exclusively with + * minimum log size calculations. + */ +static void +xfs_log_calc_trans_resv_for_minlogblocks( + struct xfs_mount *mp, + struct xfs_trans_resv *resv) +{ + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + + xfs_trans_resv_calc(mp, resv); + + if (xfs_has_reflink(mp)) { + /* + * In the early days of reflink, typical log operation counts + * were greatly overestimated. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + resv->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + } else if (xfs_has_rmapbt(mp)) { + /* + * In the early days of non-reflink rmap, the impact of rmapbt + * updates on log counts were not taken into account at all. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + } + + /* + * In the early days of reflink, we did not use deferred refcount + * update log items, so log reservations must be recomputed using the + * old calculations. + */ + resv->tr_write.tr_logres = + xfs_calc_write_reservation_minlogsize(mp); + resv->tr_itruncate.tr_logres = + xfs_calc_itruncate_reservation_minlogsize(mp); + resv->tr_qm_dqalloc.tr_logres = + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); + + /* Put everything back the way it was. This goes at the end. */ + mp->m_rmap_maxlevels = rmap_maxlevels; +} + +/* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. @@ -46,19 +106,25 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { + struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; + unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); - resp = (struct xfs_trans_res *)M_RES(mp); - end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); - for (; resp < end_resp; resp++) { + xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); + + resp = (struct xfs_trans_res *)&resv; + end_resp = (struct xfs_trans_res *)(&resv + 1); + for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; + + trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ @@ -66,9 +132,10 @@ xfs_log_get_max_trans_res( } if (attr_space > log_space) { - *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } + trace_xfs_log_get_max_trans_res(mp, max_resp); } /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index a02c5062f9b2..cb035da3f990 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -16,7 +16,6 @@ * and quota-limits. This is a waste in the common case, but hey ... */ typedef uint64_t xfs_qcnt_t; -typedef uint16_t xfs_qwarncnt_t; typedef uint8_t xfs_dqtype_t; @@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t; /* * flags for q_flags field in the dquot. */ -#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ +#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ { XFS_DQFLAG_DIRTY, "DIRTY" }, \ @@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */ +#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */ +#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be * modified. */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 +#define XFS_QMOPT_RES_REGBLKS (1u << 7) +#define XFS_QMOPT_RES_RTBLKS (1u << 8) +#define XFS_QMOPT_BCOUNT (1u << 9) +#define XFS_QMOPT_ICOUNT (1u << 10) +#define XFS_QMOPT_RTBCOUNT (1u << 11) +#define XFS_QMOPT_DELBCOUNT (1u << 12) +#define XFS_QMOPT_DELRTBCOUNT (1u << 13) +#define XFS_QMOPT_RES_INOS (1u << 14) /* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_INHERIT (1u << 31) + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } /* * flags to xfs_trans_mod_dquot. @@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 327ba25e9e17..97e9e6020596 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -886,8 +886,13 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_ag.refc.shape_changes * - xfs_allocfree_log_count(cur->bc_mp, 1); + /* + * Worst case estimate: full splits of the free space and rmap btrees + * to handle each of the shape changes to the refcount btree. + */ + overhead = xfs_allocfree_block_count(cur->bc_mp, + cur->bc_ag.refc.shape_changes); + overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -960,6 +965,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). */ + cur->bc_ag.refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -970,7 +976,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); + cur->bc_ag.refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9eb01edbd89d..e8b322de7f3d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, * log (plus any key updates) so we'll conservatively assume 32 bytes * per record. We must also leave space for btree splits on both ends * of the range and space for the CUD and a new CUI. + * + * Each EFI that we attach to the transaction is assumed to consume ~32 bytes. + * This is a low estimate for an EFI tracking a single extent (16 bytes for the + * EFI header, 16 for the extent, and 12 for the xlog op header), but the + * estimate is acceptable if there's more than one extent being freed. + * In the worst case of freeing every other block during a refcount decrease + * operation, we amortize the space used for one EFI log item across 16 + * extents. */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) -{ - return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; -} - extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); union xfs_btree_rec; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index cd322174dbff..2845019d31da 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,18 +34,32 @@ int xfs_rmap_lookup_le( struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat) { + int get_stat = 0; + int error; + cur->bc_rec.r.rm_startblock = bno; - cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_blockcount = 0; cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + if (error || !(*stat) || !irec) + return error; + + error = xfs_rmap_get_rec(cur, irec, &get_stat); + if (error) + return error; + if (!get_stat) + return -EFSCORRUPTED; + + return 0; } /* @@ -251,7 +265,6 @@ out_bad_rec: struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; - int *stat; }; /* For each rmap given, figure out if it matches the key we want. */ @@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper( * return a match with the same owner and adjacent physical and logical * block ranges. */ -int +STATIC int xfs_rmap_find_left_neighbor( struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; *stat = 0; @@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor( info.high.rm_flags = flags; info.high.rm_blockcount = 0; info.irec = irec; - info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_find_left_neighbor_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * extent conversion and remap operations run a bit faster if the + * physical extents aren't being shared. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* For each rmap given, figure out if it matches the key we want. */ @@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; info.high.rm_startblock = bno; @@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range( info.high.rm_blockcount = 0; *stat = 0; info.irec = irec; - info.stat = stat; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_lookup_le_range_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, + bno, 0, owner, offset, flags); + + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * scrub run much faster on most filesystems because bmbt records are + * usually an exact match for rmap records. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_lookup_le_range_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* @@ -510,7 +570,7 @@ xfs_rmap_unmap( * for the AG headers at rm_startblock == 0 created by mkfs/growfs that * will not ever be removed from the tree. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -518,13 +578,6 @@ xfs_rmap_unmap( goto out_error; } - error = xfs_rmap_get_rec(cur, <rec, &i); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -786,18 +839,11 @@ xfs_rmap_map( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, <rec, &have_lt); if (error) goto out_error; if (have_lt) { - error = xfs_rmap_get_rec(cur, <rec, &have_lt); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, have_lt != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -1022,7 +1068,7 @@ xfs_rmap_convert( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1030,13 +1076,6 @@ xfs_rmap_convert( goto done; } - error = xfs_rmap_get_rec(cur, &PREV, &i); - if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, @@ -1140,7 +1179,7 @@ xfs_rmap_convert( _RET_IP_); /* reset the cursor back to PREV */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2677,7 +2716,7 @@ xfs_rmap_record_exists( ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || (flags & XFS_RMAP_BMBT_BLOCK)); - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, &has_record); if (error) return error; @@ -2686,14 +2725,6 @@ xfs_rmap_record_exists( return 0; } - error = xfs_rmap_get_rec(cur, &irec, &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; - } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b718ebeda372..54741a591a17 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, - unsigned int flags, int *stat); + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, int *stat); @@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); -int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, - struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5740ba664867..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1008,6 +1008,7 @@ xfs_rtfree_extent( /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, @@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range( void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; xfs_rtblock_t high_key; @@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } @@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..a20cade590e9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -31,15 +31,66 @@ */ /* + * Check that all the V4 feature bits that the V5 filesystem format requires are + * correctly set. + */ +static bool +xfs_sb_validate_v5_features( + struct xfs_sb *sbp) +{ + /* We must not have any unknown V4 feature bits set */ + if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) + return false; + + /* + * The CRC bit is considered an invalid V4 flag, so we have to add it + * manually to the OKBITS mask. + */ + if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | + XFS_SB_VERSION2_CRCBIT)) + return false; + + /* Now check all the required V4 feature flags are set. */ + +#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_MOREBITSBIT) + +#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_CRCBIT) + + if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) + return false; + if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) + return false; + return true; +} + +/* * We support all XFS versions newer than a v4 superblock with V2 directories. */ bool xfs_sb_good_version( struct xfs_sb *sbp) { - /* all v5 filesystems are supported */ + /* + * All v5 filesystems are supported, but we must check that all the + * required v4 feature flags are enabled correctly as the code checks + * those flags and not for v5 support. + */ if (xfs_sb_is_v5(sbp)) - return true; + return xfs_sb_validate_v5_features(sbp); + + /* We must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) @@ -51,12 +102,6 @@ xfs_sb_good_version( if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; - /* And must not have any unknown v4 feature bits set */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - /* It's a supported v4 filesystem */ return true; } @@ -70,6 +115,8 @@ xfs_sb_version_to_features( /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) + features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) @@ -124,6 +171,9 @@ xfs_sb_version_to_features( features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) + features |= XFS_FEAT_NREXT64; + return features; } @@ -262,12 +312,15 @@ xfs_validate_sb_common( bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, +"Superblock has bad magic number 0x%x. Not an XFS filesystem?", + be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); + xfs_warn(mp, +"Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } @@ -911,6 +964,11 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); @@ -1135,6 +1193,8 @@ xfs_fs_geometry( } else { geo->logsectsize = BBSIZE; } + if (xfs_has_large_extent_counts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 25c4cab58851..c4381388c0c1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, /* * Values for t_flags. */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ -#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ +/* Transaction needs to be logged */ +#define XFS_TRANS_DIRTY (1u << 0) +/* Superblock is dirty and needs to be logged */ +#define XFS_TRANS_SB_DIRTY (1u << 1) +/* Transaction took a permanent log reservation */ +#define XFS_TRANS_PERM_LOG_RES (1u << 2) +/* Synchronous transaction commit needed */ +#define XFS_TRANS_SYNC (1u << 3) +/* Transaction can use reserve block pool */ +#define XFS_TRANS_RESERVE (1u << 4) +/* Transaction should avoid VFS level superblock write accounting */ +#define XFS_TRANS_NO_WRITECOUNT (1u << 5) +/* Transaction has freed blocks returned to it's reservation */ +#define XFS_TRANS_RES_FDBLKS (1u << 6) +/* Transaction contains an intent done log item */ +#define XFS_TRANS_HAS_INTENT_DONE (1u << 7) + /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6f83d9b306ee..e9913c2c5a24 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -56,15 +56,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). With reflink, there are four trees (refcountbt). The number of - * blocks reserved is based on the formula: + * (rmapbt). The number of blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ uint -xfs_allocfree_log_count( +xfs_allocfree_block_count( struct xfs_mount *mp, uint num_ops) { @@ -73,13 +72,24 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_has_reflink(mp)) - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } /* + * Per-extent log reservation for refcount btree changes. These are never done + * in the same transaction as an allocation or a free, so we compute them + * separately. + */ +static unsigned int +xfs_refcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_refc_maxlevels - 1); +} + +/* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into * the amount of space they use on disk. @@ -136,7 +146,7 @@ xfs_calc_inobt_res( { return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res( { uint res, size = 0; - res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ @@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res( /* * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap - * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, - * as well as the realtime summary block. + * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime + * extents, as well as the realtime summary block. */ static unsigned int -xfs_rtalloc_log_count( +xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); unsigned int rtbmp_bytes; - rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; } @@ -233,6 +243,28 @@ xfs_rtalloc_log_count( * register overflow from temporaries in the calculations. */ +/* + * Compute the log reservation required to handle the refcount update + * transaction. Refcount updates are always done via deferred log items. + * + * This is calculated as: + * Data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +static unsigned int +xfs_calc_refcountbt_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); +} /* * In a write transaction we can allocate a maximum of 2 @@ -247,7 +279,7 @@ xfs_rtalloc_log_count( * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size - * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 1 block * the allocation btrees: 2 trees * (2 * max depth - 1) * block size * And the bmap_finish transaction can free bmap blocks in a join (t3): @@ -255,34 +287,65 @@ xfs_rtalloc_log_count( * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_write_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz); } else { t2 = 0; } t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * two refcountbt splits for each transaction. The codebase runs + * refcountbt updates in separate transactions now, so to compute the + * minimum log size, add the refcountbtree splits back to t1 and t3 and + * do not account them separately as t4. Reflink did not support + * realtime when the reservations were established, so no adjustment to + * t2 is needed. + */ + if (for_minlogsize) { + unsigned int adj = 0; + + if (xfs_has_reflink(mp)) + adj = xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 2), + blksz); + t1 += adj; + t3 += adj; + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 1); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_write_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp, true); } /* @@ -299,33 +362,62 @@ xfs_calc_write_reservation( * the agf for each of the ags: 2 * sector size * the agfl for each of the ags: 2 * sector size * the super block to reflect the freed blocks: sector size - * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_itruncate_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); } else { t3 = 0; } - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * four refcountbt splits in the same transaction as bnobt/cntbt + * updates. The codebase runs refcountbt updates in separate + * transactions now, so to compute the minimum log size, add the + * refcount btree splits back here and do not compute them separately + * as t4. Reflink did not support realtime when the reservations were + * established, so do not adjust t3. + */ + if (for_minlogsize) { + if (xfs_has_reflink(mp)) + t2 += xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 4), + blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 2); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_itruncate_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_itruncate_reservation(mp, true); } /* @@ -349,7 +441,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -389,7 +481,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -427,7 +519,7 @@ xfs_calc_remove_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -572,7 +664,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -670,7 +762,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void) */ STATIC uint xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - return xfs_calc_write_reservation(mp) + + return xfs_calc_write_reservation(mp, for_minlogsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } +unsigned int +xfs_calc_qm_dqalloc_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_qm_dqalloc_reservation(mp, true); +} + /* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size @@ -814,36 +914,18 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; - - /* - * In the early days of rmap+reflink, we always set the rmap maxlevels - * to 9 even if the AG was small enough that it would never grow to - * that height. Transaction reservation sizes influence the minimum - * log size calculation, which influences the size of the log that mkfs - * creates. Use the old value here to ensure that newly formatted - * small filesystems will mount on older kernels. - */ - if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) - mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + int logcount_adj = 0; /* * The following transactions are logged in physical format and * require a permanent reservation on space. */ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_itruncate.tr_logcount = - XFS_ITRUNCATE_LOG_COUNT_REFLINK; - else - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -899,11 +981,9 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, + false); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* @@ -930,6 +1010,19 @@ xfs_trans_resv_calc( resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); - /* Put everything back the way it was. This goes at the end. */ - mp->m_rmap_maxlevels = rmap_maxlevels; + /* + * Add one logcount for BUI items that appear with rmap or reflink, + * one logcount for refcount intent items, and one logcount for rmap + * intent items. + */ + if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) + logcount_adj++; + if (xfs_has_reflink(mp)) + logcount_adj++; + if (xfs_has_rmapbt(mp)) + logcount_adj++; + + resp->tr_itruncate.tr_logcount += logcount_adj; + resp->tr_write.tr_logcount += logcount_adj; + resp->tr_qm_dqalloc.tr_logcount += logcount_adj; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index fc4e9b369a3a..0554b9d775d2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,7 +73,6 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -83,13 +82,24 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 +/* + * Original log operation counts were overestimated in the early days of + * reflink. These are retained here purely for minimum log size calculations + * and must not be used for runtime reservations. + */ +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 + void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); -uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); + +unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b6da06b40989..373f64a492a4 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ -typedef int32_t xfs_extnum_t; /* # of extents in a file */ -typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef uint64_t xfs_extnum_t; /* # of extents in a file */ +typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ @@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t; #define NULLAGINO ((xfs_agino_t)-1) /* - * Max values for extlen, extnum, aextnum. - */ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ - -/* * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4cbbc346f60..285995ba3947 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -133,29 +133,13 @@ xchk_bmap_get_rmap( if (info->is_shared) { error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, owner, offset, rflags, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - goto out; + } else { + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); } - - /* - * Otherwise, use the (faster) regular lookup. - */ - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner, - offset, rflags, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) return false; - if (!has_rmap) - goto out; - error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - -out: if (!has_rmap) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -350,7 +334,7 @@ xchk_bmap_iextent( irec->br_startoff); /* Make sure the extent points to a valid place. */ - if (irec->br_blockcount > MAXEXTLEN) + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (info->is_rt && diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index bf1f3607d0b6..97b54ac3075f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -23,6 +23,8 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index eac15af7b08c..51820b40ab1c 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -232,7 +232,8 @@ xchk_dinode( size_t fork_recs; unsigned long long isize; uint64_t flags2; - uint32_t nextents; + xfs_extnum_t nextents; + xfs_extnum_t naextents; prid_t prid; uint16_t flags; uint16_t mode; @@ -390,8 +391,10 @@ xchk_dinode( xchk_inode_extsize(sc, dip, ino, mode, flags); + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + /* di_nextents */ - nextents = be32_to_cpu(dip->di_nextents); fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -411,7 +414,7 @@ xchk_dinode( /* di_forkoff */ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); - if (dip->di_anextents != 0 && dip->di_forkoff == 0) + if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) xchk_ino_set_corrupt(sc, ino); @@ -423,19 +426,18 @@ xchk_dinode( xchk_ino_set_corrupt(sc, ino); /* di_anextents */ - nextents = be16_to_cpu(dip->di_anextents); fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_aformat) { case XFS_DINODE_FMT_EXTENTS: - if (nextents > fork_recs) + if (naextents > fork_recs) xchk_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: - if (nextents <= fork_recs) + if (naextents <= fork_recs) xchk_ino_set_corrupt(sc, ino); break; default: - if (nextents != 0) + if (naextents != 0) xchk_ino_set_corrupt(sc, ino); } @@ -513,14 +515,14 @@ xchk_inode_xref_bmap( &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents < be32_to_cpu(dip->di_nextents)) + if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents != be16_to_cpu(dip->di_anextents)) + if (nextents != xfs_dfork_attr_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); /* Check nblocks against the inode. */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 8fa012057405..0a3bde64c675 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -40,6 +40,7 @@ xchk_setup_rt( /* Scrub a free extent record from the realtime bitmap. */ STATIC int xchk_rtbitmap_rec( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -48,10 +49,10 @@ xchk_rtbitmap_rec( xfs_rtblock_t startblock; xfs_rtblock_t blockcount; - startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; - if (!xfs_verify_rtext(sc->mp, startblock, blockcount)) + if (!xfs_verify_rtext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -114,7 +115,7 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 5c52ee869272..3df9c1782ead 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -10,12 +10,12 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_trans.h" #include <linux/posix_acl_xattr.h> diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index bb6abdcb265d..263404d0bfda 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else -static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu) +#define xfs_get_acl NULL +#define xfs_set_acl NULL +static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - return NULL; + return 0; } -# define xfs_set_acl NULL static inline void xfs_forget_acl(struct inode *inode, const char *name) { } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c new file mode 100644 index 000000000000..e8ac88d9fd14 --- /dev/null +++ b/fs/xfs/xfs_attr_item.c @@ -0,0 +1,824 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson <allison.henderson@oracle.com> + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_shared.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_item.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_trans_space.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +static const struct xfs_item_ops xfs_attri_item_ops; +static const struct xfs_item_ops xfs_attrd_item_ops; +static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip); + +static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attri_log_item, attri_item); +} + +STATIC void +xfs_attri_item_free( + struct xfs_attri_log_item *attrip) +{ + kmem_free(attrip->attri_item.li_lv_shadow); + kvfree(attrip); +} + +/* + * Freeing the attrip requires that we remove it from the AIL if it has already + * been placed there. However, the ATTRI may not yet have been placed in the + * AIL when called by xfs_attri_release() from ATTRD processing due to the + * ordering of committed vs unpin operations in bulk insert operations. Hence + * the reference count to ensure only the last caller frees the ATTRI. + */ +STATIC void +xfs_attri_release( + struct xfs_attri_log_item *attrip) +{ + ASSERT(atomic_read(&attrip->attri_refcount) > 0); + if (!atomic_dec_and_test(&attrip->attri_refcount)) + return; + + xfs_trans_ail_delete(&attrip->attri_item, 0); + xfs_attri_item_free(attrip); +} + +STATIC void +xfs_attri_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + + *nvecs += 2; + *nbytes += sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(attrip->attri_name_len); + + if (!attrip->attri_value_len) + return; + + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(attrip->attri_value_len); +} + +/* + * This is called to fill in the log iovecs for the given attri log + * item. We use 1 iovec for the attri_format_item, 1 for the name, and + * another for the value if it is present + */ +STATIC void +xfs_attri_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + attrip->attri_format.alfi_type = XFS_LI_ATTRI; + attrip->attri_format.alfi_size = 1; + + /* + * This size accounting must be done before copying the attrip into the + * iovec. If we do it after, the wrong size will be recorded to the log + * and we trip across assertion checks for bad region sizes later during + * the log recovery. + */ + + ASSERT(attrip->attri_name_len > 0); + attrip->attri_format.alfi_size++; + + if (attrip->attri_value_len > 0) + attrip->attri_format.alfi_size++; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, + &attrip->attri_format, + sizeof(struct xfs_attri_log_format)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, + attrip->attri_name, + attrip->attri_name_len); + if (attrip->attri_value_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, + attrip->attri_value, + attrip->attri_value_len); +} + +/* + * The unpin operation is the last place an ATTRI is manipulated in the log. It + * is either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the ATTRI transaction has been successfully committed to make + * it this far. Therefore, we expect whoever committed the ATTRI to either + * construct and commit the ATTRD or drop the ATTRD's reference in the event of + * error. Simply drop the log's ATTRI reference now that the log is done with + * it. + */ +STATIC void +xfs_attri_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + + +STATIC void +xfs_attri_item_release( + struct xfs_log_item *lip) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + +/* + * Allocate and initialize an attri item. Caller may allocate an additional + * trailing buffer for name and value + */ +STATIC struct xfs_attri_log_item * +xfs_attri_init( + struct xfs_mount *mp, + uint32_t name_len, + uint32_t value_len) + +{ + struct xfs_attri_log_item *attrip; + uint32_t buffer_size = name_len + value_len; + + if (buffer_size) { + /* + * This could be over 64kB in length, so we have to use + * kvmalloc() for this. But kvmalloc() utterly sucks, so we + * use own version. + */ + attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) + + buffer_size); + } else { + attrip = kmem_cache_alloc(xfs_attri_cache, + GFP_NOFS | __GFP_NOFAIL); + } + memset(attrip, 0, sizeof(struct xfs_attri_log_item)); + + attrip->attri_name_len = name_len; + if (name_len) + attrip->attri_name = ((char *)attrip) + + sizeof(struct xfs_attri_log_item); + else + attrip->attri_name = NULL; + + attrip->attri_value_len = value_len; + if (value_len) + attrip->attri_value = ((char *)attrip) + + sizeof(struct xfs_attri_log_item) + + name_len; + else + attrip->attri_value = NULL; + + xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, + &xfs_attri_item_ops); + attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip; + atomic_set(&attrip->attri_refcount, 2); + + return attrip; +} + +/* + * Copy an attr format buffer from the given buf, and into the destination attr + * format structure. + */ +STATIC int +xfs_attri_copy_format( + struct xfs_log_iovec *buf, + struct xfs_attri_log_format *dst_attr_fmt) +{ + struct xfs_attri_log_format *src_attr_fmt = buf->i_addr; + size_t len; + + len = sizeof(struct xfs_attri_log_format); + if (buf->i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; + } + + memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len); + return 0; +} + +static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attrd_log_item, attrd_item); +} + +STATIC void +xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) +{ + kmem_free(attrdp->attrd_item.li_lv_shadow); + kmem_free(attrdp); +} + +STATIC void +xfs_attrd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_attrd_log_format); +} + +/* + * This is called to fill in the log iovecs for the given attrd log item. We use + * only 1 iovec for the attrd_format, and we point that at the attr_log_format + * structure embedded in the attrd item. + */ +STATIC void +xfs_attrd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + attrdp->attrd_format.alfd_type = XFS_LI_ATTRD; + attrdp->attrd_format.alfd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT, + &attrdp->attrd_format, + sizeof(struct xfs_attrd_log_format)); +} + +/* + * The ATTRD is either committed or aborted if the transaction is canceled. If + * the transaction is canceled, drop our reference to the ATTRI and free the + * ATTRD. + */ +STATIC void +xfs_attrd_item_release( + struct xfs_log_item *lip) +{ + struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); + + xfs_attri_release(attrdp->attrd_attrip); + xfs_attrd_item_free(attrdp); +} + +static struct xfs_log_item * +xfs_attrd_item_intent( + struct xfs_log_item *lip) +{ + return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; +} + +/* + * Performs one step of an attribute update intent and marks the attrd item + * dirty.. An attr operation may be a set or a remove. Note that the + * transaction is marked dirty regardless of whether the operation succeeds or + * fails to support the ATTRI/ATTRD lifecycle rules. + */ +STATIC int +xfs_xattri_finish_update( + struct xfs_attr_item *attr, + struct xfs_attrd_log_item *attrdp) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + error = -EAGAIN; +out: + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the ATTRI and frees the ATTRD + * 2.) shuts down the filesystem + */ + args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; + + /* + * attr intent/done items are null when logged attributes are disabled + */ + if (attrdp) + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + return error; +} + +/* Log an attr to the intent item. */ +STATIC void +xfs_attr_log_item( + struct xfs_trans *tp, + struct xfs_attri_log_item *attrip, + struct xfs_attr_item *attr) +{ + struct xfs_attri_log_format *attrp; + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); + + /* + * At this point the xfs_attr_item has been constructed, and we've + * created the log intent. Fill in the attri log item and log format + * structure with fields from this xfs_attr_item + */ + attrp = &attrip->attri_format; + attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + attrp->alfi_op_flags = attr->xattri_op_flags; + attrp->alfi_value_len = attr->xattri_da_args->valuelen; + attrp->alfi_name_len = attr->xattri_da_args->namelen; + attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter; + + memcpy(attrip->attri_name, attr->xattri_da_args->name, + attr->xattri_da_args->namelen); + memcpy(attrip->attri_value, attr->xattri_da_args->value, + attr->xattri_da_args->valuelen); + attrip->attri_name_len = attr->xattri_da_args->namelen; + attrip->attri_value_len = attr->xattri_da_args->valuelen; +} + +/* Get an ATTRI. */ +static struct xfs_log_item * +xfs_attr_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_attri_log_item *attrip; + struct xfs_attr_item *attr; + + ASSERT(count == 1); + + if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) + return NULL; + + /* + * Each attr item only performs one attribute operation at a time, so + * this is a list of one + */ + list_for_each_entry(attr, items, xattri_list) { + attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen, + attr->xattri_da_args->valuelen); + if (attrip == NULL) + return NULL; + + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); + } + + return &attrip->attri_item; +} + +/* Process an attr. */ +STATIC int +xfs_attr_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_attr_item *attr; + struct xfs_attrd_log_item *done_item = NULL; + int error; + + attr = container_of(item, struct xfs_attr_item, xattri_list); + if (done) + done_item = ATTRD_ITEM(done); + + /* + * Always reset trans after EAGAIN cycle + * since the transaction is new + */ + attr->xattri_da_args->trans = tp; + + error = xfs_xattri_finish_update(attr, done_item); + if (error != -EAGAIN) + kmem_free(attr); + + return error; +} + +/* Abort all pending ATTRs. */ +STATIC void +xfs_attr_abort_intent( + struct xfs_log_item *intent) +{ + xfs_attri_release(ATTRI_ITEM(intent)); +} + +/* Cancel an attr */ +STATIC void +xfs_attr_cancel_item( + struct list_head *item) +{ + struct xfs_attr_item *attr; + + attr = container_of(item, struct xfs_attr_item, xattri_list); + kmem_free(attr); +} + +STATIC xfs_lsn_t +xfs_attri_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + + /* + * The attrip refers to xfs_attr_item memory to log the name and value + * with the intent item. This already occurred when the intent was + * committed so these fields are no longer accessed. Clear them out of + * caution since we're about to free the xfs_attr_item. + */ + attrip->attri_name = NULL; + attrip->attri_value = NULL; + + /* + * The ATTRI is logged only once and cannot be moved in the log, so + * simply return the lsn at which it's been logged. + */ + return lsn; +} + +STATIC bool +xfs_attri_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; +} + +/* Is this recovered ATTRI format ok? */ +static inline bool +xfs_attri_validate( + struct xfs_mount *mp, + struct xfs_attri_log_format *attrp) +{ + unsigned int op = attrp->alfi_op_flags & + XFS_ATTR_OP_FLAGS_TYPE_MASK; + + if (attrp->__pad != 0) + return false; + + /* alfi_op_flags should be either a set or remove */ + switch (op) { + case XFS_ATTR_OP_FLAGS_SET: + case XFS_ATTR_OP_FLAGS_REPLACE: + case XFS_ATTR_OP_FLAGS_REMOVE: + break; + default: + return false; + } + + if (attrp->alfi_value_len > XATTR_SIZE_MAX) + return false; + + if ((attrp->alfi_name_len > XATTR_NAME_MAX) || + (attrp->alfi_name_len == 0)) + return false; + + return xfs_verify_ino(mp, attrp->alfi_ino); +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attri_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_item *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res tres; + struct xfs_attri_log_format *attrp; + int error, ret = 0; + int total; + int local; + struct xfs_attrd_log_item *done_item = NULL; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + if (error) + return error; + + attr = kmem_zalloc(sizeof(struct xfs_attr_item) + + sizeof(struct xfs_da_args), KM_NOFS); + args = (struct xfs_da_args *)(attr + 1); + + attr->xattri_da_args = args; + attr->xattri_op_flags = attrp->alfi_op_flags; + + args->dp = ip; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->name = attrip->attri_name; + args->namelen = attrp->alfi_name_len; + args->hashval = xfs_da_hashname(args->name, args->namelen); + args->attr_filter = attrp->alfi_attr_flags; + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; + + switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) { + case XFS_ATTR_OP_FLAGS_SET: + case XFS_ATTR_OP_FLAGS_REPLACE: + args->value = attrip->attri_value; + args->valuelen = attrp->alfi_value_len; + args->total = xfs_attr_calc_size(args, &local); + if (xfs_inode_hasattr(args->dp)) + attr->xattri_dela_state = xfs_attr_init_replace_state(args); + else + attr->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTR_OP_FLAGS_REMOVE: + if (!xfs_inode_hasattr(args->dp)) + goto out; + attr->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out; + } + + xfs_init_attr_trans(args, &tres, &total); + error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + if (error) + goto out; + + args->trans = tp; + done_item = xfs_trans_get_attrd(tp, attrip); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + ret = xfs_xattri_finish_update(attr, done_item); + if (ret == -EAGAIN) { + /* There's more work to do, so add it to this transaction */ + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); + } else + error = ret; + + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + +out_unlock: + if (attr->xattri_leaf_bp) + xfs_buf_relse(attr->xattri_leaf_bp); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); +out: + if (ret != -EAGAIN) + kmem_free(attr); + return error; +} + +/* Re-log an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_attri_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_attrd_log_item *attrdp; + struct xfs_attri_log_item *old_attrip; + struct xfs_attri_log_item *new_attrip; + struct xfs_attri_log_format *new_attrp; + struct xfs_attri_log_format *old_attrp; + + old_attrip = ATTRI_ITEM(intent); + old_attrp = &old_attrip->attri_format; + + tp->t_flags |= XFS_TRANS_DIRTY; + attrdp = xfs_trans_get_attrd(tp, old_attrip); + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len, + old_attrp->alfi_value_len); + new_attrp = &new_attrip->attri_format; + + new_attrp->alfi_ino = old_attrp->alfi_ino; + new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; + new_attrp->alfi_value_len = old_attrp->alfi_value_len; + new_attrp->alfi_name_len = old_attrp->alfi_name_len; + new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags; + + memcpy(new_attrip->attri_name, old_attrip->attri_name, + new_attrip->attri_name_len); + + if (new_attrip->attri_value_len > 0) + memcpy(new_attrip->attri_value, old_attrip->attri_value, + new_attrip->attri_value_len); + + xfs_trans_add_item(tp, &new_attrip->attri_item); + set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); + + return &new_attrip->attri_item; +} + +STATIC int +xlog_recover_attri_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_attri_log_item *attrip; + struct xfs_attri_log_format *attri_formatp; + int region = 0; + + attri_formatp = item->ri_buf[region].i_addr; + + /* Validate xfs_attri_log_format */ + if (!xfs_attri_validate(mp, attri_formatp)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* memory alloc failure will cause replay to abort */ + attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len, + attri_formatp->alfi_value_len); + if (attrip == NULL) + return -ENOMEM; + + error = xfs_attri_copy_format(&item->ri_buf[region], + &attrip->attri_format); + if (error) + goto out; + + region++; + memcpy(attrip->attri_name, item->ri_buf[region].i_addr, + attrip->attri_name_len); + + if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out; + } + + if (attrip->attri_value_len > 0) { + region++; + memcpy(attrip->attri_value, item->ri_buf[region].i_addr, + attrip->attri_value_len); + } + + /* + * The ATTRI has two references. One for the ATTRD and one for ATTRI to + * ensure it makes it into the AIL. Insert the ATTRI into the AIL + * directly and drop the ATTRI reference. Note that + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); + xfs_attri_release(attrip); + return 0; +out: + xfs_attri_item_free(attrip); + return error; +} + +/* + * This routine is called to allocate an "attr free done" log item. + */ +static struct xfs_attrd_log_item * +xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip) +{ + struct xfs_attrd_log_item *attrdp; + + ASSERT(tp != NULL); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + xfs_trans_add_item(tp, &attrdp->attrd_item); + return attrdp; +} + +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + if (!intent) + return NULL; + + return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, +}; + +/* + * This routine is called when an ATTRD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding ATTRI if + * it was still in the log. To do this it searches the AIL for the ATTRI with + * an id equal to that in the ATTRD format structure. If we find it we drop + * the ATTRD reference, which removes the ATTRI from the AIL and frees it. + */ +STATIC int +xlog_recover_attrd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_attrd_log_format *attrd_formatp; + + attrd_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_ATTRI, + attrd_formatp->alfd_alf_id); + return 0; +} + +static const struct xfs_item_ops xfs_attri_item_ops = { + .flags = XFS_ITEM_INTENT, + .iop_size = xfs_attri_item_size, + .iop_format = xfs_attri_item_format, + .iop_unpin = xfs_attri_item_unpin, + .iop_committed = xfs_attri_item_committed, + .iop_release = xfs_attri_item_release, + .iop_recover = xfs_attri_item_recover, + .iop_match = xfs_attri_item_match, + .iop_relog = xfs_attri_item_relog, +}; + +const struct xlog_recover_item_ops xlog_attri_item_ops = { + .item_type = XFS_LI_ATTRI, + .commit_pass2 = xlog_recover_attri_commit_pass2, +}; + +static const struct xfs_item_ops xfs_attrd_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, + .iop_size = xfs_attrd_item_size, + .iop_format = xfs_attrd_item_format, + .iop_release = xfs_attrd_item_release, + .iop_intent = xfs_attrd_item_intent, +}; + +const struct xlog_recover_item_ops xlog_attrd_item_ops = { + .item_type = XFS_LI_ATTRD, + .commit_pass2 = xlog_recover_attrd_commit_pass2, +}; diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h new file mode 100644 index 000000000000..c3b779f82adb --- /dev/null +++ b/fs/xfs/xfs_attr_item.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson <allison.henderson@oracle.com> + */ +#ifndef __XFS_ATTR_ITEM_H__ +#define __XFS_ATTR_ITEM_H__ + +/* kernel only ATTRI/ATTRD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * This is the "attr intention" log item. It is used to log the fact that some + * extended attribute operations need to be processed. An operation is + * currently either a set or remove. Set or remove operations are described by + * the xfs_attr_item which may be logged to this intent. + * + * During a normal attr operation, name and value point to the name and value + * fields of the caller's xfs_da_args structure. During a recovery, the name + * and value buffers are copied from the log, and stored in a trailing buffer + * attached to the xfs_attr_item until they are committed. They are freed when + * the xfs_attr_item itself is freed when the work is done. + */ +struct xfs_attri_log_item { + struct xfs_log_item attri_item; + atomic_t attri_refcount; + int attri_name_len; + int attri_value_len; + void *attri_name; + void *attri_value; + struct xfs_attri_log_format attri_format; +}; + +/* + * This is the "attr done" log item. It is used to log the fact that some attrs + * earlier mentioned in an attri item have been freed. + */ +struct xfs_attrd_log_item { + struct xfs_log_item attrd_item; + struct xfs_attri_log_item *attrd_attrip; + struct xfs_attrd_log_format attrd_format; +}; + +#endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d1e5134cebe..90a14e85e76d 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 761dde155099..51f66e982484 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -39,6 +39,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { + kmem_free(buip->bui_item.li_lv_shadow); kmem_cache_free(xfs_bui_cache, buip); } @@ -54,10 +55,11 @@ xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } + if (!atomic_dec_and_test(&buip->bui_refcount)) + return; + + xfs_trans_ail_delete(&buip->bui_item, 0); + xfs_bui_item_free(buip); } @@ -198,14 +200,24 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); + kmem_free(budp->bud_item.li_lv_shadow); kmem_cache_free(xfs_bud_cache, budp); } +static struct xfs_log_item * +xfs_bud_item_intent( + struct xfs_log_item *lip) +{ + return &BUD_ITEM(lip)->bud_buip->bui_item; +} + static const struct xfs_item_ops xfs_bud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_bud_item_size, .iop_format = xfs_bud_item_format, .iop_release = xfs_bud_item_release, + .iop_intent = xfs_bud_item_intent, }; static struct xfs_bud_log_item * @@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update( * 1.) releases the BUI and frees the BUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; @@ -506,6 +518,8 @@ xfs_bui_item_recover( iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; @@ -584,6 +598,7 @@ xfs_bui_item_relog( } static const struct xfs_item_ops xfs_bui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index eb2e387ba528..52be58372c63 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -119,14 +119,14 @@ retry: */ ralen = ap->length / mp->m_sb.sb_rextsize; /* - * If the old value was close enough to MAXEXTLEN that + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that * we rounded up to it, cut it back so it's valid again. * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) + ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; /* * Lock out modifications to both the RT bitmap and summary inodes @@ -839,9 +839,11 @@ xfs_alloc_file_space( * count, hence we need to limit the number of blocks we are * trying to reserve to avoid an overflow. We can't allocate * more than @nimaps extents, and an extent is limited on disk - * to MAXEXTLEN (21 bits), so use that to enforce the limit. + * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the + * limit. */ - resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + resblks = min_t(xfs_fileoff_t, (e - s), + (XFS_MAX_BMBT_EXTLEN * nimaps)); if (unlikely(rt)) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; @@ -857,6 +859,9 @@ xfs_alloc_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto error; @@ -912,6 +917,8 @@ xfs_unmap_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1193,6 +1200,8 @@ xfs_insert_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } @@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e11e9ef2338f..4d8a6aece995 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -8,15 +8,18 @@ /* kernel only definitions */ +struct xfs_buf; +struct xfs_mount; + /* buf log item flags */ -#define XFS_BLI_HOLD 0x01 -#define XFS_BLI_DIRTY 0x02 -#define XFS_BLI_STALE 0x04 -#define XFS_BLI_LOGGED 0x08 -#define XFS_BLI_INODE_ALLOC_BUF 0x10 -#define XFS_BLI_STALE_INODE 0x20 -#define XFS_BLI_INODE_BUF 0x40 -#define XFS_BLI_ORDERED 0x80 +#define XFS_BLI_HOLD (1u << 0) +#define XFS_BLI_DIRTY (1u << 1) +#define XFS_BLI_STALE (1u << 2) +#define XFS_BLI_LOGGED (1u << 3) +#define XFS_BLI_INODE_ALLOC_BUF (1u << 4) +#define XFS_BLI_STALE_INODE (1u << 5) +#define XFS_BLI_INODE_BUF (1u << 6) +#define XFS_BLI_ORDERED (1u << 7) #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -28,11 +31,6 @@ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ { XFS_BLI_ORDERED, "ORDERED" } - -struct xfs_buf; -struct xfs_mount; -struct xfs_buf_log_item; - /* * This is the in core log item structure used to track information * needed to log buffers. It tracks how many times the lock has been diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5afedcbc78c7..5a6c3c3c4de2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer( res->timer = xfs_dquot_set_timeout(mp, ktime_get_real_seconds() + qlim->time); } else { - if (res->timer == 0) - res->warnings = 0; - else - res->timer = 0; + res->timer = 0; } } @@ -322,6 +319,9 @@ xfs_dquot_disk_alloc( error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, quotip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; @@ -589,10 +589,6 @@ xfs_dquot_from_disk( dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); - dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); - dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); - dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); @@ -634,9 +630,9 @@ xfs_dquot_to_disk( ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); - ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); - ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); - ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + ddqp->d_bwarns = 0; + ddqp->d_iwarns = 0; + ddqp->d_rtbwarns = 0; ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 6b5e3cf40c8b..80c8f851a2f3 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -44,14 +44,6 @@ struct xfs_dquot_res { * in seconds since the Unix epoch. */ time64_t timer; - - /* - * For root dquots, this is the maximum number of warnings that will - * be issued for this quota type. Otherwise, this is the number of - * warnings issued against this quota. Note that none of this is - * implemented. - */ - xfs_qwarncnt_t warnings; }; static inline bool diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 749fd18c4f32..296faa41d81d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REDUCE_MAX_IEXTENTS, XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, + XFS_RANDOM_LARP, + XFS_RANDOM_DA_LEAF_SPLIT, + XFS_RANDOM_ATTR_LEAF_TO_NODE, }; struct xfs_errortag_attr { @@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); +XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); +XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); +XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), + XFS_ERRORTAG_ATTR_LIST(larp), + XFS_ERRORTAG_ATTR_LIST(da_leaf_split), + XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5735d5ea87ee..5191e9145e55 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); * XFS panic tags -- allow a call to xfs_alert_tag() be turned into * a panic by setting xfs_panic_mask in a sysctl. */ -#define XFS_NO_PTAG 0 -#define XFS_PTAG_IFLUSH 0x00000001 -#define XFS_PTAG_LOGRES 0x00000002 -#define XFS_PTAG_AILDELETE 0x00000004 -#define XFS_PTAG_ERROR_REPORT 0x00000008 -#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 -#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 -#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 -#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 -#define XFS_PTAG_VERIFIER_ERROR 0x00000100 +#define XFS_NO_PTAG 0u +#define XFS_PTAG_IFLUSH (1u << 0) +#define XFS_PTAG_LOGRES (1u << 1) +#define XFS_PTAG_AILDELETE (1u << 2) +#define XFS_PTAG_ERROR_REPORT (1u << 3) +#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4) +#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5) +#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6) +#define XFS_PTAG_FSBLOCK_ZERO (1u << 7) +#define XFS_PTAG_VERIFIER_ERROR (1u << 8) #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0e50f2c9348e..765be054dffe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -58,10 +58,11 @@ xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } + if (!atomic_dec_and_test(&efip->efi_refcount)) + return; + + xfs_trans_ail_delete(&efip->efi_item, 0); + xfs_efi_item_free(efip); } /* @@ -306,11 +307,20 @@ xfs_efd_item_release( xfs_efd_item_free(efdp); } +static struct xfs_log_item * +xfs_efd_item_intent( + struct xfs_log_item *lip) +{ + return &EFD_ITEM(lip)->efd_efip->efi_item; +} + static const struct xfs_item_ops xfs_efd_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_release = xfs_efd_item_release, + .iop_intent = xfs_efd_item_intent, }; /* @@ -380,7 +390,7 @@ xfs_trans_free_extent( * 1.) releases the EFI and frees the EFD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; @@ -688,6 +698,7 @@ xfs_efi_item_relog( } static const struct xfs_item_ops xfs_efi_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 85c412107a10..a60632ecc3f0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -310,7 +310,7 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - int *iolock) + unsigned int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -513,7 +513,7 @@ xfs_file_dio_write_aligned( struct kiocb *iocb, struct iov_iter *from) { - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; ret = xfs_ilock_iocb(iocb, iolock); @@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned( { size_t isize = i_size_read(VFS_I(ip)); size_t count = iov_iter_count(from); - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; ssize_t ret; @@ -655,7 +655,7 @@ xfs_file_dax_write( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; + unsigned int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; loff_t pos; @@ -694,13 +694,11 @@ xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; bool cleared_space = false; - int iolock; + unsigned int iolock; if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; @@ -767,9 +765,7 @@ xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); @@ -1167,12 +1163,10 @@ xfs_file_open( struct inode *inode, struct file *file) { - if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) - return -EFBIG; if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - return 0; + return generic_file_open(inode, file); } STATIC int @@ -1181,7 +1175,7 @@ xfs_dir_open( struct file *file) { struct xfs_inode *ip = XFS_I(inode); - int mode; + unsigned int mode; int error; error = xfs_file_open(inode, file); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 6a3ce0f6dc9e..be9bcf8a1f99 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -128,11 +128,12 @@ xfs_filestream_pick_ag( if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); if (err) { - xfs_perag_put(pag); - if (err != -EAGAIN) + if (err != -EAGAIN) { + xfs_perag_put(pag); return err; + } /* Couldn't lock the AGF, skip this AG. */ - continue; + goto next_ag; } } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10e1cb71439e..bb23199f65c3 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -450,11 +450,11 @@ xfs_getfsmap_logdev( /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; @@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; - error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; @@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( info->last = true; ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); - error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); if (error) goto err; err: diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 68f74549fa22..888839e75d11 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -349,10 +349,7 @@ xfs_fs_counts( cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); - - spin_lock(&mp->m_sb_lock); - cnt->freertx = mp->m_sb.sb_frextents; - spin_unlock(&mp->m_sb_lock); + cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); } /* @@ -512,7 +509,7 @@ xfs_fs_goingdown( void xfs_do_force_shutdown( struct xfs_mount *mp, - int flags, + uint32_t flags, char *fname, int lnnum) { diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..4d0a98f920ca 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = { #endif #ifdef DEBUG .pwork_threads = -1, /* automatic thread detection */ + .larp = false, /* log attribute replay */ #endif }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bffd6eb0b298..5269354b1b69 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - uint64_t freertx; if (!XFS_IS_REALTIME_INODE(ip)) return false; - freertx = READ_ONCE(mp->m_sb.sb_frextents); - return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; + if (__percpu_counter_compare(&mp->m_frextents, + mp->m_low_rtexts[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + return false; } #else # define xfs_inodegc_want_queue_rt_file(ip) (false) diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 508e184e3b8f..b05314d48176 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -63,6 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { + kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 39ae53efb3ab..b2879870a17e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok( * parent locking. Care must be taken to ensure we don't overrun the subclass * storage fields in the class mask we build. */ -static inline int -xfs_lock_inumorder(int lock_mode, int subclass) +static inline uint +xfs_lock_inumorder( + uint lock_mode, + uint subclass) { - int class = 0; + uint class = 0; ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | XFS_ILOCK_RTSUM))); @@ -464,7 +466,10 @@ xfs_lock_inodes( int inodes, uint lock_mode) { - int attempts = 0, i, j, try_lock; + int attempts = 0; + uint i; + int j; + bool try_lock; struct xfs_log_item *lp; /* @@ -489,9 +494,9 @@ xfs_lock_inodes( } else if (lock_mode & XFS_MMAPLOCK_EXCL) ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); - try_lock = 0; - i = 0; again: + try_lock = false; + i = 0; for (; i < inodes; i++) { ASSERT(ips[i]); @@ -506,7 +511,7 @@ again: for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = &ips[j]->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) - try_lock++; + try_lock = true; } } @@ -546,8 +551,6 @@ again: if ((attempts % 5) == 0) { delay(1); /* Don't just spin the CPU */ } - i = 0; - try_lock = 0; goto again; } } @@ -1024,11 +1027,6 @@ xfs_create( xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry @@ -1242,11 +1240,6 @@ xfs_link( if (error) goto std_return; - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto error_return; - /* * If we are using project inheritance, we only allow hard link * creation in our tree when the project IDs are the same; else @@ -3212,35 +3205,6 @@ retry: /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. - * - * Extent count overflow check: - * - * From the perspective of src_dp, a rename operation is essentially a - * directory entry remove operation. Hence the only place where we check - * for extent count overflow for src_dp is in - * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns - * -ENOSPC when it detects a possible extent count overflow and in - * response, the higher layers of directory handling code do the - * following: - * 1. Data/Free blocks: XFS lets these blocks linger until a - * future remove operation removes them. - * 2. Dabtree blocks: XFS swaps the blocks with the last block in the - * Leaf space and unmaps the last block. - * - * For target_dp, there are two cases depending on whether the - * destination directory entry exists or not. - * - * When destination directory entry does not exist (i.e. target_ip == - * NULL), extent count overflow check is performed only when transaction - * has a non-zero sized space reservation associated with it. With a - * zero-sized space reservation, XFS allows a rename operation to - * continue only when the directory has sufficient free space in its - * data/leaf/free space blocks to hold the new entry. - * - * When destination directory entry exists (i.e. target_ip != NULL), all - * we need to do is change the inode number associated with the already - * existing entry. Hence there is no need to perform an extent count - * overflow check. */ if (target_ip == NULL) { /* @@ -3251,12 +3215,6 @@ retry: error = xfs_dir_canenter(tp, target_dp, target_name); if (error) goto out_trans_cancel; - } else { - error = xfs_iext_count_may_overflow(target_dp, - XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; } } else { /* @@ -3424,18 +3382,12 @@ retry: * inode number of the whiteout inode rather than removing it * altogether. */ - if (wip) { + if (wip) error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); - } else { - /* - * NOTE: We don't need to check for extent count overflow here - * because the dir remove name code will leave the dir block in - * place if the extent count would overflow. - */ + else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); - } if (error) goto out_trans_cancel; @@ -3517,8 +3469,8 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: detected corrupt incore inode %Lu, " - "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, + "%s: detected corrupt incore inode %llu, " + "total extents = %llu nblocks = %lld, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_nblocks, ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 740ab13d1aa2..7be6f8e705ab 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; +} + /* * Return the buftarg used for data allocations on a given inode. */ @@ -278,12 +283,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ -#define XFS_IOLOCK_EXCL (1<<0) -#define XFS_IOLOCK_SHARED (1<<1) -#define XFS_ILOCK_EXCL (1<<2) -#define XFS_ILOCK_SHARED (1<<3) -#define XFS_MMAPLOCK_EXCL (1<<4) -#define XFS_MMAPLOCK_SHARED (1<<5) +#define XFS_IOLOCK_EXCL (1u << 0) +#define XFS_IOLOCK_SHARED (1u << 1) +#define XFS_ILOCK_EXCL (1u << 2) +#define XFS_ILOCK_SHARED (1u << 3) +#define XFS_MMAPLOCK_EXCL (1u << 4) +#define XFS_MMAPLOCK_SHARED (1u << 5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ @@ -350,19 +355,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) */ #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_MAX_SUBCLASS 3 -#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000u #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 #define XFS_MMAPLOCK_MAX_SUBCLASS 3 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u #define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) -#define XFS_ILOCK_RTBITMAP_VAL 6 -#define XFS_ILOCK_RTSUM_VAL 7 -#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_RTBITMAP_VAL 6u +#define XFS_ILOCK_RTSUM_VAL 7u +#define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9e6ef55cf29e..721def0639fd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - *nbytes += roundup(ip->i_df.if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes); *nvecs += 1; } break; @@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes); *nvecs += 1; } break; @@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, data_bytes); - ilf->ilf_dsize = (unsigned)data_bytes; + ip->i_df.if_u1.if_data, + ip->i_df.if_bytes); + ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; @@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, - data_bytes); - ilf->ilf_asize = (unsigned)data_bytes; + ip->i_afp->if_bytes); + ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; @@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode( } } +static inline void +xfs_inode_to_log_dinode_iext_counters( + struct xfs_inode *ip, + struct xfs_log_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_nrext64_pad = 0; + } else { + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(ip->i_afp); + } +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -374,7 +378,6 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad, 0, sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); @@ -386,8 +389,6 @@ xfs_inode_to_log_dinode( to->di_size = ip->i_disk_size; to->di_nblocks = ip->i_nblocks; to->di_extsize = ip->i_extsize; - to->di_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = ip->i_diflags; @@ -407,11 +408,14 @@ xfs_inode_to_log_dinode( to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_log_dinode_iext_counters(ip, to); } /* diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 239dd2e3384e..d28ffaebd067 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts( return ts; } +static inline bool xfs_log_dinode_has_large_extent_counts( + const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_NREXT64); +} + +static inline void +xfs_log_dinode_to_disk_iext_counters( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + if (xfs_log_dinode_has_large_extent_counts(from)) { + to->di_big_nextents = cpu_to_be64(from->di_big_nextents); + to->di_big_anextents = cpu_to_be32(from->di_big_anextents); + to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); + } else { + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + } + +} + STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, @@ -158,7 +181,6 @@ xfs_log_dinode_to_disk( to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); @@ -167,8 +189,6 @@ xfs_log_dinode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); to->di_forkoff = from->di_forkoff; to->di_aformat = from->di_aformat; to->di_dmevmask = cpu_to_be32(from->di_dmevmask); @@ -184,12 +204,66 @@ xfs_log_dinode_to_disk( to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_flushiter = cpu_to_be16(from->di_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_log_dinode_to_disk_iext_counters(from, to); +} + +STATIC int +xlog_dinode_verify_extent_counts( + struct xfs_mount *mp, + struct xfs_log_dinode *ldip) +{ + xfs_extnum_t nextents; + xfs_aextnum_t anextents; + + if (xfs_log_dinode_has_large_extent_counts(ldip)) { + if (!xfs_has_large_extent_counts(mp) || + (ldip->di_nrext64_pad != 0)) { + XFS_CORRUPTION_ERROR( + "Bad log dinode large extent count format", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, padding 0x%x", + ldip->di_ino, xfs_has_large_extent_counts(mp), + ldip->di_nrext64_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_big_nextents; + anextents = ldip->di_big_anextents; + } else { + if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { + XFS_CORRUPTION_ERROR( + "Bad log dinode di_v3_pad", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, di_v3_pad 0x%llx", + ldip->di_ino, ldip->di_v3_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_nextents; + anextents = ldip->di_anextents; + } + + if (unlikely(nextents + anextents > ldip->di_nblocks)) { + XFS_CORRUPTION_ERROR("Bad log dinode extent counts", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", + ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, + anextents, ldip->di_nblocks); + return -EFSCORRUPTED; + } + + return 0; } STATIC int @@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2( if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for regular file", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } @@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2( if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE) && (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for directory", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; + + error = xlog_dinode_verify_extent_counts(mp, ldip); + if (error) goto out_release; - } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode fork offset", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + "Bad inode 0x%llx, di_forkoff 0x%x", + in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, + mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); + "Bad inode 0x%llx log dinode size 0x%x", + in_f->ilf_ino, item->ri_buf[1].i_len); error = -EFSCORRUPTED; goto out_release; } @@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2( ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); + (len == xlog_calc_iovec_len(in_f->ilf_dsize))); switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: @@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2( } len = item->ri_buf[attr_index].i_len; src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); + ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { case XFS_ILOG_ADATA: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 83481005317a..0e5cb7936206 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -15,6 +15,8 @@ #include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -35,8 +37,6 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include <linux/mount.h> #include <linux/namei.h> @@ -813,6 +813,9 @@ xfs_bulk_ireq_setup( if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) return -ECANCELED; + if (hdr->flags & XFS_BULK_IREQ_NREXT64) + breq->flags |= XFS_IBULK_NREXT64; + return 0; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index ca25ed89b706..2f54b701eead 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -17,6 +17,8 @@ #include "xfs_itable.h" #include "xfs_fsops.h" #include "xfs_rtalloc.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_ioctl.h" #include "xfs_ioctl32.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e552ce541ec2..5a393259a3a3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -251,6 +251,8 @@ xfs_iomap_write_direct( return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, nr_exts); if (error) goto out_trans_cancel; @@ -402,7 +404,7 @@ xfs_iomap_prealloc_size( */ plen = prev.br_blockcount; while (xfs_iext_prev_extent(ifp, &ncur, &got)) { - if (plen > MAXEXTLEN / 2 || + if (plen > XFS_MAX_BMBT_EXTLEN / 2 || isnullstartblock(got.br_startblock) || got.br_startoff + got.br_blockcount != prev.br_startoff || got.br_startblock + got.br_blockcount != prev.br_startblock) @@ -414,23 +416,23 @@ xfs_iomap_prealloc_size( /* * If the size of the extents is greater than half the maximum extent * length, then use the current offset as the basis. This ensures that - * for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width - * alignment of real extents. + * for large files the preallocation size always extends to + * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe + * unit/width alignment of real extents. */ alloc_blocks = plen * 2; - if (alloc_blocks > MAXEXTLEN) + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); qblocks = alloc_blocks; /* - * MAXEXTLEN is not a power of two value but we round the prealloc down - * to the nearest power of two value after throttling. To prevent the - * round down from unconditionally reducing the maximum supported - * prealloc size, we round up first, apply appropriate throttling, - * round down and cap the value to MAXEXTLEN. + * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc + * down to the nearest power of two value after throttling. To prevent + * the round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, round + * down and cap the value to XFS_BMBT_MAX_EXTLEN. */ - alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); freesp = percpu_counter_read_positive(&mp->m_fdblocks); @@ -478,14 +480,14 @@ xfs_iomap_prealloc_size( */ if (alloc_blocks) alloc_blocks = rounddown_pow_of_two(alloc_blocks); - if (alloc_blocks > MAXEXTLEN) - alloc_blocks = MAXEXTLEN; + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) + alloc_blocks = XFS_MAX_BMBT_EXTLEN; /* * If we are still trying to allocate more space than is * available, squash the prealloc hard. This can happen if we * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. + * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN. */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; @@ -555,6 +557,9 @@ xfs_iomap_write_unwritten( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b34e8e4344a8..e912b7fee714 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -13,6 +13,8 @@ #include "xfs_inode.h" #include "xfs_acl.h" #include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" #include "xfs_trace.h" @@ -209,7 +211,6 @@ xfs_generic_create( if (unlikely(error)) goto out_cleanup_inode; -#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) @@ -220,7 +221,6 @@ xfs_generic_create( if (error) goto out_cleanup_inode; } -#endif xfs_setup_iops(ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c08c79d9e311..f74c9fff72bb 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -64,6 +64,7 @@ xfs_bulkstat_one_int( struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; struct xfs_bulkstat *buf = bc->buf; + xfs_extnum_t nextents; int error = -EINVAL; if (xfs_internal_inum(mp, ino)) @@ -102,7 +103,13 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = ip->i_extsize; - buf->bs_extents = xfs_ifork_nextents(&ip->i_df); + + nextents = xfs_ifork_nextents(&ip->i_df); + if (!(bc->breq->flags & XFS_IBULK_NREXT64)) + buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL); + else + buf->bs_extents64 = nextents; + xfs_bulkstat_health(ip, buf); buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); buf->bs_forkoff = XFS_IFORK_BOFF(ip); @@ -256,6 +263,7 @@ xfs_bulkstat( .breq = breq, }; struct xfs_trans *tp; + unsigned int iwalk_flags = 0; int error; if (breq->mnt_userns != &init_user_ns) { @@ -279,7 +287,10 @@ xfs_bulkstat( if (error) goto out; - error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + if (breq->flags & XFS_IBULK_SAME_AG) + iwalk_flags |= XFS_IWALK_SAME_AG; + + error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); out: diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 7078d10c9b12..e2d0eba43f35 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -17,7 +17,10 @@ struct xfs_ibulk { }; /* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) +#define XFS_IBULK_SAME_AG (1U << 0) + +/* Fill out the bs_extents64 field if set. */ +#define XFS_IBULK_NREXT64 (1U << 1) /* * Advance the user buffer pointer by one record of the given size. If the diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 37a795f03267..83699089755e 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, unsigned int inode_records, bool poll, void *data); /* Only iterate inodes within the same AG as @startino. */ -#define XFS_IWALK_SAME_AG (0x1) +#define XFS_IWALK_SAME_AG (1U << 0) #define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 499e15b24215..9dc748abdf33 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -49,7 +49,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclog, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp); STATIC void xlog_grant_push_ail( @@ -61,10 +60,6 @@ xlog_sync( struct xlog_in_core *iclog); #if defined(DEBUG) STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr); -STATIC void xlog_verify_grant_tail( struct xlog *log); STATIC void @@ -77,7 +72,6 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog); #else -#define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b) @@ -90,6 +84,62 @@ xlog_iclogs_empty( static int xfs_log_cover(struct xfs_mount *); +/* + * We need to make sure the buffer pointer returned is naturally aligned for the + * biggest basic data type we put into it. We have already accounted for this + * padding when sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + * + * We also add space for the xlog_op_header that describes this region in the + * log. This prepends the data region we return to the caller to copy their data + * into, so do all the static initialisation of the ophdr now. Because the ophdr + * is not 8 byte aligned, we have to be careful to ensure that we align the + * start of the buffer such that the region we return to the call is 8 byte + * aligned and packed against the tail of the ophdr. + */ +void * +xlog_prepare_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + struct xlog_op_header *oph; + uint32_t len; + void *buf; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + len = lv->lv_buf_len + sizeof(struct xlog_op_header); + if (!IS_ALIGNED(len, sizeof(uint64_t))) { + lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + sizeof(struct xlog_op_header); + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + oph = vec->i_addr; + oph->oh_clientid = XFS_TRANSACTION; + oph->oh_res2 = 0; + oph->oh_flags = 0; + + buf = vec->i_addr + sizeof(struct xlog_op_header); + ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t))); + + *vecp = vec; + return buf; +} + static void xlog_grant_sub_space( struct xlog *log, @@ -322,30 +372,6 @@ xlog_grant_head_check( return error; } -static void -xlog_tic_reset_res(xlog_ticket_t *tic) -{ - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - tic->t_res_num_ophdrs = 0; -} - -static void -xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) -{ - if (tic->t_res_num == XLOG_TIC_LEN_MAX) { - /* add to overflow and start again */ - tic->t_res_o_flow += tic->t_res_arr_sum; - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - } - - tic->t_res_arr[tic->t_res_num].r_len = len; - tic->t_res_arr[tic->t_res_num].r_type = type; - tic->t_res_arr_sum += len; - tic->t_res_num++; -} - bool xfs_log_writable( struct xfs_mount *mp) @@ -395,8 +421,6 @@ xfs_log_regrant( xlog_grant_push_ail(log, tic->t_unit_res); tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - if (tic->t_cnt > 0) return 0; @@ -434,10 +458,9 @@ out_error: int xfs_log_reserve( struct xfs_mount *mp, - int unit_bytes, - int cnt, + int unit_bytes, + int cnt, struct xlog_ticket **ticp, - uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -445,15 +468,13 @@ xfs_log_reserve( int need_bytes; int error = 0; - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -901,12 +922,22 @@ xlog_write_unmount_record( struct xlog *log, struct xlog_ticket *ticket) { - struct xfs_unmount_log_format ulf = { - .magic = XLOG_UNMOUNT_TYPE, + struct { + struct xlog_op_header ophdr; + struct xfs_unmount_log_format ulf; + } unmount_rec = { + .ophdr = { + .oh_clientid = XFS_LOG, + .oh_tid = cpu_to_be32(ticket->t_tid), + .oh_flags = XLOG_UNMOUNT_TRANS, + }, + .ulf = { + .magic = XLOG_UNMOUNT_TYPE, + }, }; struct xfs_log_iovec reg = { - .i_addr = &ulf, - .i_len = sizeof(ulf), + .i_addr = &unmount_rec, + .i_len = sizeof(unmount_rec), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { @@ -914,10 +945,14 @@ xlog_write_unmount_record( .lv_iovecp = ®, }; + BUILD_BUG_ON((sizeof(struct xlog_op_header) + + sizeof(struct xfs_unmount_log_format)) != + sizeof(unmount_rec)); + /* account for space used by record data */ - ticket->t_curr_res -= sizeof(ulf); + ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &vec, ticket, reg.i_len); } /* @@ -933,7 +968,7 @@ xlog_unmount_write( struct xlog_ticket *tic = NULL; int error; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, 0); if (error) goto out_err; @@ -1584,9 +1619,6 @@ xlog_alloc_log( GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; -#ifdef DEBUG - log->l_iclog_bak[i] = &iclog->ic_header; -#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -1602,7 +1634,7 @@ xlog_alloc_log( iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -2111,63 +2143,11 @@ xlog_print_tic_res( struct xfs_mount *mp, struct xlog_ticket *ticket) { - uint i; - uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); - - /* match with XLOG_REG_TYPE_* in xfs_log.h */ -#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[] = { - REG_TYPE_STR(BFORMAT, "bformat"), - REG_TYPE_STR(BCHUNK, "bchunk"), - REG_TYPE_STR(EFI_FORMAT, "efi_format"), - REG_TYPE_STR(EFD_FORMAT, "efd_format"), - REG_TYPE_STR(IFORMAT, "iformat"), - REG_TYPE_STR(ICORE, "icore"), - REG_TYPE_STR(IEXT, "iext"), - REG_TYPE_STR(IBROOT, "ibroot"), - REG_TYPE_STR(ILOCAL, "ilocal"), - REG_TYPE_STR(IATTR_EXT, "iattr_ext"), - REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), - REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), - REG_TYPE_STR(QFORMAT, "qformat"), - REG_TYPE_STR(DQUOT, "dquot"), - REG_TYPE_STR(QUOTAOFF, "quotaoff"), - REG_TYPE_STR(LRHEADER, "LR header"), - REG_TYPE_STR(UNMOUNT, "unmount"), - REG_TYPE_STR(COMMIT, "commit"), - REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create"), - REG_TYPE_STR(RUI_FORMAT, "rui_format"), - REG_TYPE_STR(RUD_FORMAT, "rud_format"), - REG_TYPE_STR(CUI_FORMAT, "cui_format"), - REG_TYPE_STR(CUD_FORMAT, "cud_format"), - REG_TYPE_STR(BUI_FORMAT, "bui_format"), - REG_TYPE_STR(BUD_FORMAT, "bud_format"), - }; - BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); -#undef REG_TYPE_STR - xfs_warn(mp, "ticket reservation summary:"); - xfs_warn(mp, " unit res = %d bytes", - ticket->t_unit_res); - xfs_warn(mp, " current res = %d bytes", - ticket->t_curr_res); - xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", - ticket->t_res_arr_sum, ticket->t_res_o_flow); - xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", - ticket->t_res_num_ophdrs, ophdr_spc); - xfs_warn(mp, " ophdr + reg = %u bytes", - ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); - xfs_warn(mp, " num regions = %u", - ticket->t_res_num); - - for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes", i, - ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? - "bad-rtype" : res_type_str[r_type]), - ticket->t_res_arr[i].r_len); - } + xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res); + xfs_warn(mp, " original count = %d", ticket->t_ocnt); + xfs_warn(mp, " remaining count = %d", ticket->t_cnt); } /* @@ -2220,187 +2200,226 @@ xlog_print_trans( } } +static inline void +xlog_write_iovec( + struct xlog_in_core *iclog, + uint32_t *log_offset, + void *data, + uint32_t write_len, + int *bytes_left, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + ASSERT(*log_offset < iclog->ic_log->l_iclog_size); + ASSERT(*log_offset % sizeof(int32_t) == 0); + ASSERT(write_len % sizeof(int32_t) == 0); + + memcpy(iclog->ic_datap + *log_offset, data, write_len); + *log_offset += write_len; + *bytes_left -= write_len; + (*record_cnt)++; + *data_cnt += write_len; +} + /* - * Calculate the potential space needed by the log vector. We may need a start - * record, and each region gets its own struct xlog_op_header and may need to be - * double word aligned. + * Write log vectors into a single iclog which is guaranteed by the caller + * to have enough space to write the entire log vector into. */ -static int -xlog_write_calc_vec_length( +static void +xlog_write_full( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector, - uint optype) + struct xlog_in_core *iclog, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - struct xfs_log_vec *lv; - int headers = 0; - int len = 0; - int i; - - if (optype & XLOG_START_TRANS) - headers++; - - for (lv = log_vector; lv; lv = lv->lv_next) { - /* we don't write ordered log vectors */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) - continue; + int index; - headers += lv->lv_niovecs; + ASSERT(*log_offset + *len <= iclog->ic_size || + iclog->ic_state == XLOG_STATE_WANT_SYNC); - for (i = 0; i < lv->lv_niovecs; i++) { - struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + /* + * Ordered log vectors have no regions to write so this + * loop will naturally skip them. + */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + struct xlog_op_header *ophdr = reg->i_addr; - len += vecp->i_len; - xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); - } + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + xlog_write_iovec(iclog, log_offset, reg->i_addr, + reg->i_len, len, record_cnt, data_cnt); } - - ticket->t_res_num_ophdrs += headers; - len += headers * sizeof(struct xlog_op_header); - - return len; -} - -static void -xlog_write_start_rec( - struct xlog_op_header *ophdr, - struct xlog_ticket *ticket) -{ - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_len = 0; - ophdr->oh_flags = XLOG_START_TRANS; - ophdr->oh_res2 = 0; } -static xlog_op_header_t * -xlog_write_setup_ophdr( - struct xlog *log, - struct xlog_op_header *ophdr, +static int +xlog_write_get_more_iclog_space( struct xlog_ticket *ticket, - uint flags) + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t len, + uint32_t *record_cnt, + uint32_t *data_cnt) { - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_res2 = 0; - - /* are we copying a commit or unmount record? */ - ophdr->oh_flags = flags; + struct xlog_in_core *iclog = *iclogp; + struct xlog *log = iclog->ic_log; + int error; - /* - * We've seen logs corrupted with bad transaction client ids. This - * makes sure that XFS doesn't generate them on. Turn this into an EIO - * and shut down the filesystem. - */ - switch (ophdr->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, - ophdr->oh_clientid, ticket); - return NULL; - } + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + if (error) + return error; - return ophdr; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + log_offset); + if (error) + return error; + *record_cnt = 0; + *data_cnt = 0; + *iclogp = iclog; + return 0; } /* - * Set up the parameters of the region copy into the log. This has - * to handle region write split across multiple log buffers - this - * state is kept external to this function so that this code can - * be written in an obvious, self documenting manner. + * Write log vectors into a single iclog which is smaller than the current chain + * length. We write until we cannot fit a full record into the remaining space + * and then stop. We return the log vector that is to be written that cannot + * wholly fit in the iclog. */ static int -xlog_write_setup_copy( +xlog_write_partial( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xlog_op_header *ophdr, - int space_available, - int space_required, - int *copy_off, - int *copy_len, - int *last_was_partial_copy, - int *bytes_consumed) -{ - int still_to_copy; - - still_to_copy = space_required - *bytes_consumed; - *copy_off = *bytes_consumed; - - if (still_to_copy <= space_available) { - /* write of region completes here */ - *copy_len = still_to_copy; - ophdr->oh_len = cpu_to_be32(*copy_len); - if (*last_was_partial_copy) - ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - *last_was_partial_copy = 0; - *bytes_consumed = 0; - return 0; - } + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + struct xlog_in_core *iclog = *iclogp; + struct xlog_op_header *ophdr; + int index = 0; + uint32_t rlen; + int error; - /* partial write of region, needs extra log op header reservation */ - *copy_len = space_available; - ophdr->oh_len = cpu_to_be32(*copy_len); - ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (*last_was_partial_copy) - ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; - *bytes_consumed += *copy_len; - (*last_was_partial_copy)++; + /* walk the logvec, copying until we run out of space in the iclog */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + uint32_t reg_offset = 0; - /* account for new log op header */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); - ticket->t_res_num_ophdrs++; + /* + * The first region of a continuation must have a non-zero + * length otherwise log recovery will just skip over it and + * start recovering from the next opheader it finds. Because we + * mark the next opheader as a continuation, recovery will then + * incorrectly add the continuation to the previous region and + * that breaks stuff. + * + * Hence if there isn't space for region data after the + * opheader, then we need to start afresh with a new iclog. + */ + if (iclog->ic_size - *log_offset <= + sizeof(struct xlog_op_header)) { + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, *len, record_cnt, + data_cnt); + if (error) + return error; + } - return sizeof(struct xlog_op_header); -} + ophdr = reg->i_addr; + rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset); -static int -xlog_write_copy_finish( - struct xlog *log, - struct xlog_in_core *iclog, - uint flags, - int *record_cnt, - int *data_cnt, - int *partial_copy, - int *partial_copy_len, - int log_offset) -{ - int error; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header)); + if (rlen != reg->i_len) + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + + xlog_write_iovec(iclog, log_offset, reg->i_addr, + rlen, len, record_cnt, data_cnt); + + /* If we wrote the whole region, move to the next. */ + if (rlen == reg->i_len) + continue; - if (*partial_copy) { /* - * This iclog has already been marked WANT_SYNC by - * xlog_state_get_iclog_space. + * We now have a partially written iovec, but it can span + * multiple iclogs so we loop here. First we release the iclog + * we currently have, then we get a new iclog and add a new + * opheader. Then we continue copying from where we were until + * we either complete the iovec or fill the iclog. If we + * complete the iovec, then we increment the index and go right + * back to the top of the outer loop. if we fill the iclog, we + * run the inner loop again. + * + * This is complicated by the tail of a region using all the + * space in an iclog and hence requiring us to release the iclog + * and get a new one before returning to the outer loop. We must + * always guarantee that we exit this inner loop with at least + * space for log transaction opheaders left in the current + * iclog, hence we cannot just terminate the loop at the end + * of the of the continuation. So we loop while there is no + * space left in the current iclog, and check for the end of the + * continuation after getting a new iclog. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - goto release_iclog; - } + do { + /* + * Ensure we include the continuation opheader in the + * space we need in the new iclog by adding that size + * to the length we require. This continuation opheader + * needs to be accounted to the ticket as the space it + * consumes hasn't been accounted to the lv we are + * writing. + */ + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, + *len + sizeof(struct xlog_op_header), + record_cnt, data_cnt); + if (error) + return error; - *partial_copy = 0; - *partial_copy_len = 0; + ophdr = iclog->ic_datap + *log_offset; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = XFS_TRANSACTION; + ophdr->oh_res2 = 0; + ophdr->oh_flags = XLOG_WAS_CONT_TRANS; - if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) - return 0; + ticket->t_curr_res -= sizeof(struct xlog_op_header); + *log_offset += sizeof(struct xlog_op_header); + *data_cnt += sizeof(struct xlog_op_header); - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; + /* + * If rlen fits in the iclog, then end the region + * continuation. Otherwise we're going around again. + */ + reg_offset += rlen; + rlen = reg->i_len - reg_offset; + if (rlen <= iclog->ic_size - *log_offset) + ophdr->oh_flags |= XLOG_END_TRANS; + else + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - xlog_is_shutdown(log)); -release_iclog: - error = xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - return error; + rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset); + ophdr->oh_len = cpu_to_be32(rlen); + + xlog_write_iovec(iclog, log_offset, + reg->i_addr + reg_offset, + rlen, len, record_cnt, data_cnt); + + } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS); + } + + /* + * No more iovecs remain in this logvec so return the next log vec to + * the caller so it can go back to fast path copying. + */ + *iclogp = iclog; + return 0; } /* @@ -2449,27 +2468,16 @@ xlog_write( struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - uint optype) + uint32_t len) + { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; - struct xfs_log_iovec *vecp = lv->lv_iovecp; - int index = 0; - int len; - int partial_copy = 0; - int partial_copy_len = 0; - int contwr = 0; - int record_cnt = 0; - int data_cnt = 0; + uint32_t record_cnt = 0; + uint32_t data_cnt = 0; int error = 0; + int log_offset; - /* - * If this is a commit or unmount transaction, we don't need a start - * record to be written. We do, however, have to account for the - * commit or unmount header that gets written. Hence we always have - * to account for an extra xlog_op_header here. - */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2477,144 +2485,54 @@ xlog_write( xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - len = xlog_write_calc_vec_length(ticket, log_vector, optype); - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - void *ptr; - int log_offset; - - error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset); - if (error) - return error; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &log_offset); + if (error) + return error; - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = iclog->ic_datap + log_offset; + ASSERT(log_offset <= iclog->ic_size - 1); - /* - * If we have a context pointer, pass it the first iclog we are - * writing to so it can record state needed for iclog write - * ordering. - */ - if (ctx) { - xlog_cil_set_ctx_write_state(ctx, iclog); - ctx = NULL; - } + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. + */ + if (ctx) + xlog_cil_set_ctx_write_state(ctx, iclog); + while (lv) { /* - * This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). + * If the entire log vec does not fit in the iclog, punt it to + * the partial copy loop which can handle this case. */ - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - struct xfs_log_iovec *reg; - struct xlog_op_header *ophdr; - int copy_len; - int copy_off; - bool ordered = false; - bool wrote_start_rec = false; - - /* ordered log vectors have no regions to write */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { - ASSERT(lv->lv_niovecs == 0); - ordered = true; - goto next_lv; - } - - reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - - /* - * Before we start formatting log vectors, we need to - * write a start record. Only do this for the first - * iclog we write to. - */ - if (optype & XLOG_START_TRANS) { - xlog_write_start_rec(ptr, ticket); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - optype &= ~XLOG_START_TRANS; - wrote_start_rec = true; - } - - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); - if (!ophdr) - return -EIO; - - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - - len += xlog_write_setup_copy(ticket, ophdr, - iclog->ic_size-log_offset, - reg->i_len, - ©_off, ©_len, - &partial_copy, - &partial_copy_len); - xlog_verify_dest_ptr(log, ptr); - - /* - * Copy region. - * - * Unmount records just log an opheader, so can have - * empty payloads with no data region to copy. Hence we - * only copy the payload if the vector says it has data - * to copy. - */ - ASSERT(copy_len >= 0); - if (copy_len > 0) { - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - copy_len); - } - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - if (wrote_start_rec) { - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - } - data_cnt += contwr ? copy_len : 0; - - error = xlog_write_copy_finish(log, iclog, optype, - &record_cnt, &data_cnt, - &partial_copy, - &partial_copy_len, - log_offset); - if (error) + if (lv->lv_niovecs && + lv->lv_bytes > iclog->ic_size - log_offset) { + error = xlog_write_partial(lv, ticket, &iclog, + &log_offset, &len, &record_cnt, + &data_cnt); + if (error) { + /* + * We have no iclog to release, so just return + * the error immediately. + */ return error; - - /* - * if we had a partial copy, we need to get more iclog - * space but we don't want to increment the region - * index because there is still more is this region to - * write. - * - * If we completed writing this region, and we flushed - * the iclog (indicated by resetting of the record - * count), then we also need to get more log space. If - * this was the last record, though, we are done and - * can just return. - */ - if (partial_copy) - break; - - if (++index == lv->lv_niovecs) { -next_lv: - lv = lv->lv_next; - index = 0; - if (lv) - vecp = lv->lv_iovecp; - } - if (record_cnt == 0 && !ordered) { - if (!lv) - return 0; - break; } + } else { + xlog_write_full(lv, ticket, iclog, &log_offset, + &len, &record_cnt, &data_cnt); } + lv = lv->lv_next; } - ASSERT(len == 0); + /* + * We've already been guaranteed that the last writes will fit inside + * the current iclog, and hence it will already have the space used by + * those writes accounted to it. Hence we do not need to update the + * iclog with the number of bytes written here. + */ spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + xlog_state_finish_copy(log, iclog, record_cnt, 0); error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); @@ -2971,7 +2889,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclogp, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp) { int log_offset; @@ -3008,9 +2925,6 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - xlog_tic_add_region(ticket, - log->l_iclog_hsize, - XLOG_REG_TYPE_LRHEADER); head->h_cycle = cpu_to_be32(log->l_curr_cycle); head->h_lsn = cpu_to_be64( xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); @@ -3052,13 +2966,10 @@ restart: * iclogs (to mark it taken), this particular iclog will release/sync * to disk in xlog_write(). */ - if (len <= iclog->ic_size - iclog->ic_offset) { - *continued_write = 0; + if (len <= iclog->ic_size - iclog->ic_offset) iclog->ic_offset += len; - } else { - *continued_write = 1; + else xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - } *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); @@ -3090,7 +3001,6 @@ xfs_log_ticket_regrant( xlog_grant_sub_space(log, &log->l_write_head.grant, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); trace_xfs_log_ticket_regrant_sub(log, ticket); @@ -3101,7 +3011,6 @@ xfs_log_ticket_regrant( trace_xfs_log_ticket_regrant_exit(log, ticket); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); } xfs_log_ticket_put(ticket); @@ -3591,7 +3500,6 @@ xlog_ticket_alloc( struct xlog *log, int unit_bytes, int cnt, - char client, bool permanent) { struct xlog_ticket *tic; @@ -3609,40 +3517,14 @@ xlog_ticket_alloc( tic->t_cnt = cnt; tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); - tic->t_clientid = client; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - xlog_tic_reset_res(tic); - return tic; } #if defined(DEBUG) /* - * Make sure that the destination ptr is within the valid data region of - * one of the iclogs. This uses backup pointers stored in a different - * part of the log in case we trash the log structure. - */ -STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr) -{ - int i; - int good_ptr = 0; - - for (i = 0; i < log->l_iclog_bufs; i++) { - if (ptr >= log->l_iclog_bak[i] && - ptr <= log->l_iclog_bak[i] + log->l_iclog_size) - good_ptr++; - } - - if (!good_ptr) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); -} - -/* * Check to make sure the grant write head didn't just over lap the tail. If * the cycles are the same, we can't be overlapping. Otherwise, make sure that * the cycles differ by exactly one and check the byte count. @@ -3769,7 +3651,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3780,11 +3662,12 @@ xlog_verify_iclog( iclog->ic_header.h_cycle_data[idx]); } } - if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, - "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", - __func__, clientid, ophead, + "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx", + __func__, i, clientid, ophead, (unsigned long)field_offset); + } /* check length */ p = &ophead->oh_len; @@ -3792,8 +3675,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((uintptr_t)&ophead->oh_len - - (uintptr_t)iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,7 +3711,7 @@ xlog_verify_iclog( bool xlog_force_shutdown( struct xlog *log, - int shutdown_flags) + uint32_t shutdown_flags) { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); @@ -3995,3 +3877,44 @@ xlog_drop_incompat_feat( { up_read(&log->l_incompat_users); } + +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +int +xfs_attr_use_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_once(mp, +"EXPERIMENTAL logged extended attributes feature added. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index dc1b77b92fc1..252b098cde1f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -21,46 +21,59 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) -static inline void * -xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, - uint type) +/* + * Calculate the log iovec length for a given user buffer length. Intended to be + * used by ->iop_size implementations when sizing buffers of arbitrary + * alignments. + */ +static inline int +xlog_calc_iovec_len(int len) { - struct xfs_log_iovec *vec = *vecp; + return roundup(len, sizeof(uint32_t)); +} - if (vec) { - ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); - vec++; - } else { - vec = &lv->lv_iovecp[0]; +void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type); + +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, + int data_len) +{ + struct xlog_op_header *oph = vec->i_addr; + int len; + + /* + * Always round up the length to the correct alignment so callers don't + * need to know anything about this log vec layout requirement. This + * means we have to zero the area the data to be written does not cover. + * This is complicated by fact the payload region is offset into the + * logvec region by the opheader that tracks the payload. + */ + len = xlog_calc_iovec_len(data_len); + if (len - data_len != 0) { + char *buf = vec->i_addr + sizeof(struct xlog_op_header); + + memset(buf + data_len, 0, len - data_len); } - vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; + /* + * The opheader tracks aligned payload length, whilst the logvec tracks + * the overall region length. + */ + oph->oh_len = cpu_to_be32(len); - ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + len += sizeof(struct xlog_op_header); + lv->lv_buf_len += len; + lv->lv_bytes += len; + vec->i_len = len; - *vecp = vec; - return vec->i_addr; + /* Catch buffer overruns */ + ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size); } /* - * We need to make sure the next buffer is naturally aligned for the biggest - * basic data type we put into it. We already accounted for this padding when - * sizing the buffer. - * - * However, this padding does not get written into the log, and hence we have to - * track the space used by the log vectors separately to prevent log space hangs - * due to inaccurate accounting (i.e. a leak) of the used log space through the - * CIL context ticket. + * Copy the amount of data requested by the caller into a new log iovec. */ -static inline void -xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) -{ - lv->lv_buf_len += round_up(len, sizeof(uint64_t)); - lv->lv_bytes += len; - vec->i_len = len; -} - static inline void * xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type, void *data, int len) @@ -117,15 +130,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); -void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_reserve(struct xfs_mount *mp, - int length, - int count, - struct xlog_ticket **ticket, - uint8_t clientid, - bool permanent); -int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -void xfs_log_unmount(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); +int xfs_log_reserve(struct xfs_mount *mp, int length, int count, + struct xlog_ticket **ticket, bool permanent); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); +void xfs_log_unmount(struct xfs_mount *mp); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); @@ -140,9 +149,10 @@ void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); -bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); +bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); void xlog_use_incompat_feat(struct xlog *log); void xlog_drop_incompat_feat(struct xlog *log); +int xfs_attr_use_log_assist(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c9f55e4f0957..db6cb7800251 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -37,7 +37,7 @@ xlog_cil_ticket_alloc( { struct xlog_ticket *tic; - tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0); + tic = xlog_ticket_alloc(log, 0, 1, 0); /* * set the current reservation to zero so we know to steal the basic @@ -48,6 +48,38 @@ xlog_cil_ticket_alloc( } /* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +static bool +xlog_item_in_current_chkpt( + struct xfs_cil *cil, + struct xfs_log_item *lip) +{ + if (list_empty(&lip->li_cil)) + return false; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + return lip->li_seq == READ_ONCE(cil->xc_current_sequence); +} + +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip); +} + +/* * Unavoidable forward declaration - xlog_cil_push_work() calls * xlog_cil_ctx_alloc() itself. */ @@ -103,39 +135,6 @@ xlog_cil_iovec_space( } /* - * shadow buffers can be large, so we need to use kvmalloc() here to ensure - * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall - * back to vmalloc, so we can't actually do anything useful with gfp flags to - * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do - * direct reclaim and compaction in the slow path, both of which are - * horrendously expensive. We just want kmalloc to fail fast and fall back to - * vmalloc if it can't get somethign straight away from the free lists or buddy - * allocator. Hence we have to open code kvmalloc outselves here. - * - * Also, we are in memalloc_nofs_save task context here, so despite the use of - * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This - * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets - * just all pretend this is a GFP_KERNEL context operation.... - */ -static inline void * -xlog_cil_kvmalloc( - size_t buf_size) -{ - gfp_t flags = GFP_KERNEL; - void *p; - - flags &= ~__GFP_DIRECT_RECLAIM; - flags |= __GFP_NOWARN | __GFP_NORETRY; - do { - p = kmalloc(buf_size, flags); - if (!p) - p = vmalloc(buf_size); - } while (!p); - - return p; -} - -/* * Allocate or pin log vector buffers for CIL insertion. * * The CIL currently uses disposable buffers for copying a snapshot of the @@ -214,13 +213,20 @@ xlog_cil_alloc_shadow_bufs( } /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. + * We 64-bit align the length of each iovec so that the start of + * the next one is naturally aligned. We'll need to account for + * that slack space here. + * + * We also add the xlog_op_header to each region when + * formatting, but that's not accounted to the size of the item + * at this point. Hence we'll need an addition number of bytes + * for each vector to hold an opheader. + * + * Then round nbytes up to 64-bit alignment so that the initial + * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * sizeof(uint64_t); + nbytes += niovecs * + (sizeof(uint64_t) + sizeof(struct xlog_op_header)); nbytes = round_up(nbytes, sizeof(uint64_t)); /* @@ -244,7 +250,7 @@ xlog_cil_alloc_shadow_bufs( * storage. */ kmem_free(lip->li_lv_shadow); - lv = xlog_cil_kvmalloc(buf_size); + lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); @@ -277,22 +283,18 @@ xlog_cil_alloc_shadow_bufs( /* * Prepare the log item for insertion into the CIL. Calculate the difference in - * log space and vectors it will consume, and if it is a new item pin it as - * well. + * log space it will consume, and if it is a new item pin it as well. */ STATIC void xfs_cil_prepare_item( struct xlog *log, struct xfs_log_vec *lv, struct xfs_log_vec *old_lv, - int *diff_len, - int *diff_iovecs) + int *diff_len) { /* Account for the new LV being passed in */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; - *diff_iovecs += lv->lv_niovecs; - } /* * If there is no old LV, this is the first time we've seen the item in @@ -309,7 +311,6 @@ xfs_cil_prepare_item( ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; - *diff_iovecs -= old_lv->lv_niovecs; lv->lv_item->li_lv_shadow = old_lv; } @@ -358,12 +359,10 @@ static void xlog_cil_insert_format_items( struct xlog *log, struct xfs_trans *tp, - int *diff_len, - int *diff_iovecs) + int *diff_len) { struct xfs_log_item *lip; - /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); @@ -406,7 +405,6 @@ xlog_cil_insert_format_items( * set the item up as though it is a new insertion so * that the space reservation accounting is correct. */ - *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; /* Ensure the lv is set up according to ->iop_size */ @@ -431,7 +429,7 @@ xlog_cil_insert_format_items( ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); insert: - xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + xfs_cil_prepare_item(log, lv, old_lv, diff_len); } } @@ -445,13 +443,13 @@ insert: static void xlog_cil_insert_items( struct xlog *log, - struct xfs_trans *tp) + struct xfs_trans *tp, + uint32_t released_space) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; - int diff_iovecs = 0; int iclog_space; int iovhdr_res = 0, split_res = 0, ctx_res = 0; @@ -461,15 +459,10 @@ xlog_cil_insert_items( * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ - xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + xlog_cil_insert_format_items(log, tp, &len); spin_lock(&cil->xc_cil_lock); - /* account for space used by new iovec headers */ - iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t); - len += iovhdr_res; - ctx->nvecs += diff_iovecs; - /* attach the transaction to the CIL if it has any busy extents */ if (!list_empty(&tp->t_busy)) list_splice_init(&tp->t_busy, &ctx->busy_extents); @@ -500,7 +493,9 @@ xlog_cil_insert_items( ASSERT(tp->t_ticket->t_curr_res >= len); } tp->t_ticket->t_curr_res -= len; + tp->t_ticket->t_curr_res += released_space; ctx->space_used += len; + ctx->space_used -= released_space; /* * If we've overrun the reservation, dump the tx details before we move @@ -822,7 +817,8 @@ restart: static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, - struct xfs_log_vec *chain) + struct xfs_log_vec *chain, + uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; int error; @@ -830,7 +826,7 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS); + return xlog_write(log, ctx, chain, ctx->ticket, chain_len); } /* @@ -844,9 +840,14 @@ xlog_cil_write_commit_record( struct xfs_cil_ctx *ctx) { struct xlog *log = ctx->cil->xc_log; + struct xlog_op_header ophdr = { + .oh_clientid = XFS_TRANSACTION, + .oh_tid = cpu_to_be32(ctx->ticket->t_tid), + .oh_flags = XLOG_COMMIT_TRANS, + }; struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, + .i_addr = &ophdr, + .i_len = sizeof(struct xlog_op_header), .i_type = XLOG_REG_TYPE_COMMIT, }; struct xfs_log_vec vec = { @@ -862,12 +863,138 @@ xlog_cil_write_commit_record( if (error) return error; - error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); + /* account for space used by record data */ + ctx->ticket->t_curr_res -= reg.i_len; + error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } +struct xlog_cil_trans_hdr { + struct xlog_op_header oph[2]; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr[2]; +}; + +/* + * Build a checkpoint transaction header to begin the journal transaction. We + * need to account for the space used by the transaction header here as it is + * not accounted for in xlog_write(). + * + * This is the only place we write a transaction header, so we also build the + * log opheaders that indicate the start of a log transaction and wrap the + * transaction header. We keep the start record in it's own log vector rather + * than compacting them into a single region as this ends up making the logic + * in xlog_write() for handling empty opheaders for start, commit and unmount + * records much simpler. + */ +static void +xlog_cil_build_trans_hdr( + struct xfs_cil_ctx *ctx, + struct xlog_cil_trans_hdr *hdr, + struct xfs_log_vec *lvhdr, + int num_iovecs) +{ + struct xlog_ticket *tic = ctx->ticket; + __be32 tid = cpu_to_be32(tic->t_tid); + + memset(hdr, 0, sizeof(*hdr)); + + /* Log start record */ + hdr->oph[0].oh_tid = tid; + hdr->oph[0].oh_clientid = XFS_TRANSACTION; + hdr->oph[0].oh_flags = XLOG_START_TRANS; + + /* log iovec region pointer */ + hdr->lhdr[0].i_addr = &hdr->oph[0]; + hdr->lhdr[0].i_len = sizeof(struct xlog_op_header); + hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER; + + /* log opheader */ + hdr->oph[1].oh_tid = tid; + hdr->oph[1].oh_clientid = XFS_TRANSACTION; + hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header)); + + /* transaction header in host byte order format */ + hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + hdr->thdr.th_type = XFS_TRANS_CHECKPOINT; + hdr->thdr.th_tid = tic->t_tid; + hdr->thdr.th_num_items = num_iovecs; + + /* log iovec region pointer */ + hdr->lhdr[1].i_addr = &hdr->oph[1]; + hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) + + sizeof(struct xfs_trans_header); + hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR; + + lvhdr->lv_niovecs = 2; + lvhdr->lv_iovecp = &hdr->lhdr[0]; + lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; + lvhdr->lv_next = ctx->lv_chain; + + tic->t_curr_res -= lvhdr->lv_bytes; +} + +/* + * Pull all the log vectors off the items in the CIL, and remove the items from + * the CIL. We don't need the CIL lock here because it's only needed on the + * transaction commit side which is currently locked out by the flush lock. + * + * If a log item is marked with a whiteout, we do not need to write it to the + * journal and so we just move them to the whiteout list for the caller to + * dispose of appropriately. + */ +static void +xlog_cil_build_lv_chain( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx, + struct list_head *whiteouts, + uint32_t *num_iovecs, + uint32_t *num_bytes) +{ + struct xfs_log_vec *lv = NULL; + + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + + if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { + list_move(&item->li_cil, whiteouts); + trace_xfs_cil_whiteout_skip(item); + continue; + } + + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + *num_iovecs += lv->lv_niovecs; + + /* we don't write ordered log vectors */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + *num_bytes += lv->lv_bytes; + } +} + +static void +xlog_cil_cleanup_whiteouts( + struct list_head *whiteouts) +{ + while (!list_empty(whiteouts)) { + struct xfs_log_item *item = list_first_entry(whiteouts, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + trace_xfs_cil_whiteout_unpin(item); + item->li_ops->iop_unpin(item, 1); + } +} + /* * Push the Committed Item List to the log. * @@ -890,16 +1017,15 @@ xlog_cil_push_work( container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; - struct xfs_log_vec *lv; struct xfs_cil_ctx *new_ctx; - struct xlog_ticket *tic; - int num_iovecs; + int num_iovecs = 0; + int num_bytes = 0; int error = 0; - struct xfs_trans_header thdr; - struct xfs_log_iovec lhdr; + struct xlog_cil_trans_hdr thdr; struct xfs_log_vec lvhdr = { NULL }; xfs_csn_t push_seq; bool push_commit_stable; + LIST_HEAD (whiteouts); new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -968,28 +1094,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - /* - * Pull all the log vectors off the items in the CIL, and remove the - * items from the CIL. We don't need the CIL lock here because it's only - * needed on the transaction commit side which is currently locked out - * by the flush lock. - */ - lv = NULL; - num_iovecs = 0; - while (!list_empty(&cil->xc_cil)) { - struct xfs_log_item *item; - - item = list_first_entry(&cil->xc_cil, - struct xfs_log_item, li_cil); - list_del_init(&item->li_cil); - if (!ctx->lv_chain) - ctx->lv_chain = item->li_lv; - else - lv->lv_next = item->li_lv; - lv = item->li_lv; - item->li_lv = NULL; - num_iovecs += lv->lv_niovecs; - } + xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1025,26 +1130,11 @@ xlog_cil_push_work( * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). - * - * The LSN we need to pass to the log items on transaction commit is - * the LSN reported by the first log vector write. If we use the commit - * record lsn then we can move the tail beyond the grant write head. */ - tic = ctx->ticket; - thdr.th_magic = XFS_TRANS_HEADER_MAGIC; - thdr.th_type = XFS_TRANS_CHECKPOINT; - thdr.th_tid = tic->t_tid; - thdr.th_num_items = num_iovecs; - lhdr.i_addr = &thdr; - lhdr.i_len = sizeof(xfs_trans_header_t); - lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; - tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); - - lvhdr.lv_niovecs = 1; - lvhdr.lv_iovecp = &lhdr; - lvhdr.lv_next = ctx->lv_chain; - - error = xlog_cil_write_chain(ctx, &lvhdr); + xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); + num_bytes += lvhdr.lv_bytes; + + error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes); if (error) goto out_abort_free_ticket; @@ -1052,7 +1142,7 @@ xlog_cil_push_work( if (error) goto out_abort_free_ticket; - xfs_log_ticket_ungrant(log, tic); + xfs_log_ticket_ungrant(log, ctx->ticket); /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs @@ -1107,6 +1197,7 @@ xlog_cil_push_work( /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); + xlog_cil_cleanup_whiteouts(&whiteouts); return; out_skip: @@ -1116,8 +1207,9 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_ungrant(log, tic); + xfs_log_ticket_ungrant(log, ctx->ticket); ASSERT(xlog_is_shutdown(log)); + xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { xlog_cil_committed(ctx); return; @@ -1267,6 +1359,43 @@ xlog_cil_empty( } /* + * If there are intent done items in this transaction and the related intent was + * committed in the current (same) CIL checkpoint, we don't need to write either + * the intent or intent done item to the journal as the change will be + * journalled atomically within this checkpoint. As we cannot remove items from + * the CIL here, mark the related intent with a whiteout so that the CIL push + * can remove it rather than writing it to the journal. Then remove the intent + * done item from the current transaction and release it so it doesn't get put + * into the CIL at all. + */ +static uint32_t +xlog_cil_process_intents( + struct xfs_cil *cil, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *ilip, *next; + uint32_t len = 0; + + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE)) + continue; + + ilip = lip->li_ops->iop_intent(lip); + if (!ilip || !xlog_item_in_current_chkpt(cil, ilip)) + continue; + set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); + trace_xfs_cil_whiteout_mark(ilip); + len += ilip->li_lv->lv_bytes; + kmem_free(ilip->li_lv); + ilip->li_lv = NULL; + + xfs_trans_del_item(lip); + lip->li_ops->iop_release(lip); + } + return len; +} + +/* * Commit a transaction with the given vector to the Committed Item List. * * To do this, we need to format the item, pin it in memory if required and @@ -1288,6 +1417,7 @@ xlog_cil_commit( { struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; + uint32_t released_space = 0; /* * Do all necessary memory allocation before we lock the CIL. @@ -1299,7 +1429,10 @@ xlog_cil_commit( /* lock out background commit */ down_read(&cil->xc_ctx_lock); - xlog_cil_insert_items(log, tp); + if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE) + released_space = xlog_cil_process_intents(cil, tp); + + xlog_cil_insert_items(log, tp, released_space); if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -1456,32 +1589,6 @@ out_shutdown: } /* - * Check if the current log item was first committed in this sequence. - * We can't rely on just the log item being in the CIL, we have to check - * the recorded commit sequence number. - * - * Note: for this to be used in a non-racy manner, it has to be called with - * CIL flushing locked out. As a result, it should only be used during the - * transaction commit process when deciding what to format into the item. - */ -bool -xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) -{ - struct xfs_cil *cil = lip->li_log->l_cilp; - - if (list_empty(&lip->li_cil)) - return false; - - /* - * li_seq is written on the first commit of a log item to record the - * first checkpoint it is written to. Hence if it is different to the - * current sequence, we're in a new checkpoint. - */ - return lip->li_seq == READ_ONCE(cil->xc_current_sequence); -} - -/* * Perform initial CIL structure initialisation. */ int diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 401cdc400980..67fd9789e69a 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,8 +51,8 @@ enum xlog_iclog_state { /* * In core log flags */ -#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ -#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ +#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */ #define XLOG_ICL_STRINGS \ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ @@ -62,7 +62,7 @@ enum xlog_iclog_state { /* * Log ticket flags */ -#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */ #define XLOG_TIC_FLAGS \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } @@ -142,19 +142,6 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -/* Ticket reservation region accounting */ -#define XLOG_TIC_LEN_MAX 15 - -/* - * Reservation region - * As would be stored in xfs_log_iovec but without the i_addr which - * we don't care about. - */ -typedef struct xlog_res { - uint r_len; /* region length :4 */ - uint r_type; /* region's transaction type :4 */ -} xlog_res_t; - typedef struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ @@ -164,15 +151,7 @@ typedef struct xlog_ticket { int t_unit_res; /* unit reservation in bytes : 4 */ char t_ocnt; /* original count : 1 */ char t_cnt; /* current count : 1 */ - char t_clientid; /* who does this belong to; : 1 */ - char t_flags; /* properties of reservation : 1 */ - - /* reservation array fields */ - uint t_res_num; /* num in array : 4 */ - uint t_res_num_ophdrs; /* num op hdrs : 4 */ - uint t_res_arr_sum; /* array sum : 4 */ - uint t_res_o_flow; /* sum overflow : 4 */ - xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ + uint8_t t_flags; /* properties of reservation : 1 */ } xlog_ticket_t; /* @@ -211,7 +190,7 @@ typedef struct xlog_in_core { u32 ic_offset; enum xlog_iclog_state ic_state; unsigned int ic_flags; - char *ic_datap; /* pointer to iclog data */ + void *ic_datap; /* pointer to iclog data */ struct list_head ic_callbacks; /* reference counts need their own cacheline */ @@ -242,7 +221,6 @@ struct xfs_cil_ctx { xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ - int nvecs; /* number of regions */ int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ @@ -441,10 +419,6 @@ struct xlog { struct xfs_kobj l_kobj; - /* The following field are used for debugging; need to hold icloglock */ -#ifdef DEBUG - void *l_iclog_bak[XLOG_MAX_ICLOGS]; -#endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; @@ -509,27 +483,14 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern struct kmem_cache *xfs_log_ticket_cache; -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int count, - char client, - bool permanent); - -static inline void -xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) -{ - *ptr += bytes; - *len -= bytes; - *off += bytes; -} +struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, + int count, bool permanent); void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, - uint optype); + uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); @@ -690,4 +651,38 @@ xlog_valid_lsn( return valid; } +/* + * Log vector and shadow buffers can be large, so we need to use kvmalloc() here + * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts + * to fall back to vmalloc, so we can't actually do anything useful with gfp + * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() + * will do direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get somethign straight away from the free lists or + * buddy allocator. Hence we have to open code kvmalloc outselves here. + * + * This assumes that the caller uses memalloc_nofs_save task context here, so + * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS + * allocations. This is actually the only way to make vmalloc() do GFP_NOFS + * allocations, so lets just all pretend this is a GFP_KERNEL context + * operation.... + */ +static inline void * +xlog_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c4ad4296c540..97b941c07957 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1800,6 +1800,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_cud_item_ops, &xlog_bui_item_ops, &xlog_bud_item_ops, + &xlog_attri_item_ops, + &xlog_attrd_item_ops, }; static const struct xlog_recover_item_ops * diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bc66d95c8d4c..8f495cc23903 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -27,42 +27,34 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -#define define_xfs_printk_level(func, kern_level) \ -void func(const struct xfs_mount *mp, const char *fmt, ...) \ -{ \ - struct va_format vaf; \ - va_list args; \ - int level; \ - \ - va_start(args, fmt); \ - \ - vaf.fmt = fmt; \ - vaf.va = &args; \ - \ - __xfs_printk(kern_level, mp, &vaf); \ - va_end(args); \ - \ - if (!kstrtoint(kern_level, 0, &level) && \ - level <= LOGLEVEL_ERR && \ - xfs_error_level >= XFS_ERRLEVEL_HIGH) \ - xfs_stack_trace(); \ -} \ - -define_xfs_printk_level(xfs_emerg, KERN_EMERG); -define_xfs_printk_level(xfs_alert, KERN_ALERT); -define_xfs_printk_level(xfs_crit, KERN_CRIT); -define_xfs_printk_level(xfs_err, KERN_ERR); -define_xfs_printk_level(xfs_warn, KERN_WARNING); -define_xfs_printk_level(xfs_notice, KERN_NOTICE); -define_xfs_printk_level(xfs_info, KERN_INFO); -#ifdef DEBUG -define_xfs_printk_level(xfs_debug, KERN_DEBUG); -#endif +void +xfs_printk_level( + const char *kern_level, + const struct xfs_mount *mp, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int level; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(kern_level, mp, &vaf); + + va_end(args); + + if (!kstrtoint(kern_level, 0, &level) && + level <= LOGLEVEL_ERR && + xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} void -xfs_alert_tag( +_xfs_alert_tag( const struct xfs_mount *mp, - int panic_tag, + uint32_t panic_tag, const char *fmt, ...) { struct va_format vaf; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bb9860ec9a93..55ee464ab59f 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -6,33 +6,46 @@ struct xfs_mount; -extern __printf(2, 3) -void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); -extern __printf(2, 3) -void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); +void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, + const char *fmt, ...); +#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \ + xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \ +}) +#define xfs_emerg(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__) +#define xfs_alert(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__) +#define xfs_crit(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__) +#define xfs_err(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__) +#define xfs_warn(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__) +#define xfs_notice(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) +#define xfs_info(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__) #ifdef DEBUG -extern __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#define xfs_debug(mp, fmt, ...) \ + xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) #else -static inline __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) -{ -} +#define xfs_debug(mp, fmt, ...) do {} while (0) #endif +#define xfs_alert_tag(mp, tag, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \ + _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \ +}) + +extern __printf(3, 4) +void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag, + const char *fmt, ...); + #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c5f153c3693f..0c0bcbd4949d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -468,6 +468,8 @@ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { + int error = 0; + /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. @@ -506,11 +508,32 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && - !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) - return 0; + if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || + xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { + error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + if (error) + return error; + } - return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + /* + * Older kernels misused sb_frextents to reflect both incore + * reservations made by running transactions and the actual count of + * free rt extents in the ondisk metadata. Transactions committed + * during runtime can therefore contain a superblock update that + * undercounts the number of free rt extents tracked in the rt bitmap. + * A clean unmount record will have the correct frextents value since + * there can be no other transactions running at that point. + * + * If we're mounting the rt volume after recovering the log, recompute + * frextents from the rtbitmap file to fix the inconsistency. + */ + if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + error = xfs_rtalloc_reinit_frextents(mp); + if (error) + return error; + } + + return 0; } /* @@ -784,11 +807,6 @@ xfs_mountfs( goto out_inodegc_shrinker; } - /* Make sure the summary counts are ok. */ - error = xfs_check_summary_counts(mp); - if (error) - goto out_log_dealloc; - /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -844,6 +862,11 @@ xfs_mountfs( goto out_rele_rip; } + /* Make sure the summary counts are ok. */ + error = xfs_check_summary_counts(mp); + if (error) + goto out_rtunmount; + /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never @@ -1087,24 +1110,33 @@ xfs_fs_writable( return true; } +/* Adjust m_fdblocks or m_frextents. */ int -xfs_mod_fdblocks( +xfs_mod_freecounter( struct xfs_mount *mp, + struct percpu_counter *counter, int64_t delta, bool rsvd) { int64_t lcounter; long long res_used; + uint64_t set_aside = 0; s32 batch; - uint64_t set_aside; + bool has_resv_pool; + + ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); + has_resv_pool = (counter == &mp->m_fdblocks); + if (rsvd) + ASSERT(has_resv_pool); if (delta > 0) { /* * If the reserve pool is depleted, put blocks back into it * first. Most of the time the pool is full. */ - if (likely(mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(&mp->m_fdblocks, delta); + if (likely(!has_resv_pool || + mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(counter, delta); return 0; } @@ -1116,7 +1148,7 @@ xfs_mod_fdblocks( } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(&mp->m_fdblocks, delta); + percpu_counter_add(counter, delta); } spin_unlock(&mp->m_sb_lock); return 0; @@ -1130,7 +1162,7 @@ xfs_mod_fdblocks( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1147,9 +1179,10 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, + if (has_resv_pool) + set_aside = xfs_fdblocks_unavailable(mp); + percpu_counter_add_batch(counter, delta, batch); + if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; @@ -1160,8 +1193,8 @@ xfs_mod_fdblocks( * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(&mp->m_fdblocks, -delta); - if (!rsvd) + percpu_counter_add(counter, -delta); + if (!has_resv_pool || !rsvd) goto fdblocks_enospc; lcounter = (long long)mp->m_resblks_avail + delta; @@ -1178,24 +1211,6 @@ fdblocks_enospc: return -ENOSPC; } -int -xfs_mod_frextents( - struct xfs_mount *mp, - int64_t delta) -{ - int64_t lcounter; - int ret = 0; - - spin_lock(&mp->m_sb_lock); - lcounter = mp->m_sb.sb_frextents + delta; - if (lcounter < 0) - ret = -ENOSPC; - else - mp->m_sb.sb_frextents = lcounter; - spin_unlock(&mp->m_sb_lock); - return ret; -} - /* * Used to free the superblock along various error paths. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f6dc19de8322..8c42786e4942 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,8 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + struct percpu_counter m_frextents; /* free rt extent counter */ + /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging @@ -276,6 +278,7 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(large_extent_counts, NREXT64) /* * Mount features @@ -425,16 +429,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -#define xfs_is_shutdown(mp) xfs_is_shutdown(mp) -void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, +void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) -#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ -#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ -#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ -#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ @@ -494,9 +497,20 @@ xfs_fdblocks_unavailable( return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } -extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, - bool reserved); -extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); +int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + int64_t delta, bool rsvd); + +static inline int +xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +{ + return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline int +xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +{ + return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); +} extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 25991923c1a8..758702b9495f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); /* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f165d1a3de1d..8fc813cb6011 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -582,9 +582,6 @@ xfs_qm_init_timelimits( defq->blk.time = XFS_QM_BTIMELIMIT; defq->ino.time = XFS_QM_ITIMELIMIT; defq->rtb.time = XFS_QM_RTBTIMELIMIT; - defq->blk.warn = XFS_QM_BWARNLIMIT; - defq->ino.warn = XFS_QM_IWARNLIMIT; - defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -608,12 +605,6 @@ xfs_qm_init_timelimits( defq->ino.time = dqp->q_ino.timer; if (dqp->q_rtb.timer) defq->rtb.time = dqp->q_rtb.timer; - if (dqp->q_blk.warnings) - defq->blk.warn = dqp->q_blk.warnings; - if (dqp->q_ino.warnings) - defq->ino.warn = dqp->q_ino.warnings; - if (dqp->q_rtb.warnings) - defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 5bb12717ea28..9683f0457d19 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -34,7 +34,6 @@ struct xfs_quota_limits { xfs_qcnt_t hard; /* default hard limit */ xfs_qcnt_t soft; /* default soft limit */ time64_t time; /* limit for timers */ - xfs_qwarncnt_t warn; /* limit for warnings */ }; /* Defaults for each quota type: time limits, warn limits, usage limits */ @@ -134,10 +133,6 @@ struct xfs_dquot_acct { #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ -#define XFS_QM_BWARNLIMIT 5 -#define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_RTBWARNLIMIT 5 - extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* quota ops */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7d5a31827681..74ac9ca9e119 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -217,8 +217,7 @@ xfs_qm_scall_quotaon( return 0; } -#define XFS_QC_MASK \ - (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK) /* * Adjust limits of this quota, and the defaults if passed in. Returns true @@ -251,17 +250,6 @@ xfs_setqlim_limits( } static inline void -xfs_setqlim_warns( - struct xfs_dquot_res *res, - struct xfs_quota_limits *qlim, - int warns) -{ - res->warnings = warns; - if (qlim) - qlim->warn = warns; -} - -static inline void xfs_setqlim_timer( struct xfs_mount *mp, struct xfs_dquot_res *res, @@ -354,8 +342,6 @@ xfs_qm_scall_setqlim( if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); - if (newlim->d_fieldmask & QC_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); if (newlim->d_fieldmask & QC_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); @@ -370,8 +356,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? &defq->rtb : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); if (newlim->d_fieldmask & QC_RT_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); @@ -386,8 +370,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? &defq->ino : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); - if (newlim->d_fieldmask & QC_INO_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); if (newlim->d_fieldmask & QC_INO_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); @@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_count = dqp->q_ino.reserved; dst->d_spc_timer = dqp->q_blk.timer; dst->d_ino_timer = dqp->q_ino.timer; - dst->d_ino_warns = dqp->q_ino.warnings; - dst->d_spc_warns = dqp->q_blk.warnings; + dst->d_ino_warns = 0; + dst->d_spc_warns = 0; dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = dqp->q_rtb.timer; - dst->d_rt_spc_warns = dqp->q_rtb.warnings; + dst->d_rt_spc_warns = 0; /* * Internally, we don't reset all the timers when quota enforcement diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 07989bd67728..9c162e69976b 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -40,9 +40,9 @@ xfs_qm_fill_state( tstate->spc_timelimit = (u32)defq->blk.time; tstate->ino_timelimit = (u32)defq->ino.time; tstate->rt_spc_timelimit = (u32)defq->rtb.time; - tstate->spc_warnlimit = defq->blk.warn; - tstate->ino_warnlimit = defq->ino.warn; - tstate->rt_spc_warnlimit = defq->rtb.warn; + tstate->spc_warnlimit = 0; + tstate->ino_warnlimit = 0; + tstate->rt_spc_warnlimit = 0; if (tempqip) xfs_irele(ip); } @@ -98,7 +98,7 @@ xfs_quota_type(int type) } } -#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK) /* * Adjust quota timers & warnings diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 0d868c93144d..7e97bf19793d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -35,6 +35,7 @@ STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { + kmem_free(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else @@ -53,10 +54,11 @@ xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) > 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } + if (!atomic_dec_and_test(&cuip->cui_refcount)) + return; + + xfs_trans_ail_delete(&cuip->cui_item, 0); + xfs_cui_item_free(cuip); } @@ -204,14 +206,24 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); + kmem_free(cudp->cud_item.li_lv_shadow); kmem_cache_free(xfs_cud_cache, cudp); } +static struct xfs_log_item * +xfs_cud_item_intent( + struct xfs_log_item *lip) +{ + return &CUD_ITEM(lip)->cud_cuip->cui_item; +} + static const struct xfs_item_ops xfs_cud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_cud_item_size, .iop_format = xfs_cud_item_format, .iop_release = xfs_cud_item_release, + .iop_intent = xfs_cud_item_intent, }; static struct xfs_cud_log_item * @@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update( * 1.) releases the CUI and frees the CUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; @@ -600,6 +612,7 @@ xfs_cui_item_relog( } static const struct xfs_item_ops xfs_cui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 54e68e5693fd..e7a7c00d93be 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -586,21 +586,21 @@ out: STATIC int xfs_reflink_end_cow_extent( struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - xfs_fileoff_t *end_fsb) + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) { - struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - xfs_filblks_t rlen; unsigned int resblks; + int nmaps; int error; /* No COW extents? That's easy! */ if (ifp->if_bytes == 0) { - *end_fsb = offset_fsb; + *offset_fsb = end_fsb; return 0; } @@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_REFLINK_END_COW_CNT); if (error) goto out_cancel; @@ -628,42 +631,66 @@ xfs_reflink_end_cow_extent( * left by the time I/O completes for the loser of the race. In that * case we are done. */ - if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) || - got.br_startoff + got.br_blockcount <= offset_fsb) { - *end_fsb = offset_fsb; + if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; goto out_cancel; } /* - * Structure copy @got into @del, then trim @del to the range that we - * were asked to remap. We preserve @got for the eventual CoW fork - * deletion; from now on @del represents the mapping that we're - * actually remapping. - */ - del = got; - xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb); - - ASSERT(del.br_blockcount > 0); - - /* * Only remap real extents that contain data. With AIO, speculative * preallocations can leak into the range we are called upon, and we - * need to skip them. + * need to skip them. Preserve @got for the eventual CoW fork + * deletion; from now on @del represents the mapping that we're + * actually remapping. */ - if (!xfs_bmap_is_written_extent(&got)) { - *end_fsb = del.br_startoff; - goto out_cancel; + while (!xfs_bmap_is_written_extent(&got)) { + if (!xfs_iext_next_extent(ifp, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; + goto out_cancel; + } } + del = got; - /* Unmap the old blocks in the data fork. */ - rlen = del.br_blockcount; - error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); + /* Grab the corresponding mapping in the data fork. */ + nmaps = 1; + error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, + &nmaps, 0); if (error) goto out_cancel; - /* Trim the extent to whatever got unmapped. */ - xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen); - trace_xfs_reflink_cow_remap(ip, &del); + /* We can only remap the smaller of the two extent sizes. */ + data.br_blockcount = min(data.br_blockcount, del.br_blockcount); + del.br_blockcount = data.br_blockcount; + + trace_xfs_reflink_cow_remap_from(ip, &del); + trace_xfs_reflink_cow_remap_to(ip, &data); + + if (xfs_bmap_is_real_extent(&data)) { + /* + * If the extent we're remapping is backed by storage (written + * or not), unmap the extent and drop its refcount. + */ + xfs_bmap_unmap_extent(tp, ip, &data); + xfs_refcount_decrease_extent(tp, &data); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + -data.br_blockcount); + } else if (data.br_startblock == DELAYSTARTBLOCK) { + int done; + + /* + * If the extent we're remapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = xfs_bunmapi(NULL, ip, data.br_startoff, + data.br_blockcount, 0, 1, &done); + if (error) + goto out_cancel; + ASSERT(done); + } /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); @@ -684,7 +711,7 @@ xfs_reflink_end_cow_extent( return error; /* Update the caller about how much progress we made. */ - *end_fsb = del.br_startoff; + *offset_fsb = del.br_startoff + del.br_blockcount; return 0; out_cancel: @@ -712,7 +739,7 @@ xfs_reflink_end_cow( end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* - * Walk backwards until we're out of the I/O range. The loop function + * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * @@ -744,7 +771,7 @@ xfs_reflink_end_cow( * blocks will be remapped. */ while (end_fsb > offset_fsb && !error) - error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb); + error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb); if (error) trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); @@ -1121,6 +1148,8 @@ xfs_reflink_remap_extent( ++iext_delta; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto out_cancel; @@ -1133,7 +1162,7 @@ xfs_reflink_remap_extent( xfs_refcount_decrease_extent(tp, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { - xfs_filblks_t len = smap.br_blockcount; + int done; /* * If the extent we're unmapping is a delalloc reservation, @@ -1141,10 +1170,11 @@ xfs_reflink_remap_extent( * incore state. Dropping the delalloc reservation takes care * of the quota reservation for us. */ - error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + error = xfs_bunmapi(NULL, ip, smap.br_startoff, + smap.br_blockcount, 0, 1, &done); if (error) goto out_cancel; - ASSERT(len == 0); + ASSERT(done); } /* diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a22b2d19ef91..fef92e02f3bb 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -35,6 +35,7 @@ STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { + kmem_free(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else @@ -53,10 +54,11 @@ xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } + if (!atomic_dec_and_test(&ruip->rui_refcount)) + return; + + xfs_trans_ail_delete(&ruip->rui_item, 0); + xfs_rui_item_free(ruip); } STATIC void @@ -227,14 +229,24 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); + kmem_free(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } +static struct xfs_log_item * +xfs_rud_item_intent( + struct xfs_log_item *lip) +{ + return &RUD_ITEM(lip)->rud_ruip->rui_item; +} + static const struct xfs_item_ops xfs_rud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_rud_item_size, .iop_format = xfs_rud_item_format, .iop_release = xfs_rud_item_release, + .iop_intent = xfs_rud_item_intent, }; static struct xfs_rud_log_item * @@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update( * 1.) releases the RUI and frees the RUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; @@ -630,6 +642,7 @@ xfs_rui_item_relog( } static const struct xfs_item_ops xfs_rui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b8c79ee791af..292d5e54a92c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -806,6 +806,9 @@ xfs_growfs_rt_alloc( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; @@ -1284,6 +1287,44 @@ xfs_rtmount_init( return 0; } +static int +xfs_rtalloc_count_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + uint64_t *valp = priv; + + *valp += rec->ar_extcount; + return 0; +} + +/* + * Reinitialize the number of free realtime extents from the realtime bitmap. + * Callers must ensure that there is no other activity in the filesystem. + */ +int +xfs_rtalloc_reinit_frextents( + struct xfs_mount *mp) +{ + uint64_t val = 0; + int error; + + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, + &val); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_frextents = val; + spin_unlock(&mp->m_sb_lock); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + return 0; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 91b00289509b..62c7ad79cbb6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,6 +22,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_trans *tp, +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, bool *is_free); +int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a276b8111f63..8495ef076ffc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -843,9 +843,11 @@ xfs_fs_statfs( if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + s64 freertx; + statp->f_blocks = sbp->sb_rblocks; - statp->f_bavail = statp->f_bfree = - sbp->sb_frextents * sbp->sb_rextsize; + freertx = percpu_counter_sum_positive(&mp->m_frextents); + statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; } return 0; @@ -1015,8 +1017,14 @@ xfs_init_percpu_counters( if (error) goto free_fdblocks; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc; + return 0; +free_delalloc: + percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: percpu_counter_destroy(&mp->m_fdblocks); free_ifree: @@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters( percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); } static void @@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters( ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); + percpu_counter_destroy(&mp->m_frextents); } static int @@ -1635,6 +1645,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_large_extent_counts(mp)) + xfs_warn(mp, + "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index affbedf78160..4145ba872547 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -226,11 +226,6 @@ xfs_symlink( goto out_trans_cancel; } - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * Allocate an inode for the symlink. */ diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 7692e76ead33..f78ad6b10ea5 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -83,6 +83,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { #ifdef DEBUG int pwork_threads; /* parallel workqueue threads */ + bool larp; /* log attribute replay */ #endif int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 574b80c29fe1..f7faf6e70d7f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -228,6 +228,29 @@ pwork_threads_show( return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); + +static ssize_t +larp_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.larp); + if (ret < 0) + return ret; + return count; +} + +STATIC ssize_t +larp_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); +} +XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ static struct attribute *xfs_dbg_attrs[] = { @@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(always_cow), #ifdef DEBUG ATTR_LIST(pwork_threads), + ATTR_LIST(larp), #endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b141ef78c755..d32026585c1b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __field(unsigned, lockval) __field(unsigned, flags) __field(unsigned long, caller_ip) + __field(const void *, buf_ops) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; @@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; + __entry->buf_ops = bp->b_ops; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " - "lock %d flags %s caller %pS", + "lock %d flags %s bufops %pS caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + __entry->buf_ops, (void *)__entry->caller_ip) ) @@ -1096,22 +1099,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); -#define XFS_QMOPT_FLAGS \ - { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ - { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ - { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ - { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ - { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ - { XFS_QMOPT_INHERIT, "INHERIT" }, \ - { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ - { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ - { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ - { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ - { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ - { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ - { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ - { XFS_QMOPT_RES_INOS, "RES_INOS" } - TRACE_EVENT(xfs_trans_mod_dquot, TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, unsigned int field, int64_t delta), @@ -1348,6 +1335,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), @@ -1924,7 +1914,7 @@ DECLARE_EVENT_CLASS(xfs_da_class, __field(int, namelen) __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -1990,7 +1980,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) __field(unsigned int, attr_flags) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -2097,7 +2087,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, idx) ), TP_fast_assign( @@ -2128,7 +2118,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, src_idx) __field(int, dst_idx) __field(int, count) @@ -2169,7 +2159,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(int, which) __field(xfs_ino_t, ino) __field(int, format) - __field(int, nex) + __field(xfs_extnum_t, nex) __field(int, broot_size) __field(int, fork_off) ), @@ -2182,7 +2172,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), - TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -3418,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -3513,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); -TRACE_EVENT(xfs_trans_resv_calc, +DECLARE_EVENT_CLASS(xfs_trans_resv_class, TP_PROTO(struct xfs_mount *mp, unsigned int type, struct xfs_trans_res *res), TP_ARGS(mp, type, res), @@ -3537,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logres, __entry->logcount, __entry->logflags) +) + +#define DEFINE_TRANS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_trans_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, unsigned int type, \ + struct xfs_trans_res *res), \ + TP_ARGS(mp, type, res)) +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc); +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize); + +TRACE_EVENT(xfs_log_get_max_trans_res, + TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res), + TP_ARGS(mp, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint, logres) + __field(int, logcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->logres = res->tr_logres; + __entry->logcount = res->tr_logcount; + ), + TP_printk("dev %d:%d logres %u logcount %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->logres, + __entry->logcount) ); DECLARE_EVENT_CLASS(xfs_trans_class, @@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); +TRACE_DEFINE_ENUM(XFS_DAS_UNINIT); +TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_DONE); + DECLARE_EVENT_CLASS(xfs_das_state_class, TP_PROTO(int das, struct xfs_inode *ip), TP_ARGS(das, ip), @@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class, __entry->das = das; __entry->ino = ip->i_ino; ), - TP_printk("state change %d ino 0x%llx", - __entry->das, __entry->ino) + TP_printk("state change %s ino 0x%llx", + __print_symbolic(__entry->das, XFS_DAS_STRINGS), + __entry->ino) ) #define DEFINE_DAS_STATE_EVENT(name) \ @@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \ TP_ARGS(das, ip)) DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); + TRACE_EVENT(xfs_force_shutdown, TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0ac717aad380..82cf0189c0db 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -32,7 +32,6 @@ static void xfs_trans_trace_reservations( struct xfs_mount *mp) { - struct xfs_trans_res resv; struct xfs_trans_res *res; struct xfs_trans_res *end_res; int i; @@ -41,8 +40,6 @@ xfs_trans_trace_reservations( end_res = (struct xfs_trans_res *)(M_RES(mp) + 1); for (i = 0; res < end_res; i++, res++) trace_xfs_trans_resv_calc(mp, i, res); - xfs_log_get_max_trans_res(mp, &resv); - trace_xfs_trans_resv_calc(mp, -1, &resv); } #else # define xfs_trans_trace_reservations(mp) @@ -194,11 +191,9 @@ xfs_trans_reserve( ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(mp, - resp->tr_logres, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, - &tp->t_ticket, XFS_TRANSACTION, - permanent); + &tp->t_ticket, permanent); } if (error) @@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas( be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); - if (tp->t_res_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + /* + * Updating frextents requires careful handling because it does not + * behave like the lazysb counters because we cannot rely on log + * recovery in older kenels to recompute the value from the rtbitmap. + * This means that the ondisk frextents must be consistent with the + * rtbitmap. + * + * Therefore, log the frextents change to the ondisk superblock and + * update the incore superblock so that future calls to xfs_log_sb + * write the correct value ondisk. + * + * Don't touch m_frextents because it includes incore reservations, + * and those are handled by the unreserve function. + */ + if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + struct xfs_mount *mp = tp->t_mountp; + int64_t rtxdelta; + + rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; + + spin_lock(&mp->m_sb_lock); + be64_add_cpu(&sbp->sb_frextents, rtxdelta); + mp->m_sb.sb_frextents += rtxdelta; + spin_unlock(&mp->m_sb_lock); + } if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); @@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + if (rtxdelta) { + error = xfs_mod_frextents(mp, rtxdelta); + ASSERT(!error); + } + + if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; /* apply remaining deltas */ @@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; - mp->m_sb.sb_frextents += rtxdelta; + /* + * Do not touch sb_frextents here because we are dealing with incore + * reservation. sb_frextents is not part of the lazy sb counters so it + * must be consistent with the ondisk rtbitmap and must never include + * incore reservations. + */ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 0c82673238f4..9561f193e7e1 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -55,13 +55,15 @@ struct xfs_log_item { #define XFS_LI_IN_AIL 0 #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 -#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_DIRTY 3 +#define XFS_LI_WHITEOUT 4 #define XFS_LI_FLAGS \ - { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ - { (1 << XFS_LI_ABORTED), "ABORTED" }, \ - { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1u << XFS_LI_ABORTED), "ABORTED" }, \ + { (1u << XFS_LI_FAILED), "FAILED" }, \ + { (1u << XFS_LI_DIRTY), "DIRTY" }, \ + { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } struct xfs_item_ops { unsigned flags; @@ -78,30 +80,32 @@ struct xfs_item_ops { bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, struct xfs_trans *tp); + struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; -/* Is this log item a deferred action intent? */ +/* + * Log item ops flags + */ +/* + * Release the log item when the journal commits instead of inserting into the + * AIL for writeback tracking and/or log tail pinning. + */ +#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) +#define XFS_ITEM_INTENT (1 << 1) +#define XFS_ITEM_INTENT_DONE (1 << 2) + static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT; } -/* Is this a log intent-done item? */ static inline bool xlog_item_is_intent_done(struct xfs_log_item *lip) { - return lip->li_ops->iop_unpin == NULL && - lip->li_ops->iop_push == NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT_DONE; } -/* - * Release the log item as soon as committed. This is for items just logging - * intents that never need to be written back in place. - */ -#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) - void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, const struct xfs_item_ops *ops); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9ba7e6b9bed3..aa00cf67ad72 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -597,13 +597,11 @@ xfs_dqresv_check( if (softlimit && total_count > softlimit) { time64_t now = ktime_get_real_seconds(); - if ((res->timer != 0 && now > res->timer) || - (res->warnings != 0 && res->warnings >= qlim->warn)) { + if (res->timer != 0 && now > res->timer) { *fatal = true; return QUOTA_NL_ISOFTLONGWARN; } - res->warnings++; return QUOTA_NL_ISOFTWARN; } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 0d050f8829ef..7a044afd4c46 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,9 +12,9 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" -#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> |