diff options
-rw-r--r-- | fs/xfs/libxfs/xfs_ag.c | 8 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_ag.h | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_icache.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 344 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_log_recover.c | 36 |
6 files changed, 90 insertions, 310 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 71f5dae7ad6c..bb0c700afe3c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -194,7 +194,6 @@ xfs_free_perag( XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -323,10 +322,6 @@ xfs_initialize_perag( if (error) goto out_remove_pag; - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; - /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; @@ -349,8 +344,6 @@ xfs_initialize_perag( mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_hash_destroy: - xfs_buf_hash_destroy(pag); out_remove_pag: radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: @@ -362,7 +355,6 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 75f7c10c110a..517a138faa66 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -103,12 +103,6 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - /* - * Unlinked inode information. This incore information reflects - * data stored in the AGI, so callers must hold the AGI buffer lock - * or have some other means to control concurrency. - */ - struct rhashtable pagi_unlinked_hash; #endif /* __KERNEL__ */ }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2609825d53ee..7f4cc341aacc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -111,6 +111,8 @@ xfs_inode_alloc( INIT_WORK(&ip->i_ioend_work, xfs_end_io); INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; return ip; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7153e3cc2627..63af7680cf73 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1801,197 +1801,22 @@ out: * because we must walk that list to find the inode that points to the inode * being removed from the unlinked hash bucket list. * - * What if we modelled the unlinked list as a collection of records capturing - * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd - * have a fast way to look up unlinked list predecessors, which avoids the - * slow list walk. That's exactly what we do here (in-core) with a per-AG - * rhashtable. + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. * - * Because this is a backref cache, we ignore operational failures since the - * iunlink code can fall back to the slow bucket walk. The only errors that - * should bubble out are for obviously incorrect situations. - * - * All users of the backref cache MUST hold the AGI buffer lock to serialize - * access or have otherwise provided for concurrency control. + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. */ -/* Capture a "X.next_unlinked = Y" relationship. */ -struct xfs_iunlink { - struct rhash_head iu_rhash_head; - xfs_agino_t iu_agino; /* X */ - xfs_agino_t iu_next_unlinked; /* Y */ -}; - -/* Unlinked list predecessor lookup hashtable construction */ -static int -xfs_iunlink_obj_cmpfn( - struct rhashtable_compare_arg *arg, - const void *obj) -{ - const xfs_agino_t *key = arg->key; - const struct xfs_iunlink *iu = obj; - - if (iu->iu_next_unlinked != *key) - return 1; - return 0; -} - -static const struct rhashtable_params xfs_iunlink_hash_params = { - .min_size = XFS_AGI_UNLINKED_BUCKETS, - .key_len = sizeof(xfs_agino_t), - .key_offset = offsetof(struct xfs_iunlink, - iu_next_unlinked), - .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), - .automatic_shrinking = true, - .obj_cmpfn = xfs_iunlink_obj_cmpfn, -}; - -/* - * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such - * relation is found. - */ -static xfs_agino_t -xfs_iunlink_lookup_backref( - struct xfs_perag *pag, - xfs_agino_t agino) -{ - struct xfs_iunlink *iu; - - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - return iu ? iu->iu_agino : NULLAGINO; -} - -/* - * Take ownership of an iunlink cache entry and insert it into the hash table. - * If successful, the entry will be owned by the cache; if not, it is freed. - * Either way, the caller does not own @iu after this call. - */ -static int -xfs_iunlink_insert_backref( - struct xfs_perag *pag, - struct xfs_iunlink *iu) -{ - int error; - - error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - /* - * Fail loudly if there already was an entry because that's a sign of - * corruption of in-memory data. Also fail loudly if we see an error - * code we didn't anticipate from the rhashtable code. Currently we - * only anticipate ENOMEM. - */ - if (error) { - WARN(error != -ENOMEM, "iunlink cache insert error %d", error); - kmem_free(iu); - } - /* - * Absorb any runtime errors that aren't a result of corruption because - * this is a cache and we can always fall back to bucket list scanning. - */ - if (error != 0 && error != -EEXIST) - error = 0; - return error; -} - -/* Remember that @prev_agino.next_unlinked = @this_agino. */ -static int -xfs_iunlink_add_backref( - struct xfs_perag *pag, - xfs_agino_t prev_agino, - xfs_agino_t this_agino) -{ - struct xfs_iunlink *iu; - - if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) - return 0; - - iu = kmem_zalloc(sizeof(*iu), KM_NOFS); - iu->iu_agino = prev_agino; - iu->iu_next_unlinked = this_agino; - - return xfs_iunlink_insert_backref(pag, iu); -} - -/* - * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. - * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there - * wasn't any such entry then we don't bother. - */ -static int -xfs_iunlink_change_backref( - struct xfs_perag *pag, - xfs_agino_t agino, - xfs_agino_t next_unlinked) -{ - struct xfs_iunlink *iu; - int error; - - /* Look up the old entry; if there wasn't one then exit. */ - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - if (!iu) - return 0; - - /* - * Remove the entry. This shouldn't ever return an error, but if we - * couldn't remove the old entry we don't want to add it again to the - * hash table, and if the entry disappeared on us then someone's - * violated the locking rules and we need to fail loudly. Either way - * we cannot remove the inode because internal state is or would have - * been corrupt. - */ - error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - if (error) - return error; - - /* If there is no new next entry just free our item and return. */ - if (next_unlinked == NULLAGINO) { - kmem_free(iu); - return 0; - } - - /* Update the entry and re-add it to the hash table. */ - iu->iu_next_unlinked = next_unlinked; - return xfs_iunlink_insert_backref(pag, iu); -} - -/* Set up the in-core predecessor structures. */ -int -xfs_iunlink_init( - struct xfs_perag *pag) -{ - return rhashtable_init(&pag->pagi_unlinked_hash, - &xfs_iunlink_hash_params); -} - -/* Free the in-core predecessor structures. */ -static void -xfs_iunlink_free_item( - void *ptr, - void *arg) -{ - struct xfs_iunlink *iu = ptr; - bool *freed_anything = arg; - - *freed_anything = true; - kmem_free(iu); -} - -void -xfs_iunlink_destroy( - struct xfs_perag *pag) -{ - bool freed_anything = false; - - rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, - xfs_iunlink_free_item, &freed_anything); - - ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); -} - /* * Find an inode on the unlinked list. This does not take references to the * inode as we have existence guarantees by holding the AGI buffer lock and that @@ -2021,6 +1846,26 @@ xfs_iunlink_lookup( return ip; } +/* Update the prev pointer of the next agino. */ +static int +xfs_iunlink_update_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_inode *ip; + + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) + return 0; + + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -EFSCORRUPTED; + ip->i_prev_unlinked = prev_agino; + return 0; +} + /* * Point the AGI unlinked bucket at an inode and log the results. The caller * is responsible for validating the old value. @@ -2172,6 +2017,14 @@ xfs_iunlink_insert_inode( return -EFSCORRUPTED; } + /* + * Update the prev pointer in the next inode to point back to this + * inode. + */ + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error) + return error; + if (next_agino != NULLAGINO) { xfs_agino_t old_agino; @@ -2185,14 +2038,6 @@ xfs_iunlink_insert_inode( return error; ASSERT(old_agino == NULLAGINO); ip->i_next_unlinked = next_agino; - - /* - * agino has been unlinked, add a backref from the next inode - * back to agino. - */ - error = xfs_iunlink_add_backref(pag, agino, next_agino); - if (error) - return error; } /* Point the head of the list to point to this inode. */ @@ -2233,61 +2078,6 @@ out: return error; } -/* - * Walk the unlinked chain from @head_agino until we find the inode that - * points to @target_agino. Return the inode number, map, dinode pointer, - * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. - * - * @tp, @pag, @head_agino, and @target_agino are input parameters. - * @agino, @imap, @dipp, and @bpp are all output parameters. - * - * Do not call this function if @target_agino is the head of the list. - */ -static int -xfs_iunlink_lookup_prev( - struct xfs_perag *pag, - xfs_agino_t head_agino, - xfs_agino_t target_agino, - struct xfs_inode **ipp) -{ - struct xfs_inode *ip; - xfs_agino_t next_agino; - - *ipp = NULL; - - next_agino = xfs_iunlink_lookup_backref(pag, target_agino); - if (next_agino != NULLAGINO) { - ip = xfs_iunlink_lookup(pag, next_agino); - if (ip && ip->i_next_unlinked == target_agino) { - *ipp = ip; - return 0; - } - } - - /* Otherwise, walk the entire bucket until we find it. */ - next_agino = head_agino; - while (next_agino != NULLAGINO) { - ip = xfs_iunlink_lookup(pag, next_agino); - if (!ip) - return -EFSCORRUPTED; - - /* - * Make sure this pointer is valid and isn't an obvious - * infinite loop. - */ - if (!xfs_verify_agino(pag, ip->i_next_unlinked) || - next_agino == ip->i_next_unlinked) - return -EFSCORRUPTED; - - if (ip->i_next_unlinked == target_agino) { - *ipp = ip; - return 0; - } - next_agino = ip->i_next_unlinked; - } - return -EFSCORRUPTED; -} - static int xfs_iunlink_remove_inode( struct xfs_trans *tp, @@ -2326,51 +2116,33 @@ xfs_iunlink_remove_inode( return error; /* - * If there was a backref pointing from the next inode back to this - * one, remove it because we've removed this inode from the list. - * - * Later, if this inode was in the middle of the list we'll update - * this inode's backref to point from the next inode. + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. */ - if (next_agino != NULLAGINO) { - error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO); - if (error) - return error; - } + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; if (head_agino != agino) { struct xfs_inode *prev_ip; - error = xfs_iunlink_lookup_prev(pag, head_agino, agino, - &prev_ip); - if (error) - return error; - - /* Point the previous inode on the list to the next inode. */ - error = xfs_iunlink_update_inode(tp, prev_ip, pag, next_agino, - NULL); - if (error) - return error; + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) + return -EFSCORRUPTED; + error = xfs_iunlink_update_inode(tp, prev_ip, pag, + ip->i_next_unlinked, NULL); prev_ip->i_next_unlinked = ip->i_next_unlinked; - ip->i_next_unlinked = NULLAGINO; - - /* - * Now we deal with the backref for this inode. If this inode - * pointed at a real inode, change the backref that pointed to - * us to point to our old next. If this inode was the end of - * the list, delete the backref that pointed to us. Note that - * change_backref takes care of deleting the backref if - * next_agino is NULLAGINO. - */ - return xfs_iunlink_change_backref(agibp->b_pag, agino, - next_agino); + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); } - /* Point the head of the list to the next unlinked inode. */ ip->i_next_unlinked = NULLAGINO; - return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, - next_agino); + ip->i_prev_unlinked = NULLAGINO; + return error; } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8e2a33c6cbe2..8d8cce61e5ba 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -70,6 +70,7 @@ typedef struct xfs_inode { /* unlinked list pointers */ xfs_agino_t i_next_unlinked; + xfs_agino_t i_prev_unlinked; /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -508,9 +509,6 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); -int xfs_iunlink_init(struct xfs_perag *pag); -void xfs_iunlink_destroy(struct xfs_perag *pag); - void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8d28487813b0..9e0e7ff76e02 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2674,28 +2674,50 @@ xlog_recover_iunlink_bucket( int bucket) { struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *prev_ip = NULL; struct xfs_inode *ip; - xfs_agino_t agino; + xfs_agino_t prev_agino, agino; + int error = 0; agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - int error; - error = xfs_iget(mp, NULL, XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), 0, 0, &ip); if (error) - return error;; + break; ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); xfs_iflags_clear(ip, XFS_IRECOVERY); agino = ip->i_next_unlinked; - xfs_irele(ip); - cond_resched(); + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + + /* + * Ensure the inode is removed from the unlinked list + * before we continue so that it won't race with + * building the in-memory list here. This could be + * serialised with the agibp lock, but that just + * serialises via lockstepping and it's much simpler + * just to flush the inodegc queue and wait for it to + * complete. + */ + xfs_inodegc_flush(mp); + } + + prev_agino = agino; + prev_ip = ip; } - return 0; + + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + } + xfs_inodegc_flush(mp); + return error; } /* |