xfs: mark reclaimed inodes invalid earlier

The last thing we do before using call_rcu() on an xfs_inode to be freed is mark it as invalid. This means there is a window between when we know for certain that the inode is going to be freed and when we do actually mark it as "freed". This is important in the context of RCU lookups - we can look up the inode, find that it is valid, and then use it as such, not realising that it is in the final stages of being freed. As such, mark the inode as being invalid the moment we know it is going to be reclaimed. This can be done while we still hold the XFS_ILOCK_EXCL and the flush lock in xfs_reclaim_inode(), meaning that it occurs well before we remove it from the radix tree, and that the i_flags_lock, the XFS_ILOCK and the inode flush lock all act as synchronisation points for detecting that an inode is about to go away. For defensive purposes, this allows us to add a further check to xfs_iflush_cluster() to ensure we skip inodes that are being freed after we grab the XFS_ILOCK_SHARED and the flush lock - if the inode number is valid while we have these locks held, we know that it has not progressed through reclaim to the point where it is clean and is about to be freed. [bfoster: fixed __xfs_inode_clear_reclaim() using ip->i_ino after it had already been zeroed.] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
author: Dave Chinner 2016-05-18 14:09:12 +1000
committer: Dave Chinner 2016-05-18 14:09:12 +1000
commit: 8a17d7ddedb4d9031f046ae0e97c40b46aa69db5 (patch)
tree: 36509f042caa361bc2254ae12fe70f5fe32aa77c /fs
parent: 1f2dcfe89edac4e3bf5b76c56f745191f921fd2a (diff)
2 files changed, 47 insertions, 12 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0c94cde41016..57fcd5917a66 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -114,6 +114,18 @@ xfs_inode_free_callback(
 	kmem_zone_free(xfs_inode_zone, ip);
 }
 
+static void
+__xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!xfs_isiflocked(ip));
+	XFS_STATS_DEC(ip->i_mount, vn_active);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
 void
 xfs_inode_free(
 	struct xfs_inode	*ip)
@@ -129,12 +141,7 @@ xfs_inode_free(
 	ip->i_ino = 0;
 	spin_unlock(&ip->i_flags_lock);
 
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!xfs_isiflocked(ip));
-	XFS_STATS_DEC(ip->i_mount, vn_active);
-
-	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+	__xfs_inode_free(ip);
 }
 
 /*
@@ -772,8 +779,7 @@ __xfs_inode_set_reclaim_tag(
 	if (!pag->pag_ici_reclaimable) {
 		/* propagate the reclaim tag up into the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree, pag->pag_agno,
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
@@ -817,8 +823,7 @@ __xfs_inode_clear_reclaim(
 	if (!pag->pag_ici_reclaimable) {
 		/* clear the reclaim tag from the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree, pag->pag_agno,
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
 	int			sync_mode)
 {
 	struct xfs_buf		*bp = NULL;
+	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
 	int			error;
 
 restart:
@@ -993,6 +999,22 @@ restart:
 
 	xfs_iflock(ip);
 reclaim:
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always appears
+	 * to be reclaimed with an invalid inode number when in the free state.
+	 * We do this as early as possible under the ILOCK and flush lock so
+	 * that xfs_iflush_cluster() can be guaranteed to detect races with us
+	 * here. By doing this, we guarantee that once xfs_iflush_cluster has
+	 * locked both the XFS_ILOCK and the flush lock that it will see either
+	 * a valid, flushable inode that will serialise correctly against the
+	 * locks below, or it will see a clean (and invalid) inode that it can
+	 * skip.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -1006,7 +1028,7 @@ reclaim:
 	 */
 	spin_lock(&pag->pag_ici_lock);
 	if (!radix_tree_delete(&pag->pag_ici_root,
-				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+				XFS_INO_TO_AGINO(ip->i_mount, ino)))
 		ASSERT(0);
 	__xfs_inode_clear_reclaim(pag, ip);
 	spin_unlock(&pag->pag_ici_lock);
@@ -1023,7 +1045,7 @@ reclaim:
 	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-	xfs_inode_free(ip);
+	__xfs_inode_free(ip);
 	return error;
 
 out_ifunlock:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3cbc9031731b..e3b27982b3b2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3239,6 +3239,19 @@ xfs_iflush_cluster(
 			continue;
 		}
 
+
+		/*
+		 * Check the inode number again, just to be certain we are not
+		 * racing with freeing in xfs_reclaim_inode(). See the comments
+		 * in that function for more information as to why the initial
+		 * check is not sufficient.
+		 */
+		if (!iq->i_ino) {
+			xfs_ifunlock(iq);
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+
 		/*
 		 * arriving here means that this inode can be flushed.  First
 		 * re-check that it's dirty before flushing.
author	Dave Chinner	2016-05-18 14:09:12 +1000
committer	Dave Chinner	2016-05-18 14:09:12 +1000
commit	8a17d7ddedb4d9031f046ae0e97c40b46aa69db5 (patch)
tree	36509f042caa361bc2254ae12fe70f5fe32aa77c /fs
parent	1f2dcfe89edac4e3bf5b76c56f745191f921fd2a (diff)