author     Dave Chinner  2015-06-01 10:51:38 +1000
committer  Dave Chinner  2015-06-01 10:51:38 +1000
commit     b9a350a1183efd7b63e59b6eaa39abfea908d0be (patch)
tree       34144351b97b4dc749b6f6b003821af6c6a30824 /fs/xfs/libxfs
parent     e01c025fbdd5584bc2c8f6b88cb014f5f9bd790f (diff)
parent     22ce1e1472fda6ce740cee966bb8e25a3cc662bd (diff)
Merge branch 'xfs-sparse-inode' into for-next
Diffstat (limited to 'fs/xfs/libxfs')
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c         |  42
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h         |   2
-rw-r--r--  fs/xfs/libxfs/xfs_format.h        |  48
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h            |   1
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c        | 541
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h        |  12
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c  |  93
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h  |  10
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c            |  30
9 files changed, 715 insertions(+), 64 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 516162be1398..bc78ac08e72e 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
 {
 	xfs_agblock_t	bno;
 	xfs_extlen_t	len;
+	xfs_extlen_t	diff;
 
 	/* Trim busy sections out of found extent */
 	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
 
+	/*
+	 * If we have a largish extent that happens to start before min_agbno,
+	 * see if we can shift it into range...
+	 */
+	if (bno < args->min_agbno && bno + len > args->min_agbno) {
+		diff = args->min_agbno - bno;
+		if (len > diff) {
+			bno += diff;
+			len -= diff;
+		}
+	}
+
 	if (args->alignment > 1 && len >= args->minlen) {
 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
-		xfs_extlen_t	diff = aligned_bno - bno;
+
+		diff = aligned_bno - bno;
 
 		*resbno = aligned_bno;
 		*reslen = diff >= len ? 0 : len - diff;
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
 	 * The good extent is closer than this one.
 	 */
 	if (!dir) {
+		if (*sbnoa > args->max_agbno)
+			goto out_use_good;
 		if (*sbnoa >= args->agbno + gdiff)
 			goto out_use_good;
 	} else {
+		if (*sbnoa < args->min_agbno)
+			goto out_use_good;
 		if (*sbnoa <= args->agbno - gdiff)
 			goto out_use_good;
 	}
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
 	dofirst = prandom_u32() & 1;
 #endif
 
+	/* handle uninitialized agbno range so caller doesn't have to */
+	if (!args->min_agbno && !args->max_agbno)
+		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+	ASSERT(args->min_agbno <= args->max_agbno);
+
+	/* clamp agbno to the range if it's outside */
+	if (args->agbno < args->min_agbno)
+		args->agbno = args->min_agbno;
+	if (args->agbno > args->max_agbno)
+		args->agbno = args->max_agbno;
+
 restart:
 	bno_cur_lt = NULL;
 	bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
 					&ltbnoa, &ltlena);
 			if (ltlena < args->minlen)
 				continue;
+			if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			xfs_alloc_compute_aligned(args, ltbno, ltlen,
 						  &ltbnoa, &ltlena);
-			if (ltlena >= args->minlen)
+			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
 				goto error0;
-			if (!i) {
+			if (!i || ltbnoa < args->min_agbno) {
 				xfs_btree_del_cursor(bno_cur_lt,
 						     XFS_BTREE_NOERROR);
 				bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			xfs_alloc_compute_aligned(args, gtbno, gtlen,
 						  &gtbnoa, &gtlena);
-			if (gtlena >= args->minlen)
+			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
 				goto error0;
-			if (!i) {
+			if (!i || gtbnoa > args->max_agbno) {
 				xfs_btree_del_cursor(bno_cur_gt,
 						     XFS_BTREE_NOERROR);
 				bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
 	ASSERT(ltnew >= ltbno);
 	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
 	args->agbno = ltnew;
 
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a5c894..29f27b272b7f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
 	xfs_extlen_t	total;		/* total blocks needed in xaction */
 	xfs_extlen_t	alignment;	/* align answer to multiple of this */
 	xfs_extlen_t	minalignslop;	/* slop for minlen+alignment calcs */
+	xfs_agblock_t	min_agbno;	/* set an agbno range for NEAR allocs */
+	xfs_agblock_t	max_agbno;	/* ... */
 	xfs_extlen_t	len;		/* output: actual size of extent */
 	xfs_alloctype_t	type;		/* allocation type XFS_ALLOCTYPE_... */
 	xfs_alloctype_t	otype;		/* original allocation type */
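The min_agbno/max_agbno range added above constrains NEAR allocations so sparse
inode chunks never land at agbno 0 or run off the end of the AG. For
illustration, a minimal standalone C sketch of the clamping step in
xfs_alloc_compute_aligned() — the type and function names below are simplified
stand-ins, not the kernel API:

	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t agblock_t;
	typedef uint32_t extlen_t;

	/* shift an extent [bno, bno+len) up into range if it starts too low */
	static void clamp_to_min_agbno(agblock_t *bno, extlen_t *len,
				       agblock_t min_agbno)
	{
		if (*bno < min_agbno && *bno + *len > min_agbno) {
			extlen_t diff = min_agbno - *bno;

			if (*len > diff) {	/* only if something is left */
				*bno += diff;
				*len -= diff;
			}
		}
	}

	int main(void)
	{
		agblock_t bno = 4;	/* starts below min_agbno = 8 */
		extlen_t len = 16;

		clamp_to_min_agbno(&bno, &len, 8);
		printf("bno=%u len=%u\n", bno, len);	/* bno=8 len=12 */
		return 0;
	}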
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index ff22a4d9ad0c..815f61b02bc1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
 	__uint32_t	sb_features_log_incompat;
 
 	__uint32_t	sb_crc;		/* superblock crc */
-	__uint32_t	sb_pad;
+	xfs_extlen_t	sb_spino_align;	/* sparse inode chunk alignment */
 
 	xfs_ino_t	sb_pquotino;	/* project quota inode */
 	xfs_lsn_t	sb_lsn;		/* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
 	__be32		sb_features_log_incompat;
 
 	__le32		sb_crc;		/* superblock crc */
-	__be32		sb_pad;
+	__be32		sb_spino_align;	/* sparse inode chunk alignment */
 
 	__be64		sb_pquotino;	/* project quota inode */
 	__be64		sb_lsn;		/* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
 }
 
 #define XFS_SB_FEAT_INCOMPAT_FTYPE	(1 << 0)	/* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES	(1 << 1)	/* sparse inode chunks */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
-		(XFS_SB_FEAT_INCOMPAT_FTYPE)
+		(XFS_SB_FEAT_INCOMPAT_FTYPE|	\
+		 XFS_SB_FEAT_INCOMPAT_SPINODES)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
 		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
 }
 
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
 /*
  * end of superblock version macros
  */
@@ -1216,26 +1224,54 @@ typedef	__uint64_t	xfs_inofree_t;
 #define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
 #define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
 
+#define XFS_INOBT_HOLEMASK_FULL		0	/* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS		(NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT	\
+	(XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
 	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
 }
 
 /*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes with the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
 */
typedef struct xfs_inobt_rec {
	__be32		ir_startino;	/* starting inode number */
-	__be32		ir_freecount;	/* count of free inodes (set bits) */
+	union {
+		struct {
+			__be32	ir_freecount;	/* count of free inodes */
+		} f;
+		struct {
+			__be16	ir_holemask;	/* hole mask for sparse chunks */
+			__u8	ir_count;	/* total inode count */
+			__u8	ir_freecount;	/* count of free inodes */
+		} sp;
+	} ir_u;
	__be64		ir_free;	/* free inode mask */
} xfs_inobt_rec_t;

typedef struct xfs_inobt_rec_incore {
	xfs_agino_t	ir_startino;	/* starting inode number */
-	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
+	__uint16_t	ir_holemask;	/* hole mask for sparse chunks */
+	__uint8_t	ir_count;	/* total inode count */
+	__uint8_t	ir_freecount;	/* count of free inodes (set bits) */
	xfs_inofree_t	ir_free;	/* free inode mask */
} xfs_inobt_rec_incore_t;

+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+	/* non-zero holemask represents a sparse rec. */
+	return holemask;
+}
+
 /*
  * Key structure
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..89689c6a43e2 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_V5SB	0x8000	/* version 5 superblock */
 #define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks */
 
 /*
  * Minimum and maximum sizes need for growth checks.
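To make the new record format concrete: with the usual 64 inodes per chunk and
a 16-bit ir_holemask, each holemask bit covers 64 / 16 = 4 inodes, and a
holemask of zero (XFS_INOBT_HOLEMASK_FULL) describes a fully allocated chunk.
A standalone sketch of that arithmetic, with the constants inlined rather than
taken from the kernel headers:

	#include <stdint.h>
	#include <stdio.h>

	#define INODES_PER_CHUNK	64
	#define HOLEMASK_BITS		16
	#define INODES_PER_HOLEMASK_BIT	(INODES_PER_CHUNK / HOLEMASK_BITS)

	int main(void)
	{
		/* low 8 bits set: inodes 0-31 are a hole in this chunk */
		uint16_t holemask = 0x00ff;
		int i;

		for (i = 0; i < HOLEMASK_BITS; i++)
			printf("inodes %2d-%2d: %s\n",
			       i * INODES_PER_HOLEMASK_BIT,
			       (i + 1) * INODES_PER_HOLEMASK_BIT - 1,
			       (holemask & (1u << i)) ? "hole" : "allocated");
		return 0;
	}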
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 1c9e75521250..a18bc75cc216 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
 	int			*stat)	/* success/failure */
 {
 	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_holemask = 0;
+	cur->bc_rec.i.ir_count = 0;
 	cur->bc_rec.i.ir_freecount = 0;
 	cur->bc_rec.i.ir_free = 0;
 	return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
 	union xfs_btree_rec	rec;
 
 	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
-	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+		rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+		rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+		rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+	} else {
+		/* ir_holemask/ir_count not supported on-disk */
+		rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	}
 	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
 	return xfs_btree_update(cur, &rec);
 }
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
 	int			error;
 
 	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (!error && *stat == 1) {
-		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
-		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
-		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+	if (error || *stat == 0)
+		return error;
+
+	irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+	if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+		irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+		irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+		irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+	} else {
+		/*
+		 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+		 * values for full inode chunks.
+		 */
+		irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+		irec->ir_count = XFS_INODES_PER_CHUNK;
+		irec->ir_freecount =
+				be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
 	}
-	return error;
+	irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+	return 0;
 }
 
 /*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
 STATIC int
 xfs_inobt_insert_rec(
 	struct xfs_btree_cur	*cur,
+	__uint16_t		holemask,
+	__uint8_t		count,
 	__int32_t		freecount,
 	xfs_inofree_t		free,
 	int			*stat)
 {
+	cur->bc_rec.i.ir_holemask = holemask;
+	cur->bc_rec.i.ir_count = count;
 	cur->bc_rec.i.ir_freecount = freecount;
 	cur->bc_rec.i.ir_free = free;
 	return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
 		}
 		ASSERT(i == 0);
 
-		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+		error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+					     XFS_INODES_PER_CHUNK,
+					     XFS_INODES_PER_CHUNK,
 					     XFS_INOBT_ALL_FREE, &i);
 		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
 	struct list_head	*buffer_list,
+	int			icount,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno,
 	xfs_agblock_t		length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
 		 * they track in the AIL as if they were physically logged.
 		 */
 		if (tp)
-			xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+			xfs_icreate_log(tp, agno, agbno, icount,
 					mp->m_sb.sb_inodesize, length, gen);
 	} else
 		version = 2;
@@ -347,6 +378,214 @@ xfs_ialloc_inode_init(
 }
 
 /*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+	struct xfs_mount		*mp,
+	xfs_agino_t			*startino,
+	uint16_t			*allocmask)
+{
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			mod;
+	int				offset;
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+	mod = agbno % mp->m_sb.sb_inoalignmt;
+	if (!mod)
+		return;
+
+	/* calculate the inode offset and align startino */
+	offset = mod << mp->m_sb.sb_inopblog;
+	*startino -= offset;
+
+	/*
+	 * Since startino has been aligned down, left shift allocmask such that
+	 * it continues to represent the same physical inodes relative to the
+	 * new startino.
+	 */
+	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
+	struct xfs_inobt_rec_incore	*srec)	/* src record */
+{
+	uint64_t			talloc;
+	uint64_t			salloc;
+
+	/* records must cover the same inode range */
+	if (trec->ir_startino != srec->ir_startino)
+		return false;
+
+	/* both records must be sparse */
+	if (!xfs_inobt_issparse(trec->ir_holemask) ||
+	    !xfs_inobt_issparse(srec->ir_holemask))
+		return false;
+
+	/* both records must track some inodes */
+	if (!trec->ir_count || !srec->ir_count)
+		return false;
+
+	/* can't exceed capacity of a full record */
+	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+		return false;
+
+	/* verify there is no allocation overlap */
+	talloc = xfs_inobt_irec_to_allocmask(trec);
+	salloc = xfs_inobt_irec_to_allocmask(srec);
+	if (talloc & salloc)
+		return false;
+
+	return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* target */
+	struct xfs_inobt_rec_incore	*srec)	/* src */
+{
+	ASSERT(trec->ir_startino == srec->ir_startino);
+
+	/* combine the counts */
+	trec->ir_count += srec->ir_count;
+	trec->ir_freecount += srec->ir_freecount;
+
+	/*
+	 * Merge the holemask and free mask. For both fields, 0 bits refer to
+	 * allocated inodes. We combine the allocated ranges with bitwise AND.
+	 */
+	trec->ir_holemask &= srec->ir_holemask;
+	trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	int				btnum,
+	struct xfs_inobt_rec_incore	*nrec,	/* in/out: new/merged rec. */
+	bool				merge)	/* merge or replace */
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	int				error;
+	int				i;
+	struct xfs_inobt_rec_incore	rec;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	/* the new record is pre-aligned so we know where to look */
+	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	/* if nothing there, insert a new record and return */
+	if (i == 0) {
+		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+					     nrec->ir_count, nrec->ir_freecount,
+					     nrec->ir_free, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+		goto out;
+	}
+
+	/*
+	 * A record exists at this startino. Merge or replace the record
+	 * depending on what we've been asked to do.
+	 */
+	if (merge) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+					rec.ir_startino == nrec->ir_startino,
+					error);
+
+		/*
+		 * This should never fail. If we have coexisting records that
+		 * cannot merge, something is seriously wrong.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+					error);
+
+		trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+					 rec.ir_holemask, nrec->ir_startino,
+					 nrec->ir_holemask);
+
+		/* merge to nrec to output the updated record */
+		__xfs_inobt_rec_merge(nrec, &rec);
+
+		trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+					  nrec->ir_holemask);
+
+		error = xfs_inobt_rec_check_count(mp, nrec);
+		if (error)
+			goto error;
+	}
+
+	error = xfs_inobt_update(cur, nrec);
+	if (error)
+		goto error;
+
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
  */
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	newlen;		/* new number of inodes */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	uint16_t	allocmask = (uint16_t) -1; /* init. to full chunk */
+	struct xfs_inobt_rec_incore rec;
 	struct xfs_perag *pag;
+	int		do_sparse = 0;
+
+#ifdef DEBUG
+	/* randomly do sparse inode allocations */
+	if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb))
+		do_sparse = prandom_u32() & 1;
+#endif
 
 	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
+	args.fsbno = NULLFSBLOCK;
 
 	/*
 	 * Locking will ensure that we don't have two callers in here
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
 	agno = be32_to_cpu(agi->agi_seqno);
 	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 		     args.mp->m_ialloc_blks;
+	if (do_sparse)
+		goto sparse_alloc;
 	if (likely(newino != NULLAGINO &&
 		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
 		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
 		 * subsequent requests.
 		 */
 		args.minalignslop = 0;
-	} else
-		args.fsbno = NULLFSBLOCK;
+	}
 
 	if (unlikely(args.fsbno == NULLFSBLOCK)) {
 		/*
@@ -480,6 +731,46 @@ xfs_ialloc_ag_alloc(
 			return error;
 	}
 
+	/*
+	 * Finally, try a sparse allocation if the filesystem supports it and
+	 * the sparse allocation length is smaller than a full chunk.
+	 */
+	if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+	    args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = args.mp->m_sb.sb_spino_align;
+		args.prod = 1;
+
+		args.minlen = args.mp->m_ialloc_min_blks;
+		args.maxlen = args.minlen;
+
+		/*
+		 * The inode record will be aligned to full chunk size. We must
+		 * prevent sparse allocation from AG boundaries that result in
+		 * invalid inode records, such as records that start at agbno 0
+		 * or extend beyond the AG.
+		 *
+		 * Set min agbno to the first aligned, non-zero agbno and max to
+		 * the last aligned agbno that is at least one full chunk from
+		 * the end of the AG.
+		 */
+		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+					    args.mp->m_sb.sb_inoalignmt) -
+				 args.mp->m_ialloc_blks;
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+
+		newlen = args.len << args.mp->m_sb.sb_inopblog;
+		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+	}
+
 	if (args.fsbno == NULLFSBLOCK) {
 		*alloc = 0;
 		return 0;
@@ -495,8 +786,8 @@ xfs_ialloc_ag_alloc(
 	 * rather than a linear progression to prevent the next generation
 	 * number from being easily guessable.
 	 */
-	error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
-			args.len, prandom_u32());
+	error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+			args.agbno, args.len, prandom_u32());
 
 	if (error)
 		return error;
@@ -504,6 +795,73 @@ xfs_ialloc_ag_alloc(
 	 * Convert the results.
 	 */
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+	if (xfs_inobt_issparse(~allocmask)) {
+		/*
+		 * We've allocated a sparse chunk. Align the startino and mask.
+		 */
+		xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+		rec.ir_startino = newino;
+		rec.ir_holemask = ~allocmask;
+		rec.ir_count = newlen;
+		rec.ir_freecount = newlen;
+		rec.ir_free = XFS_INOBT_ALL_FREE;
+
+		/*
+		 * Insert the sparse record into the inobt and allow for a merge
+		 * if necessary. If a merge does occur, rec is updated to the
+		 * merged record.
+		 */
+		error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+					       &rec, true);
+		if (error == -EFSCORRUPTED) {
+			xfs_alert(args.mp,
+	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+				  XFS_AGINO_TO_INO(args.mp, agno,
+						   rec.ir_startino),
+				  rec.ir_holemask, rec.ir_count);
+			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+		}
+		if (error)
+			return error;
+
+		/*
+		 * We can't merge the part we've just allocated as for the inobt
+		 * due to finobt semantics. The original record may or may not
+		 * exist independent of whether physical inodes exist in this
+		 * sparse chunk.
+		 *
+		 * We must update the finobt record based on the inobt record.
+		 * rec contains the fully merged and up to date inobt record
+		 * from the previous call. Set merge false to replace any
+		 * existing record with this one.
+		 */
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+						       XFS_BTNUM_FINO, &rec,
+						       false);
+			if (error)
+				return error;
+		}
+	} else {
+		/* full chunk - insert new records to both btrees */
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_INO);
+		if (error)
+			return error;
+
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+						 newlen, XFS_BTNUM_FINO);
+			if (error)
+				return error;
+		}
+	}
+
+	/*
+	 * Update AGI counts and newino.
+	 */
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
 	pag = xfs_perag_get(args.mp, agno);
@@ -512,20 +870,6 @@ xfs_ialloc_ag_alloc(
 	agi->agi_newino = cpu_to_be32(newino);
 
 	/*
-	 * Insert records describing the new inode chunk into the btrees.
-	 */
-	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-				 XFS_BTNUM_INO);
-	if (error)
-		return error;
-
-	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-					 XFS_BTNUM_FINO);
-		if (error)
-			return error;
-	}
-
-	/*
 	 * Log allocation group header fields
 	 */
 	xfs_ialloc_log_agi(tp, agbp,
@@ -645,7 +989,7 @@ xfs_ialloc_ag_select(
 		 * if we fail allocation due to alignment issues then it is most
 		 * likely a real ENOSPC condition.
 		 */
-		ineed = mp->m_ialloc_blks;
+		ineed = mp->m_ialloc_min_blks;
 		if (flags && ineed > 1)
 			ineed += xfs_ialloc_cluster_alignment(mp);
 		longest = pag->pagf_longest;
@@ -732,6 +1076,27 @@ xfs_ialloc_get_rec(
 }
 
 /*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+	struct xfs_inobt_rec_incore	*rec)
+{
+	xfs_inofree_t			realfree;
+
+	/* if there are no holes, return the first available offset */
+	if (!xfs_inobt_issparse(rec->ir_holemask))
+		return xfs_lowbit64(rec->ir_free);
+
+	realfree = xfs_inobt_irec_to_allocmask(rec);
+	realfree &= rec->ir_free;
+
+	return xfs_lowbit64(realfree);
+}
+
+/*
  * Allocate an inode using the inobt-only algorithm.
  */
 STATIC int
@@ -961,7 +1326,7 @@ newino:
 	}
 
 alloc_inode:
-	offset = xfs_lowbit64(rec.ir_free);
+	offset = xfs_inobt_first_free_inode(&rec);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1575,7 @@ xfs_dialloc_ag(
 	if (error)
 		goto error_cur;
 
-	offset = xfs_lowbit64(rec.ir_free);
+	offset = xfs_inobt_first_free_inode(&rec);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1804,83 @@ out_error:
 	return error;
 }
 
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*rec,
+	struct xfs_bmap_free		*flist)
+{
+	xfs_agblock_t	sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+	int		startidx, endidx;
+	int		nextbit;
+	xfs_agblock_t	agbno;
+	int		contigblk;
+	DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+	if (!xfs_inobt_issparse(rec->ir_holemask)) {
+		/* not sparse, calculate extent info directly */
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+				  XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+				  mp->m_ialloc_blks, flist, mp);
+		return;
+	}
+
+	/* holemask is only 16-bits (fits in an unsigned long) */
+	ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+	holemask[0] = rec->ir_holemask;
+
+	/*
+	 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+	 * holemask and convert the start/end index of each range to an extent.
+	 * We start with the start and end index both pointing at the first 0 in
+	 * the mask.
+	 */
+	startidx = endidx = find_first_zero_bit(holemask,
+						XFS_INOBT_HOLEMASK_BITS);
+	nextbit = startidx + 1;
+	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+					     nextbit);
+		/*
+		 * If the next zero bit is contiguous, update the end index of
+		 * the current range and continue.
+		 */
+		if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+		    nextbit == endidx + 1) {
+			endidx = nextbit;
+			goto next;
+		}
+
+		/*
+		 * nextbit is not contiguous with the current end index. Convert
+		 * the current start/end to an extent and add it to the free
+		 * list.
+		 */
+		agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+				  mp->m_sb.sb_inopblock;
+		contigblk = ((endidx - startidx + 1) *
+			     XFS_INODES_PER_HOLEMASK_BIT) /
+			    mp->m_sb.sb_inopblock;
+
+		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+				  flist, mp);
+
+		/* reset range to current bit and carry on... */
+		startidx = endidx = nextbit;
+
+next:
+		nextbit++;
+	}
+}
+
 STATIC int
 xfs_difree_inobt(
 	struct xfs_mount		*mp,
@@ -1446,8 +1888,7 @@ xfs_difree_inobt(
 	struct xfs_buf			*agbp,
 	xfs_agino_t			agino,
 	struct xfs_bmap_free		*flist,
-	int				*deleted,
-	xfs_ino_t			*first_ino,
+	struct xfs_icluster		*xic,
 	struct xfs_inobt_rec_incore	*orec)
 {
 	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1942,23 @@ xfs_difree_inobt(
 	rec.ir_freecount++;
 
 	/*
-	 * When an inode cluster is free, it becomes eligible for removal
+	 * When an inode chunk is free, it becomes eligible for removal. Don't
+	 * remove the chunk if the block size is large enough for multiple inode
+	 * chunks (that might not be free).
 	 */
 	if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
-	    (rec.ir_freecount == mp->m_ialloc_inos)) {
-
-		*deleted = 1;
-		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+	    rec.ir_free == XFS_INOBT_ALL_FREE &&
+	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+		xic->deleted = 1;
+		xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+		xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
 
 		/*
 		 * Remove the inode cluster from the AGI B+Tree, adjust the
 		 * AGI and Superblock inode counts, and mark the disk space
 		 * to be freed when the transaction is committed.
 		 */
-		ilen = mp->m_ialloc_inos;
+		ilen = rec.ir_freecount;
 		be32_add_cpu(&agi->agi_count, -ilen);
 		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1974,9 @@ xfs_difree_inobt(
 			goto error0;
 		}
 
-		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-				XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
-				mp->m_ialloc_blks, flist, mp);
+		xfs_difree_inode_chunk(mp, agno, &rec, flist);
 	} else {
-		*deleted = 0;
+		xic->deleted = 0;
 
 		error = xfs_inobt_update(cur, &rec);
 		if (error) {
@@ -1599,7 +2041,9 @@ xfs_difree_finobt(
 		 */
 		XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
 
-		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+		error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+					     ibtrec->ir_count,
+					     ibtrec->ir_freecount,
 					     ibtrec->ir_free, &i);
 		if (error)
 			goto error;
@@ -1634,8 +2078,13 @@ xfs_difree_finobt(
 	 * free inode. Hence, if all of the inodes are free and we aren't
 	 * keeping inode chunks permanently on disk, remove the record.
 	 * Otherwise, update the record with the new information.
+	 *
+	 * Note that we currently can't free chunks when the block size is large
+	 * enough for multiple chunks. Leave the finobt record to remain in sync
+	 * with the inobt.
 	 */
-	if (rec.ir_freecount == mp->m_ialloc_inos &&
+	if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
 	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 		error = xfs_btree_delete(cur, &i);
 		if (error)
@@ -1671,8 +2120,7 @@ xfs_difree(
 	struct xfs_trans	*tp,		/* transaction pointer */
 	xfs_ino_t		inode,		/* inode to be freed */
 	struct xfs_bmap_free	*flist,		/* extents to free */
-	int			*deleted,/* set if inode cluster was deleted */
-	xfs_ino_t		*first_ino)/* first inode in deleted cluster */
+	struct xfs_icluster	*xic)	/* cluster info if deleted */
 {
 	/* REFERENCED */
 	xfs_agblock_t		agbno;	/* block number containing inode */
@@ -1723,8 +2171,7 @@ xfs_difree(
 	/*
 	 * Fix up the inode allocation btree.
 	 */
-	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
-				 &rec);
+	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
 	if (error)
 		goto error0;
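The merge performed by __xfs_inobt_can_merge() and __xfs_inobt_rec_merge()
above reduces to bitmask arithmetic: two records at the same startino with
disjoint allocated regions combine by AND-ing the masks (0 bits mean
"allocated" in both the holemask and the free mask) and summing the counts.
A simplified userspace model follows; it omits ir_free, and hole_to_alloc() is
a hypothetical stand-in for xfs_inobt_irec_to_allocmask() working at holemask
granularity:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct irec {
		uint16_t holemask;	/* 0 bits = allocated regions */
		uint8_t  count;		/* inodes tracked by the record */
		uint8_t  freecount;	/* free inodes */
	};

	static uint16_t hole_to_alloc(const struct irec *r)
	{
		return ~r->holemask;	/* 1 bits = allocated regions */
	}

	static bool can_merge(const struct irec *t, const struct irec *s)
	{
		if (!t->holemask || !s->holemask)  /* both must be sparse */
			return false;
		if (!t->count || !s->count || t->count + s->count > 64)
			return false;
		/* allocated regions must not overlap */
		return (hole_to_alloc(t) & hole_to_alloc(s)) == 0;
	}

	static void merge(struct irec *t, const struct irec *s)
	{
		t->count += s->count;
		t->freecount += s->freecount;
		t->holemask &= s->holemask;  /* union of allocated regions */
	}

	int main(void)
	{
		struct irec t = { 0xff00, 32, 32 };  /* low half allocated */
		struct irec s = { 0x00ff, 32, 32 };  /* high half allocated */

		if (can_merge(&t, &s))
			merge(&t, &s);
		printf("holemask=0x%04x count=%u\n", t.holemask, t.count);
		/* holemask=0x0000 count=64: the chunk has filled to full */
		return 0;
	}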
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56449..12401fea7bff 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
 /* Move inodes in clusters of this size */
 #define	XFS_INODE_BIG_CLUSTER_SIZE	8192
 
+struct xfs_icluster {
+	bool		deleted;	/* record is deleted */
+	xfs_ino_t	first_ino;	/* first inode number */
+	uint64_t	alloc;		/* inode phys. allocation bitmap for
+					 * sparse chunks */
+};
+
 /* Calculate and return the number of filesystem blocks per inode cluster */
 static inline int
 xfs_icluster_size_fsb(
@@ -90,8 +97,7 @@ xfs_difree(
 	struct xfs_trans *tp,		/* transaction pointer */
 	xfs_ino_t	inode,		/* inode to be freed */
 	struct xfs_bmap_free *flist,	/* extents to free */
-	int		*deleted,	/* set if inode cluster was deleted */
-	xfs_ino_t	*first_ino);	/* first inode in deleted cluster */
+	struct xfs_icluster *ifree);	/* cluster info if deleted */
 
 /*
  * Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +162,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
  * Inode chunk initialisation routine
  */
 int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
-			  struct list_head *buffer_list,
+			  struct list_head *buffer_list, int icount,
 			  xfs_agnumber_t agno, xfs_agblock_t agbno,
 			  xfs_agblock_t length, unsigned int gen);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca69c..674ad8f760be 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
 	union xfs_btree_rec	*rec)
 {
 	rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-	rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+		rec->inobt.ir_u.sp.ir_holemask =
+					cpu_to_be16(cur->bc_rec.i.ir_holemask);
+		rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+		rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+	} else {
+		/* ir_holemask/ir_count not supported on-disk */
+		rec->inobt.ir_u.f.ir_freecount =
+					cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	}
 	rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
 }
 
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
 		return blocklen / sizeof(xfs_inobt_rec_t);
 	return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
 }
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+	struct xfs_inobt_rec_incore	*rec)
+{
+	uint64_t			bitmap = 0;
+	uint64_t			inodespbit;
+	int				nextbit;
+	uint				allocbitmap;
+
+	/*
+	 * The holemask has 16-bits for a 64 inode record. Therefore each
+	 * holemask bit represents multiple inodes. Create a mask of bits to set
+	 * in the allocmask for each holemask bit.
+	 */
+	inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+	/*
+	 * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+	 * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+	 * anything beyond the 16 holemask bits since this casts to a larger
+	 * type.
+	 */
+	allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+	/*
+	 * allocbitmap is the inverted holemask so every set bit represents
+	 * allocated inodes. To expand from 16-bit holemask granularity to
+	 * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+	 * bitmap for every holemask bit.
+	 */
+	nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+	while (nextbit != -1) {
+		ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+		bitmap |= (inodespbit <<
+			   (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+		nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+	}
+
+	return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+	struct xfs_mount		*mp,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int				inocount = 0;
+	int				nextbit = 0;
+	uint64_t			allocbmap;
+	int				wordsz;
+
+	wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+	allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+	nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+	while (nextbit != -1) {
+		inocount++;
+		nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+				       nextbit + 1);
+	}
+
+	if (inocount != rec->ir_count)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+#endif	/* DEBUG */
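A standalone model of the holemask-to-allocmask expansion implemented above:
each clear holemask bit (an allocated region) becomes four set bits in the
64-bit per-inode bitmap. The constants are inlined here under the usual
64-inode-chunk assumption, and the loop replaces the kernel's xfs_next_bit():

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	#define INODES_PER_HOLEMASK_BIT	4  /* 64 inodes / 16 holemask bits */

	static uint64_t holemask_to_allocmask(uint16_t holemask)
	{
		uint64_t bitmap = 0;
		/* mask of bits to set per allocated region: 0xf */
		uint64_t inodespbit = (1ULL << INODES_PER_HOLEMASK_BIT) - 1;
		unsigned int alloc = (uint16_t)~holemask;  /* 1 = allocated */
		int bit;

		for (bit = 0; bit < 16; bit++)
			if (alloc & (1u << bit))
				bitmap |= inodespbit <<
					  (bit * INODES_PER_HOLEMASK_BIT);
		return bitmap;
	}

	int main(void)
	{
		/* sparse record: only the first 16 inodes physically exist */
		uint16_t holemask = 0xfff0;

		printf("allocmask=0x%016" PRIx64 "\n",
		       holemask_to_allocmask(holemask));
		/* prints allocmask=0x000000000000ffff */
		return 0;
	}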
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..bd88453217ce 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
 		xfs_btnum_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+			      struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec)	0
+#endif	/* DEBUG */
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index dc4bfc5d88fc..019dc324a146 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
 		return -EFSCORRUPTED;
 	}
 
+	/*
+	 * Full inode chunks must be aligned to inode chunk size when
+	 * sparse inodes are enabled to support the sparse chunk
+	 * allocation algorithm and prevent overlapping inode records.
+	 */
+	if (xfs_sb_version_hassparseinodes(sbp)) {
+		uint32_t	align;
+
+		xfs_alert(mp,
+	"EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
+		align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+				>> sbp->sb_blocklog;
+		if (sbp->sb_inoalignmt != align) {
+			xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+				 sbp->sb_inoalignmt, align);
+			return -EINVAL;
+		}
+	}
+
 	if (unlikely(
 	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
 		xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
 				be32_to_cpu(from->sb_features_log_incompat);
 	/* crc is only used on disk, not in memory; just init to 0 here. */
 	to->sb_crc = 0;
-	to->sb_pad = 0;
+	to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
 	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
 	to->sb_lsn = be64_to_cpu(from->sb_lsn);
 	/* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
 				cpu_to_be32(from->sb_features_incompat);
 		to->sb_features_log_incompat =
 				cpu_to_be32(from->sb_features_log_incompat);
-		to->sb_pad = 0;
+		to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
 		to->sb_lsn = cpu_to_be64(from->sb_lsn);
 	}
 }
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
 	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
 					sbp->sb_inopblock);
 	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+	if (sbp->sb_spino_align)
+		mp->m_ialloc_min_blks = sbp->sb_spino_align;
+	else
+		mp->m_ialloc_min_blks = mp->m_ialloc_blks;
 }
 
 /*
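A worked example of the sb_inoalignmt check added to xfs_mount_validate_sb():
with 512-byte inodes and 4096-byte blocks, a 64-inode chunk spans
64 * 512 >> 12 = 8 blocks, so sparse-inode filesystems must set sb_inoalignmt
to 8. A standalone sketch of the same arithmetic (sample values, not read from
a real superblock):

	#include <stdint.h>
	#include <stdio.h>

	#define XFS_INODES_PER_CHUNK	64

	int main(void)
	{
		uint32_t inodesize = 512;
		uint32_t blocklog = 12;		/* 4096-byte blocks */
		uint32_t inoalignmt = 8;	/* from the superblock */

		/* blocks covered by one full inode chunk */
		uint32_t align = XFS_INODES_PER_CHUNK * inodesize >> blocklog;

		if (inoalignmt != align)
			printf("invalid: alignment %u, required %u\n",
			       inoalignmt, align);
		else
			printf("ok: chunk size is %u blocks\n", align);
		return 0;
	}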