From ad431025aecda85d3ebef5e4a3aca5c1c681d0c7 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:10:39 -0400 Subject: ext4: generalize extents status tree search functions Ext4 contains a few functions that are used to search for delayed extents or blocks in the extents status tree. Rather than duplicate code to add new functions to search for extents with different status values, such as written or a combination of delayed and unwritten, generalize the existing code to search for caller-specified extents status values. Also, move this code into extents_status.c where it is better associated with the data structures it operates upon, and where it can be more readily used to implement new extents status tree functions that might want a broader scope for i_es_lock. Three missing static specifiers in RFC version of patch reported and fixed by Fengguang Wu . Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 -- fs/ext4/extents.c | 52 ++++------------ fs/ext4/extents_status.c | 149 ++++++++++++++++++++++++++++++++++++++------ fs/ext4/extents_status.h | 13 +++- fs/ext4/inode.c | 17 ++--- include/trace/events/ext4.h | 4 +- 6 files changed, 165 insertions(+), 74 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index caff935fbeb8..ad2c215720be 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3142,10 +3142,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c 
b/fs/ext4/extents.c index 72a361d5ef74..95796f00e4e6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, { struct extent_status es; - ext4_es_find_delayed_extent_range(inode, hole_start, - hole_start + hole_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ if (es.es_lblk <= hole_start) @@ -3819,39 +3819,6 @@ out: return ext4_mark_inode_dirty(handle, inode); } -/** - * ext4_find_delalloc_range: find delayed allocated block in the given range. - * - * Return 1 if there is a delalloc block in the range, otherwise 0. - */ -int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) -{ - struct extent_status es; - - ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); - if (es.es_len == 0) - return 0; /* there is no delay extent in this tree */ - else if (es.es_lblk <= lblk_start && - lblk_start < es.es_lblk + es.es_len) - return 1; - else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) - return 1; - else - return 0; -} - -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t lblk_start, lblk_end; - lblk_start = EXT4_LBLK_CMASK(sbi, lblk); - lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); -} - /** * Determines how many complete clusters (out of those specified by the 'map') * are under delalloc and were reserved quota for. 
@@ -3910,7 +3877,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, + lblk_to)) allocated_clusters--; } @@ -3920,7 +3888,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, + lblk_to)) allocated_clusters--; } @@ -5075,8 +5044,10 @@ static int ext4_find_delayed_extent(struct inode *inode, ext4_lblk_t block, next_del; if (newes->es_pblk == 0) { - ext4_es_find_delayed_extent_range(inode, newes->es_lblk, - newes->es_lblk + newes->es_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + newes->es_lblk, + newes->es_lblk + newes->es_len - 1, + &es); /* * No extent in extent-tree contains block @newes->es_pblk, @@ -5097,7 +5068,8 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, + EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c4e6fb15101b..8530fbd3012d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -233,30 +233,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering - * @es->lblk if it exists, otherwise, the next extent after @es->lblk. 
+ * ext4_es_find_extent_range - find extent with specified status within block + * range or next extent following block range in + * extents status tree * - * @inode: the inode which owns delayed extents - * @lblk: the offset where we start to search - * @end: the offset where we stop to search - * @es: delayed extent that we found + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * @es - extent found, if any + * + * Find the first extent within the block range specified by @lblk and @end + * in the extents status tree that satisfies @matching_fn. If a match + * is found, it's returned in @es. If not, and a matching extent is found + * beyond the block range, it's returned in @es. If no match is found, an + * extent is returned in @es whose es_lblk, es_len, and es_pblk components + * are 0. */ -void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es) +static void __es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - BUG_ON(es == NULL); - BUG_ON(end < lblk); - trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + WARN_ON(es == NULL); + WARN_ON(end < lblk); - read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find extent in cache firstly */ + /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; @@ -271,28 +279,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, es1 = __es_tree_search(&tree->root, lblk); out: - if (es1 && !ext4_es_is_delayed(es1)) { + if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) 
!= NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } - if (ext4_es_is_delayed(es1)) + if (matching_fn(es1)) break; } } - if (es1 && ext4_es_is_delayed(es1)) { + if (es1 && matching_fn(es1)) { tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } +} + +/* + * Locking for __es_find_extent_range() for external use + */ +void ext4_es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + trace_ext4_es_find_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + __es_find_extent_range(inode, matching_fn, lblk, end, es); + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_range_exit(inode, es); +} + +/* + * __es_scan_range - search block range for block with specified status + * in extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @start - logical block defining start of range + * @end - logical block defining end of range + * + * Returns true if at least one block in the specified block range satisfies + * the criterion specified by @matching_fn, and false if not. If at least + * one extent has the specified status, then there is at least one block + * in the range with that status. Should only be called by code that has + * taken i_es_lock. 
+ */ +static bool __es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t start, ext4_lblk_t end) +{ + struct extent_status es; + + __es_find_extent_range(inode, matching_fn, start, end, &es); + if (es.es_len == 0) + return false; /* no matching extent in the tree */ + else if (es.es_lblk <= start && + start < es.es_lblk + es.es_len) + return true; + else if (start <= es.es_lblk && es.es_lblk <= end) + return true; + else + return false; +} +/* + * Locking for __es_scan_range() for external use + */ +bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_range(inode, matching_fn, lblk, end); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; +} + +/* + * __es_scan_clu - search cluster for block with specified status in + * extents status tree + * + * @inode - file containing the cluster + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block in cluster to be searched + * + * Returns true if at least one extent in the cluster containing @lblk + * satisfies the criterion specified by @matching_fn, and false if not. If at + * least one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. 
+ */ +static bool __es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); +} + +/* + * Locking for __es_scan_clu() for external use + */ +bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_clu(inode, matching_fn, lblk); read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_delayed_extent_range_exit(inode, es); + return ret; } static void ext4_es_list_add(struct inode *inode) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..df9628c3ec3b 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -90,11 +90,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 
d767e993591d..b83bf3308b5e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -701,8 +701,8 @@ found: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -1681,7 +1681,7 @@ static void ext4_da_page_release_reservation(struct page *page, lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) + !ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1859,6 +1859,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, add_delayed: if (retval == 0) { int ret; + /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? @@ -1869,7 +1870,8 @@ add_delayed: * to reserve metadata for every block we're going to write. 
*/ if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, map->m_lblk)) { + !ext4_es_scan_clu(inode, + &ext4_es_is_delayed, map->m_lblk)) { ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ @@ -3450,7 +3452,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ext4_lblk_t end = map.m_lblk + map.m_len - 1; struct extent_status es; - ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map.m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) { /* entire range is a hole */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0e31eb136c57..7849b7f8fd9d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2270,7 +2270,7 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->lblk, __entry->len) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, +TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), @@ -2292,7 +2292,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, (unsigned long) __entry->ino, __entry->lblk) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, +TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), -- cgit v1.2.3 From 1dc0aa46e74a3366e12f426b7caaca477853e9c3 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:17:41 -0400 Subject: ext4: add new pending reservation mechanism Add new pending reservation mechanism to help manage reserved cluster accounting. Its primary function is to avoid the need to read extents from the disk when invalidating pages as a result of a truncate, punch hole, or collapse range operation. 
Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 + fs/ext4/extents_status.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 51 +++++++++++++ fs/ext4/super.c | 8 ++ 4 files changed, 249 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ad2c215720be..fc0f41dbf90b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1030,6 +1030,9 @@ struct ext4_inode_info { ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + /* on-disk additional length */ __u16 i_extra_isize; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 8530fbd3012d..194785ce890a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -142,6 +142,7 @@ */ static struct kmem_cache *ext4_es_cachep; +static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, @@ -1365,3 +1366,189 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) ei->i_es_tree.cache_es = NULL; return nr_shrunk; } + +#ifdef ES_DEBUG__ +static void ext4_print_pending_tree(struct inode *inode) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr; + + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_pending_tree; + node = rb_first(&tree->root); + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + printk(KERN_DEBUG " %u", pr->lclu); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_print_pending_tree(inode) +#endif + +int __init ext4_init_pending(void) +{ + ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", + sizeof(struct pending_reservation), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if 
(ext4_pending_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pending(void) +{ + kmem_cache_destroy(ext4_pending_cachep); +} + +void ext4_init_pending_tree(struct ext4_pending_tree *tree) +{ + tree->root = RB_ROOT; +} + +/* + * __get_pending - retrieve a pointer to a pending reservation + * + * @inode - file containing the pending cluster reservation + * @lclu - logical cluster of interest + * + * Returns a pointer to a pending reservation if it's a member of + * the set, and NULL if not. Must be called holding i_es_lock. + */ +static struct pending_reservation *__get_pending(struct inode *inode, + ext4_lblk_t lclu) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr = NULL; + + tree = &EXT4_I(inode)->i_pending_tree; + node = (&tree->root)->rb_node; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else if (lclu == pr->lclu) + return pr; + } + return NULL; +} + +/* + * __insert_pending - adds a pending cluster reservation to the set of + * pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster to be added + * + * Returns 0 on successful insertion and -ENOMEM on failure. If the + * pending reservation is already in the set, returns successfully. 
+ */ +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct pending_reservation *pr; + ext4_lblk_t lclu; + int ret = 0; + + lclu = EXT4_B2C(sbi, lblk); + /* search to find parent for insertion */ + while (*p) { + parent = *p; + pr = rb_entry(parent, struct pending_reservation, rb_node); + + if (lclu < pr->lclu) { + p = &(*p)->rb_left; + } else if (lclu > pr->lclu) { + p = &(*p)->rb_right; + } else { + /* pending reservation already inserted */ + goto out; + } + } + + pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + if (pr == NULL) { + ret = -ENOMEM; + goto out; + } + pr->lclu = lclu; + + rb_link_node(&pr->rb_node, parent, p); + rb_insert_color(&pr->rb_node, &tree->root); + +out: + return ret; +} + +/* + * __remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Returns successfully if pending reservation is not a member of the set. + */ +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } +} + +/* + * ext4_remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Locking for external use of __remove_pending. 
+ */ +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + write_lock(&ei->i_es_lock); + __remove_pending(inode, lblk); + write_unlock(&ei->i_es_lock); +} + +/* + * ext4_is_pending - determine whether a cluster has a pending reservation + * on it + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster + * + * Returns true if there's a pending reservation for the cluster in the + * set of pending reservations, and false if not. + */ +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + bool ret; + + read_lock(&ei->i_es_lock); + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); + read_unlock(&ei->i_es_lock); + + return ret; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index df9628c3ec3b..379b7171c67c 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -78,6 +78,51 @@ struct ext4_es_stats { struct percpu_counter es_stats_shk_cnt; }; +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. 
+ * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. 
+ */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -182,4 +227,10 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1145109968ef..faf293ed8060 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1040,6 +1040,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); @@ -5954,6 +5955,10 @@ static int __init ext4_init_fs(void) if (err) return err; + err = ext4_init_pending(); + if (err) + goto out6; + err = ext4_init_pageio(); if (err) goto out5; @@ -5992,6 +5997,8 @@ out3: out4: ext4_exit_pageio(); out5: + ext4_exit_pending(); +out6: ext4_exit_es(); return err; @@ -6009,6 +6016,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_es(); + ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -- cgit v1.2.3 From 0b02f4c0d6d9e2c611dfbdd4317193e9dca740e6 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:19:37 -0400 Subject: ext4: fix reserved cluster accounting at delayed write 
time The code in ext4_da_map_blocks sometimes reserves space for more delayed allocated clusters than it should, resulting in premature ENOSPC, exceeded quota, and inaccurate free space reporting. Fix this by checking for written and unwritten blocks shared in the same cluster with the newly delayed allocated block. A cluster reservation should not be made for a cluster for which physical space has already been allocated. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.c | 53 ++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 12 +++++++ fs/ext4/inode.c | 79 ++++++++++++++++++++++++++++++++++----------- include/trace/events/ext4.h | 35 ++++++++++++++++++++ 6 files changed, 241 insertions(+), 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fc0f41dbf90b..d85fd5c8a2c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3155,6 +3155,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95796f00e4e6..26481e543312 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5930,3 +5930,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } return replaced_count; } + +/* + * ext4_clu_mapped - determine whether any block in a logical cluster has + * been mapped to a physical cluster + * + * @inode - file containing the logical cluster + * @lclu - logical cluster of interest + * + * Returns 1 if any block in the logical cluster is mapped, signifying + * that a physical cluster has been allocated for it. Otherwise, + * returns 0. Can also return negative error codes. 
Derived from + * ext4_ext_map_blocks(). + */ +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_ext_path *path; + int depth, mapped = 0, err = 0; + struct ext4_extent *extent; + ext4_lblk_t first_lblk, first_lclu, last_lclu; + + /* search for the extent closest to the first block in the cluster */ + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + + depth = ext_depth(inode); + + /* + * A consistent leaf must not be empty. This situation is possible, + * though, _during_ tree modification, and it's why an assert can't + * be put in ext4_find_extent(). + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, + "bad extent address - lblock: %lu, depth: %d, pblock: %lld", + (unsigned long) EXT4_C2B(sbi, lclu), + depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + extent = path[depth].p_ext; + + /* can't be mapped if the extent tree is empty */ + if (extent == NULL) + goto out; + + first_lblk = le32_to_cpu(extent->ee_block); + first_lclu = EXT4_B2C(sbi, first_lblk); + + /* + * Three possible outcomes at this point - found extent spanning + * the target cluster, to the left of the target cluster, or to the + * right of the target cluster. The first two cases are handled here. + * The last case indicates the target cluster is not mapped. + */ + if (lclu >= first_lclu) { + last_lclu = EXT4_B2C(sbi, first_lblk + + ext4_ext_get_actual_len(extent) - 1); + if (lclu <= last_lclu) { + mapped = 1; + } else { + first_lblk = ext4_ext_next_allocated_block(path); + first_lclu = EXT4_B2C(sbi, first_lblk); + if (lclu == first_lclu) + mapped = 1; + } + } + +out: + ext4_ext_drop_refs(path); + kfree(path); + + return err ? 
err : mapped; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 194785ce890a..c5d456e12062 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1552,3 +1552,56 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) return ret; } + +/* + * ext4_es_insert_delayed_block - adds a delayed block to the extents status + * tree, adding a pending reservation where + * needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * @allocated - indicates whether a physical cluster has been allocated for + * the logical cluster that contains the block + * + * Returns 0 on success, negative error code on failure. + */ +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) +{ + struct extent_status newes; + int err = 0; + + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", + lblk, inode->i_ino); + + newes.es_lblk = lblk; + newes.es_len = 1; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + + err = __es_remove_extent(inode, lblk, lblk); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err != 0) + goto error; + + if (allocated) + __insert_pending(inode, lblk); + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + + return err; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 379b7171c67c..9d3c676ec623 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -178,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } +static inline int ext4_es_is_mapped(struct 
extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -232,5 +242,7 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b83bf3308b5e..57c6dd38f071 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1780,6 +1780,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } +/* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block + * count or making a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * + * Returns 0 on success, negative error code on failure. + */ +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + bool allocated = false; + + /* + * If the cluster containing lblk is shared with a delayed, + * written, or unwritten extent in a bigalloc file system, it's + * already been accounted for and does not need to be reserved. + * A pending reservation must be made for the cluster if it's + * shared with a written or unwritten extent and doesn't already + * have one. 
Written and unwritten extents can be purged from the + * extents status tree if the system is under memory pressure, so + * it's necessary to examine the extent tree if a search of the + * extents status tree doesn't get a match. + */ + if (sbi->s_cluster_ratio == 1) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { /* bigalloc */ + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { + if (!ext4_es_scan_clu(inode, + &ext4_es_is_mapped, lblk)) { + ret = ext4_clu_mapped(inode, + EXT4_B2C(sbi, lblk)); + if (ret < 0) + goto errout; + if (ret == 0) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { + allocated = true; + } + } else { + allocated = true; + } + } + } + + ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + +errout: + return ret; +} + /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write @@ -1864,25 +1923,9 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* - * If the block was allocated from previously allocated cluster, - * then we don't need to reserve it again. However we still need - * to reserve metadata for every block we're going to write. 
- */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_es_scan_clu(inode, - &ext4_es_is_delayed, map->m_lblk)) { - ret = ext4_da_reserve_space(inode); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } - } - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - ~0, EXTENT_STATUS_DELAYED); - if (ret) { + ret = ext4_insert_delayed_block(inode, map->m_lblk); + if (ret != 0) { retval = ret; goto out_unlock; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7849b7f8fd9d..6d7a943f849c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2512,6 +2512,41 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); +TRACE_EVENT(ext4_es_insert_delayed_block, + TP_PROTO(struct inode *inode, struct extent_status *es, + bool allocated), + + TP_ARGS(inode, es, allocated), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( char, status ) + __field( bool, allocated ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); + __entry->allocated = allocated; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " + "allocated %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->len, + __entry->pblk, show_extent_status(__entry->status), + __entry->allocated) +); + /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, -- cgit v1.2.3 From b6bf9171ef5c37b66d446378ba63af5339a56a97 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:24:08 -0400 Subject: ext4: reduce reserved cluster count by 
number of allocated clusters Ext4 does not always reduce the reserved cluster count by the number of clusters allocated when mapping a delayed extent. It sometimes adds back one or more clusters after allocation if delalloc blocks adjacent to the range allocated by ext4_ext_map_blocks() share the clusters newly allocated for that range. However, this overcounts the number of clusters needed to satisfy future mapping requests (holding one or more reservations for clusters that have already been allocated) and premature ENOSPC and quota failures, etc., result. Ext4 also does not reduce the reserved cluster count when allocating clusters for non-delayed allocated writes that have previously been reserved for delayed writes. This also results in overcounts. To make it possible to handle reserved cluster accounting for fallocated regions in the same manner as used for other non-delayed writes, do the reserved cluster accounting for them at the time of allocation. In the current code, this is only done later when a delayed extent sharing the fallocated region is finally mapped. Address comment correcting handling of unsigned long long constant from Jan Kara's review of RFC version of this patch. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 188 +++++++---------------------------------------- fs/ext4/extents_status.c | 175 +++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 4 + 3 files changed, 207 insertions(+), 160 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 26481e543312..b52ac813ca20 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3819,83 +3819,6 @@ out: return ext4_mark_inode_dirty(handle, inode); } -/** - * Determines how many complete clusters (out of those specified by the 'map') - * are under delalloc and were reserved quota for. 
- * This function is called when we are writing out the blocks that were - * originally written with their allocation delayed, but then the space was - * allocated using fallocate() before the delayed allocation could be resolved. - * The cases to look for are: - * ('=' indicated delayed allocated blocks - * '-' indicates non-delayed allocated blocks) - * (a) partial clusters towards beginning and/or end outside of allocated range - * are not delalloc'ed. - * Ex: - * |----c---=|====c====|====c====|===-c----| - * |++++++ allocated ++++++| - * ==> 4 complete clusters in above example - * - * (b) partial cluster (outside of allocated range) towards either end is - * marked for delayed allocation. In this case, we will exclude that - * cluster. - * Ex: - * |----====c========|========c========| - * |++++++ allocated ++++++| - * ==> 1 complete clusters in above example - * - * Ex: - * |================c================| - * |++++++ allocated ++++++| - * ==> 0 complete clusters in above example - * - * The ext4_da_update_reserve_space will be called only if we - * determine here that there were some "entire" clusters that span - * this 'allocated' range. - * In the non-bigalloc case, this function will just end up returning num_blks - * without ever calling ext4_find_delalloc_range. 
- */ -static unsigned int -get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, - unsigned int num_blks) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t alloc_cluster_start, alloc_cluster_end; - ext4_lblk_t lblk_from, lblk_to, c_offset; - unsigned int allocated_clusters = 0; - - alloc_cluster_start = EXT4_B2C(sbi, lblk_start); - alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); - - /* max possible clusters for this allocation */ - allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; - - trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); - - /* Check towards left side */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start); - if (c_offset) { - lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); - lblk_to = lblk_from + c_offset - 1; - - if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, - lblk_to)) - allocated_clusters--; - } - - /* Now check towards right. */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); - if (allocated_clusters && c_offset) { - lblk_from = lblk_start + num_blks; - lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - - if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from, - lblk_to)) - allocated_clusters--; - } - - return allocated_clusters; -} - static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -4077,23 +4000,6 @@ out: } map->m_len = allocated; - /* - * If we have done fallocate with the offset that is already - * delayed allocated, we would have block reservation - * and quota reservation done in the delayed write path. - * But fallocate would have already updated quota and block - * count for this offset. 
So cancel these reservation - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, map->m_len); - if (reserved_clusters) - ext4_da_update_reserve_space(inode, - reserved_clusters, - 0); - } - map_out: map->m_flags |= EXT4_MAP_MAPPED; if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { @@ -4482,77 +4388,39 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_NEW; /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - /* - * Check how many clusters we had reserved this allocated range - */ - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, allocated); - if (!map_from_cluster) { - BUG_ON(allocated_clusters < reserved_clusters); - if (reserved_clusters < allocated_clusters) { - struct ext4_inode_info *ei = EXT4_I(inode); - int reservation = allocated_clusters - - reserved_clusters; - /* - * It seems we claimed few clusters outside of - * the range of this allocation. We should give - * it back to the reservation pool. This can - * happen in the following case: - * - * * Suppose s_cluster_ratio is 4 (i.e., each - * cluster has 4 blocks. Thus, the clusters - * are [0-3],[4-7],[8-11]... - * * First comes delayed allocation write for - * logical blocks 10 & 11. Since there were no - * previous delayed allocated blocks in the - * range [8-11], we would reserve 1 cluster - * for this write. - * * Next comes write for logical blocks 3 to 8. 
- * In this case, we will reserve 2 clusters - * (for [0-3] and [4-7]; and not for [8-11] as - * that range has a delayed allocated blocks. - * Thus total reserved clusters now becomes 3. - * * Now, during the delayed allocation writeout - * time, we will first write blocks [3-8] and - * allocate 3 clusters for writing these - * blocks. Also, we would claim all these - * three clusters above. - * * Now when we come here to writeout the - * blocks [10-11], we would expect to claim - * the reservation of 1 cluster we had made - * (and we would claim it since there are no - * more delayed allocated blocks in the range - * [8-11]. But our reserved cluster count had - * already gone to 0. - * - * Thus, at the step 4 above when we determine - * that there are still some unwritten delayed - * allocated blocks outside of our current - * block range, we should increment the - * reserved clusters count so that when the - * remaining blocks finally gets written, we - * could claim them. - */ - dquot_reserve_block(inode, - EXT4_C2B(sbi, reservation)); - spin_lock(&ei->i_block_reservation_lock); - ei->i_reserved_data_blocks += reservation; - spin_unlock(&ei->i_block_reservation_lock); - } + if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* - * We will claim quota for all newly allocated blocks. - * We're updating the reserved space *after* the - * correction above so we do not accidentally free - * all the metadata reservation because we might - * actually need it later on. 
+ * When allocating delayed allocated clusters, simply + * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); + } else { + ext4_lblk_t lblk, len; + unsigned int n; + + /* + * When allocating non-delayed allocated clusters + * (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by + * ext4_nonda_switch), reduce the reserved cluster + * count by the number of allocated clusters that + * have previously been delayed allocated. Quota + * has been claimed by ext4_mb_new_blocks() above, + * so release the quota reservations made for any + * previously delayed allocated clusters. + */ + lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); + len = allocated_clusters << sbi->s_cluster_bits; + n = ext4_es_delayed_clu(inode, lblk, len); + if (n > 0) + ext4_da_update_reserve_space(inode, (int) n, 0); } } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c5d456e12062..c92fbf444d08 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -150,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); int __init ext4_init_es(void) { @@ -808,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); @@ -844,6 +847,11 @@ retry: if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; + if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && + (status & EXTENT_STATUS_WRITTEN || + status & EXTENT_STATUS_UNWRITTEN)) + 
__revise_pending(inode, lblk, len); + error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -1605,3 +1613,170 @@ error: return err; } + +/* + * __es_delayed_clu - count number of clusters containing blocks that + * are delayed only + * + * @inode - file containing block range + * @start - logical block defining start of range + * @end - logical block defining end of range + * + * Returns the number of clusters containing only delayed (not delayed + * and unwritten) blocks in the range specified by @start and @end. Any + * cluster or part of a cluster within the range and containing a delayed + * and not unwritten block within the range is counted as a whole cluster. + */ +static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + unsigned long long last_counted_lclu; + unsigned int n = 0; + + /* guaranteed to be unequal to any ext4_lblk_t value */ + last_counted_lclu = ~0ULL; + + es = __es_tree_search(&tree->root, start); + + while (es && (es->es_lblk <= end)) { + if (ext4_es_is_delonly(es)) { + if (es->es_lblk <= start) + first_lclu = EXT4_B2C(sbi, start); + else + first_lclu = EXT4_B2C(sbi, es->es_lblk); + + if (ext4_es_end(es) >= end) + last_lclu = EXT4_B2C(sbi, end); + else + last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); + + if (first_lclu == last_counted_lclu) + n += last_lclu - first_lclu; + else + n += last_lclu - first_lclu + 1; + last_counted_lclu = last_lclu; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + + return n; +} + +/* + * ext4_es_delayed_clu - count number of clusters containing blocks that + * are delayed and not unwritten + * + * @inode - file containing block range + * @lblk - logical block defining start of range + * @len - number of blocks in
range + * + * Locking for external use of __es_delayed_clu(). + */ +unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t end; + unsigned int n; + + if (len == 0) + return 0; + + end = lblk + len - 1; + WARN_ON(end < lblk); + + read_lock(&ei->i_es_lock); + + n = __es_delayed_clu(inode, lblk, end); + + read_unlock(&ei->i_es_lock); + + return n; +} + +/* + * __revise_pending - makes, cancels, or leaves unchanged pending cluster + * reservations for a specified block range depending + * upon the presence or absence of delayed blocks + * outside the range within clusters at the ends of the + * range + * + * @inode - file containing the range + * @lblk - logical block defining the start of range + * @len - length of range in blocks + * + * Used after a newly allocated extent is added to the extents status tree. + * Requires that the extents in the range have either written or unwritten + * status. Must be called while holding i_es_lock. + */ +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t end = lblk + len - 1; + ext4_lblk_t first, last; + bool f_del = false, l_del = false; + + if (len == 0) + return; + + /* + * Two cases - block range within single cluster and block range + * spanning two or more clusters. Note that a cluster belonging + * to a range starting and/or ending on a cluster boundary is treated + * as if it does not contain a delayed extent. The new range may + * have allocated space for previously delayed blocks out to the + * cluster boundary, requiring that any pre-existing pending + * reservation be canceled. Because this code only looks at blocks + * outside the range, it should revise pending reservations + * correctly even if the extent represented by the range can't be + * inserted in the extents status tree due to ENOMEM.
+ */ + + if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) { + __insert_pending(inode, first); + } else { + last = EXT4_LBLK_CMASK(sbi, end) + + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, + &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } + } else { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) + __insert_pending(inode, first); + else + __remove_pending(inode, first); + + last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 9d3c676ec623..131a8b7df265 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -244,5 +244,9 @@ extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); #endif /* _EXT4_EXTENTS_STATUS_H */ -- cgit v1.2.3 From 9fe671496b6c286f9033aedfc1718d67721da0ae Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:25:08 -0400 Subject: ext4: adjust reserved cluster count when removing extents Modify ext4_ext_remove_space() and the code it calls to correct the reserved cluster count for pending reservations (delayed allocated clusters shared with 
allocated blocks) when a block range is removed from the extent tree. Pending reservations may be found for the clusters at the ends of written or unwritten extents when a block range is removed. If a physical cluster at the end of an extent is freed, it's necessary to increment the reserved cluster count to maintain correct accounting if the corresponding logical cluster is shared with at least one delayed and unwritten extent as found in the extents status tree. Add a new function, ext4_rereserve_cluster(), to reapply a reservation on a delayed allocated cluster sharing blocks with a freed allocated cluster. To avoid ENOSPC on reservation, a flag is applied to ext4_free_blocks() to briefly defer updating the freeclusters counter when an allocated cluster is freed. This prevents another thread from allocating the freed block before the reservation can be reapplied. Redefine the partial cluster object as a struct to carry more state information and to clarify the code using it. Adjust the conditional code structure in ext4_ext_remove_space to reduce the indentation level in the main body of the code to improve readability. 
Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/ext4_extents.h | 13 ++ fs/ext4/extents.c | 284 +++++++++++++++++++++++++++----------------- fs/ext4/mballoc.c | 14 ++- include/trace/events/ext4.h | 60 ++++++---- 5 files changed, 238 insertions(+), 134 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d85fd5c8a2c4..0bdbbd151d2c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -628,6 +628,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 /* * ioctl commands diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,6 +119,19 @@ struct ext4_ext_path { struct buffer_head *p_bh; }; +/* + * Used to record a portion of a cluster found at the beginning or end + * of an extent while traversing the extent tree during space removal. + * A partial cluster may be removed if it does not contain blocks shared + * with extents that aren't being deleted (tofree state). Otherwise, + * it cannot be removed (nofree state). 
+ */ +struct partial_cluster { + ext4_fsblk_t pclu; /* physical cluster number */ + ext4_lblk_t lblk; /* logical block number within logical cluster */ + enum {initial, tofree, nofree} state; +}; + /* * structure for external API */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b52ac813ca20..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) return 0; } +/* + * ext4_rereserve_cluster - increment the reserved cluster count when + * freeing a cluster with a pending reservation + * + * @inode - file containing the cluster + * @lblk - logical block in cluster to be reserved + * + * Increments the reserved cluster count and adjusts quota in a bigalloc + * file system when freeing a partial cluster containing at least one + * delayed and unwritten block. A partial cluster meeting that + * requirement will have a pending reservation. If so, the + * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to + * defer reserved and allocated space accounting to a subsequent call + * to this function. 
+ */ +static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); + + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); + spin_unlock(&ei->i_block_reservation_lock); + + percpu_counter_add(&sbi->s_freeclusters_counter, 1); + ext4_remove_pending(inode, lblk); +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); - ext4_fsblk_t pblk; - int flags = get_default_free_blocks_flags(inode); + ext4_fsblk_t last_pblk, pblk; + ext4_lblk_t num; + int flags; + + /* only extent tail removal is allowed */ + if (from < le32_to_cpu(ex->ee_block) || + to != le32_to_cpu(ex->ee_block) + ee_len - 1) { + ext4_error(sbi->s_sb, + "strange request: removal(2) %u-%u from %u:%u", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; + } + +#ifdef EXTENTS_STATS + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); +#endif + + trace_ext4_remove_blocks(inode, ex, from, to, partial); /* - * For bigalloc file systems, we never free a partial cluster - * at the beginning of the extent. Instead, we make a note - * that we tried freeing the cluster, and check to see if we - * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. 
+ * if we have a partial cluster, and it's different from the + * cluster of the last block in the extent, we free it */ - flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + last_pblk = ext4_ext_pblock(ex) + ee_len - 1; + + if (partial->state != initial && + partial->pclu != EXT4_B2C(sbi, last_pblk)) { + if (partial->state == tofree) { + flags = get_default_free_blocks_flags(inode); + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; - trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* - * If we have a partial cluster, and it's different from the - * cluster of the last block, we need to explicitly free the - * partial cluster here. + * We free the partial cluster at the end of the extent (if any), + * unless the cluster is used by another extent (partial_cluster + * state is nofree). If a partial cluster exists here, it must be + * shared with the last block in the extent. 
*/ - pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster > 0 && - *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + flags = get_default_free_blocks_flags(inode); + + /* partial, left end cluster aligned, right end unaligned */ + if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && + (EXT4_LBLK_CMASK(sbi, to) >= from) && + (partial->state != nofree)) { + if (ext4_is_pending(inode, to)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), + EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); - *partial_cluster = 0; + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, to); + partial->state = initial; + flags = get_default_free_blocks_flags(inode); } -#ifdef EXTENTS_STATS - { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - spin_lock(&sbi->s_ext_stats_lock); - sbi->s_ext_blocks += ee_len; - sbi->s_ext_extents++; - if (ee_len < sbi->s_ext_min) - sbi->s_ext_min = ee_len; - if (ee_len > sbi->s_ext_max) - sbi->s_ext_max = ee_len; - if (ext_depth(inode) > sbi->s_depth_max) - sbi->s_depth_max = ext_depth(inode); - spin_unlock(&sbi->s_ext_stats_lock); - } -#endif - if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* tail removal */ - ext4_lblk_t num; - long long first_cluster; - - num = le32_to_cpu(ex->ee_block) + ee_len - from; - pblk = ext4_ext_pblock(ex) + ee_len - num; - /* - * Usually we want to free partial cluster at the end of the - * extent, except for the situation when the cluster is still - * used by any other extent (partial_cluster is negative). 
- */ - if (*partial_cluster < 0 && - *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) - flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; - ext_debug("free last %u blocks starting %llu partial %lld\n", - num, pblk, *partial_cluster); - ext4_free_blocks(handle, inode, NULL, pblk, num, flags); - /* - * If the block range to be freed didn't start at the - * beginning of a cluster, and we removed the entire - * extent and the cluster is not used by any other extent, - * save the partial cluster here, since we might need to - * delete if we determine that the truncate or punch hole - * operation has removed all of the blocks in the cluster. - * If that cluster is used by another extent, preserve its - * negative value so it isn't freed later on. - * - * If the whole extent wasn't freed, we've reached the - * start of the truncated/punched region and have finished - * removing blocks. If there's a partial cluster here it's - * shared with the remainder of the extent and is no longer - * a candidate for removal. - */ - if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { - first_cluster = (long long) EXT4_B2C(sbi, pblk); - if (first_cluster != -*partial_cluster) - *partial_cluster = first_cluster; - } else { - *partial_cluster = 0; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. 
+ */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + + /* reset the partial cluster if we've freed past it */ + if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) + partial->state = initial; + + /* + * If we've freed the entire extent but the beginning is not left + * cluster aligned and is not marked as ineligible for freeing we + * record the partial cluster at the beginning of the extent. It + * wasn't freed by the preceding ext4_free_blocks() call, and we + * need to look farther to the left to determine if it's to be freed + * (not shared with another extent). Else, reset the partial + * cluster - we're either done freeing or the beginning of the + * extent is left cluster aligned. + */ + if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { + if (partial->state == initial) { + partial->pclu = EXT4_B2C(sbi, pblk); + partial->lblk = from; + partial->state = tofree; } - } else - ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u", - from, to, le32_to_cpu(ex->ee_block), ee_len); + } else { + partial->state = initial; + } + return 0; } - /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". 
Both "start" @@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); - *partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial->pclu = EXT4_B2C(sbi, pblk); + partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, partial_cluster, - a, b); + err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; @@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent - * we zero partial_cluster because we've reached the start of the + * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. 
*/ - if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + if (partial->pclu != EXT4_B2C(sbi, pblk)) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); } - *partial_cluster = 0; + partial->state = initial; } /* if this leaf is free, then we should @@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - long long partial_cluster = 0; + struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + partial.pclu = 0; + partial.lblk = 0; + partial.state = initial; + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ @@ -2882,8 +2941,8 @@ again: */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 2; - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; } /* @@ -2911,9 +2970,10 @@ again: &ex); if (err) goto out; - if (pblk) - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + if (pblk) { + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } } } /* @@ -2948,8 +3008,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, start, - end); + &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ 
brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3021,21 +3080,24 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, end, depth, - partial_cluster, path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, + path->p_hdr->eh_entries); /* - * If we still have something in the partial cluster and we have removed - * even the first extent, then we should free the blocks in the partial - * cluster as well. (This code will only run when there are no leaves - * to the immediate left of the truncated/punched region.) + * if there's a partial cluster and we have removed the first extent + * in the file, then we also free the partial cluster, if any */ - if (partial_cluster > 0 && err == 0) { - /* don't zero partial_cluster since it's not used afterwards */ + if (partial.state == tofree && err == 0) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial.lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial.pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial.lblk); + partial.state = initial; } /* TODO: flexible tree reduction should be here */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4915,9 +4915,17 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + /* + * on a bigalloc file system, defer the s_freeclusters_counter + * update to the caller (ext4_remove_space and friends) so they + * can determine if a cluster freed here should be rereserved + */ + if (!(flags & 
EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, + count_clusters); + } ext4_mb_unload_buddy(&e4b); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6d7a943f849c..698e0d8a5ca4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -17,6 +17,7 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; +struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent, ); TRACE_EVENT(ext4_remove_blocks, - TP_PROTO(struct inode *inode, struct ext4_extent *ex, - ext4_lblk_t from, ext4_fsblk_t to, - long long partial_cluster), + TP_PROTO(struct inode *inode, struct ext4_extent *ex, + ext4_lblk_t from, ext4_fsblk_t to, + struct partial_cluster *pc), - TP_ARGS(inode, ex, from, to, partial_cluster), + TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) - __field( long long, partial ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( @@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks, __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; - __entry->partial = partial_cluster; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" - "from %u to %u partial_cluster %lld", + "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), 
MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, @@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, - long long partial_cluster), + struct partial_cluster *pc), - TP_ARGS(inode, start, ex, partial_cluster), + TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( long long, partial ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->partial = partial_cluster; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" - "partial_cluster %lld", + "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, @@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space, TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, - int depth, long long partial, __le16 eh_entries), + int 
depth, struct partial_cluster *pc, __le16 eh_entries), - TP_ARGS(inode, start, end, depth, partial, eh_entries), + TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) @@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done, __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) - __field( long long, partial ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state ) __field( unsigned short, eh_entries ) ), @@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->start = start; __entry->end = end; __entry->depth = depth; - __entry->partial = partial; + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), - TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " + TP_printk("dev %d,%d ino %lu since %u end %u depth %d " + "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, - (long long) __entry->partial, + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); -- cgit v1.2.3 From f456767d3391e9f7d9d25a2e7241d75676dc19da Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 1 Oct 2018 14:33:24 -0400 Subject: ext4: fix reserved cluster accounting at page invalidation time Add new code to count canceled pending cluster reservations on bigalloc file systems and to reduce the cluster reservation count on all file systems using delayed allocation. This replaces old code in ext4_da_page_release_reservations that was incorrect. 
Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents_status.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 23 +++---------- 3 files changed, 95 insertions(+), 19 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0bdbbd151d2c..57cbc98d730f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2491,6 +2491,7 @@ extern int ext4_page_mkwrite(struct vm_fault *vmf); extern int ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c92fbf444d08..2b439afafe13 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1780,3 +1780,93 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, __remove_pending(inode, last); } } + +/* + * ext4_es_remove_blks - remove block range from extents status tree and + * reduce reservation count or cancel pending + * reservation as needed + * + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + */ +void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int clu_size, reserved = 0; + ext4_lblk_t last_lclu, first, length, remainder, last; + bool delonly; + int err = 0; + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + /* + * Process cluster by cluster for bigalloc - there may be up to + * two clusters in a 4k page with a 1k block size and two blocks + * per cluster. 
Also necessary for systems with larger page sizes + * and potentially larger block sizes. + */ + clu_size = sbi->s_cluster_ratio; + last_lclu = EXT4_B2C(sbi, lblk + len - 1); + + write_lock(&EXT4_I(inode)->i_es_lock); + + for (first = lblk, remainder = len; + remainder > 0; + first += length, remainder -= length) { + + if (EXT4_B2C(sbi, first) == last_lclu) + length = remainder; + else + length = clu_size - EXT4_LBLK_COFF(sbi, first); + + /* + * The BH_Delay flag, which triggers calls to this function, + * and the contents of the extents status tree can be + * inconsistent due to writepages activity. So, note whether + * the blocks to be removed actually belong to an extent with + * delayed only status. + */ + delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); + + /* + * because of the writepages effect, written and unwritten + * blocks could be removed here + */ + last = first + length - 1; + err = __es_remove_extent(inode, first, last); + if (err) + ext4_warning(inode->i_sb, + "%s: couldn't remove page (err = %d)", + __func__, err); + + /* non-bigalloc case: simply count the cluster for release */ + if (sbi->s_cluster_ratio == 1 && delonly) { + reserved++; + continue; + } + + /* + * bigalloc case: if all delayed allocated only blocks have + * just been removed from a cluster, either cancel a pending + * reservation if it exists or count a cluster for release + */ + if (delonly && + !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { + pr = __get_pending(inode, EXT4_B2C(sbi, first)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } else { + reserved++; + } + } + } + + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_da_release_space(inode, reserved); +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 57c6dd38f071..9b69f88bdacc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode) 
return 0; /* success */ } -static void ext4_da_release_space(struct inode *inode, int to_free) +void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0, contiguous_blks = 0; + int contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int stop = offset + length; - int num_clusters; ext4_fsblk_t lblk; BUG_ON(stop > PAGE_SIZE || stop < length); @@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page, break; if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; contiguous_blks++; clear_buffer_delay(bh); } else if (contiguous_blks) { @@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page, (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); contiguous_blks = 0; } curr_off = next_off; @@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page, if (contiguous_blks) { lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); } - /* If we have released all the blocks belonging to a cluster, then we - * need to release the reserved space for that cluster. 
*/ - num_clusters = EXT4_NUM_B2C(sbi, to_release); - while (num_clusters > 0) { - lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + - ((num_clusters - 1) << sbi->s_cluster_bits); - if (sbi->s_cluster_ratio == 1 || - !ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) - ext4_da_release_space(inode, 1); - - num_clusters--; - } } /* -- cgit v1.2.3 From f18b2b83a727a3db208308057d2c7945f368e625 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 2 Oct 2018 01:34:44 -0400 Subject: ext4: fix argument checking in EXT4_IOC_MOVE_EXT If the starting block number of either the source or destination file exceeds the EOF, EXT4_IOC_MOVE_EXT should return EINVAL. Also fixed the helper function mext_check_coverage() so that if the logical block is beyond EOF, make it return immediately, instead of looping until the block number wraps all the way around. This takes long enough that if there are multiple threads trying to pound on the same inode doing nonsensical things, it can end up triggering the kernel's soft lockup detector.
Reported-by: syzbot+c61979f6f2cba5cb3c06@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/move_extent.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a409ff70d67b..2f5be02fc6f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_eof < orig_start + *len - 1) + if (orig_eof <= orig_start) + *len = 0; + else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; - if (donor_eof < donor_start + *len - 1) + if (donor_eof <= donor_start) + *len = 0; + else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " -- cgit v1.2.3 From 799578ab16e86b074c184ec5abbda0bc698c7b0b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 2 Oct 2018 12:43:51 -0400 Subject: ext4: fix build error when DX_DEBUG is defined Enabling DX_DEBUG triggers the build error below. info is an attribute of the dxroot structure. linux/fs/ext4/namei.c:2264:12: error: ‘info’ undeclared (first use in this function); did you mean ‘insl’? 
info->indirect_levels)); Fixes: e08ac99fa2a2 ("ext4: add largedir feature") Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 377d516c475f..67a38532032a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2261,7 +2261,7 @@ again: dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", - info->indirect_levels)); + dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; -- cgit v1.2.3 From 18aded17492088962ef43f00825179598b3e8c58 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 2 Oct 2018 18:21:19 -0400 Subject: ext4: fix EXT4_IOC_SWAP_BOOT The code for the EXT4_IOC_SWAP_BOOT ioctl hasn't been updated in a while, and it's a bit broken with respect to more modern ext4 kernels, especially metadata checksums. Other problems fixed with this commit: * Don't allow installing a DAX, swap file, or an encrypted file as a boot loader. * Respect the immutable and append-only flags. * Wait until any DIO operations are finished *before* calling truncate_inode_pages(). * Don't swap inode->i_flags, since these flags have nothing to do with the inode blocks --- and it will give the IMA/audit code heartburn when the inode is evicted.
Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Reported-by: syzbot+e81ccd4744c6c4f71354@syzkaller.appspotmail.com --- fs/ext4/ioctl.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a7074115d6f6..d7ed7487e630 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - swap(inode1->i_flags, inode2->i_flags); swap(inode1->i_version, inode2->i_version); swap(inode1->i_blocks, inode2->i_blocks); swap(inode1->i_bytes, inode2->i_bytes); @@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) i_size_write(inode2, isize); } +static void reset_inode_seed(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + __u32 csum; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); +} + /** * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. 
It will basically swap i_data and all other @@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb, struct inode *inode_bl; struct ext4_inode_info *ei_bl; - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + ext4_has_inline_data(inode)) return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); @@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); - truncate_inode_pages(&inode->i_data, 0); - truncate_inode_pages(&inode_bl->i_data, 0); - /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; @@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_generation = prandom_u32(); inode_bl->i_generation = prandom_u32(); + reset_inode_seed(inode); + reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); @@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); } else { err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { @@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); } } ext4_journal_stop(handle); -- cgit v1.2.3 From 
625ef8a3acd111d5f496d190baf99d1a815bd03e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 2 Oct 2018 21:18:45 -0400 Subject: ext4: initialize retries variable in ext4_da_write_inline_data_begin() Variable retries is not initialized in ext4_da_write_inline_data_begin() which can lead to nondeterministic number of retries in case we hit ENOSPC. Initialize retries to zero as we do everywhere else. Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Fixes: bc0ca9df3b2a ("ext4: retry allocation when inline->extent conversion failed") Cc: stable@kernel.org --- fs/ext4/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7b4736022761..9c4bac18cc6c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; - int retries; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) -- cgit v1.2.3 From 401b25aa1a75e7fe4e3202a6336604269697d705 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 2 Oct 2018 22:20:50 -0400 Subject: ext4: convert fault handler to use vm_fault_t type Return type of ext4_page_mkwrite and ext4_filemap_fault are changed to use vm_fault_t type. With this patch all the callers of block_page_mkwrite_return() are changed to handle vm_fault_t. So converting the return type of block_page_mkwrite_return() to vm_fault_t. 
Signed-off-by: Souptick Joarder Signed-off-by: Theodore Ts'o Reviewed-by: Matthew Wilcox --- fs/ext4/ext4.h | 4 ++-- fs/ext4/inode.c | 29 +++++++++++++++-------------- include/linux/buffer_head.h | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 57cbc98d730f..86e1bacac757 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2487,8 +2487,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_fault *vmf); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b69f88bdacc..c3d9a42c561e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6184,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_fault *vmf) +vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; - int ret; + int err; + vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; @@ -6203,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf) down_read(&EXT4_I(inode)->i_mmap_sem); - ret = ext4_convert_inline_data(inode); - if (ret) + err = ext4_convert_inline_data(inode); + if (err) goto out_ret; /* Delalloc case is easy... 
*/ @@ -6212,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { - ret = block_page_mkwrite(vma, vmf, + err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); - } while (ret == -ENOSPC && + } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } @@ -6259,8 +6260,8 @@ retry_alloc: ret = VM_FAULT_SIGBUS; goto out; } - ret = block_page_mkwrite(vma, vmf, get_block); - if (!ret && ext4_should_journal_data(inode)) { + err = block_page_mkwrite(vma, vmf, get_block); + if (!err && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); @@ -6271,24 +6272,24 @@ retry_alloc: ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(ret); + ret = block_page_mkwrite_return(err); out: up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } -int ext4_filemap_fault(struct vm_fault *vmf) +vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - int err; + vm_fault_t ret; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vmf); + ret = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); - return err; + return ret; } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 96225a77c112..7b73ef7f902d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); /* Convert errno to return value from ->page_mkwrite() call */ -static inline 
int block_page_mkwrite_return(int err) +static inline vm_fault_t block_page_mkwrite_return(int err) { if (err == 0) return VM_FAULT_LOCKED; -- cgit v1.2.3 From e5f0926115a4a40ed1cd0d3ce8b09bb88be73ab9 Mon Sep 17 00:00:00 2001 From: Darrick J. Wong Date: Tue, 2 Oct 2018 22:40:32 -0400 Subject: docs: generate a separate ext4 pdf file from the documentation The documentation build scripts won't build a pdf for the ext4 documentation unless explicitly called for, so ask for a separate ext4.pdf to be generated with all the documentation. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/conf.py b/Documentation/conf.py index b691af4831fa..05dad6bda787 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -383,6 +383,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), + ('filesystems/ext4/index', 'ext4.tex', 'ext4 Filesystem', + 'ext4 Filesystem Developers', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', -- cgit v1.2.3 From de7abd7bbb73d67f90c6fb48d4b2debe54f6f46e Mon Sep 17 00:00:00 2001 From: Darrick J. Wong Date: Tue, 2 Oct 2018 22:43:40 -0400 Subject: docs: fix ext4 documentation table formatting problems It turns out that the latex table formatters lay out table columns with the exact proportional widths given in the table metadata, even if text overflows outside the box. This was not caught during the initial import because the HTML renderers are smart enough to fudge the table. Fix the table column width formatting problems in the data structures and algorithms documentation so that we don't have squashed columns. Signed-off-by: Darrick J. 
Wong Signed-off-by: Theodore Ts'o --- .../filesystems/ext4/ondisk/attributes.rst | 8 +++--- .../filesystems/ext4/ondisk/checksums.rst | 2 +- .../filesystems/ext4/ondisk/directory.rst | 18 ++++++------ .../filesystems/ext4/ondisk/group_descr.rst | 4 +-- Documentation/filesystems/ext4/ondisk/ifork.rst | 8 +++--- Documentation/filesystems/ext4/ondisk/inodes.rst | 19 +++++++------ Documentation/filesystems/ext4/ondisk/journal.rst | 32 +++++++++++----------- Documentation/filesystems/ext4/ondisk/mmp.rst | 2 +- .../filesystems/ext4/ondisk/special_inodes.rst | 2 +- Documentation/filesystems/ext4/ondisk/super.rst | 24 ++++++++-------- 10 files changed, 60 insertions(+), 59 deletions(-) diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst index 0b01b67b81fe..54386a010a8d 100644 --- a/Documentation/filesystems/ext4/ondisk/attributes.rst +++ b/Documentation/filesystems/ext4/ondisk/attributes.rst @@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header ``ext4_xattr_ibody_header`` that is 4 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -47,7 +47,7 @@ The beginning of an extended attribute block is in ``struct ext4_xattr_header``, which is 32 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is Attributes stored inside an inode do not need be stored in sorted order. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from the key name. Here is a map of name index values to key prefixes: .. 
list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Name Index diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst index 9d6a793b2e03..5519e253810d 100644 --- a/Documentation/filesystems/ext4/ondisk/checksums.rst +++ b/Documentation/filesystems/ext4/ondisk/checksums.rst @@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes (crc32c as of October 2013) unless noted otherwise. .. list-table:: - :widths: 1 1 4 + :widths: 20 8 50 :header-rows: 1 * - Metadata diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst index 8fcba68c2884..614034e24669 100644 --- a/Documentation/filesystems/ext4/ondisk/directory.rst +++ b/Documentation/filesystems/ext4/ondisk/directory.rst @@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference ``dirent.rec_len`` to know for sure. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most ``dirent.rec_len`` to know for sure. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most The directory file type is one of the following values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is ``struct ext4_dir_entry_tail``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length of a data block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -305,7 +305,7 @@ of a data block: The directory hash is one of the following values: .. 
list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is also the full length of a data block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum. The dx\_tail structure is 8 bytes long and looks like this: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst index 759827e5d2cf..0f783ed88592 100644 --- a/Documentation/filesystems/ext4/ondisk/group_descr.rst +++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst @@ -43,7 +43,7 @@ entire bitmap. The block group descriptor is laid out in ``struct ext4_group_desc``. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. Block group flags can be any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst index 5dbe3b2b121a..b9816d5a896b 100644 --- a/Documentation/filesystems/ext4/ondisk/ifork.rst +++ b/Documentation/filesystems/ext4/ondisk/ifork.rst @@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``, which is 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are recorded as ``struct ext4_extent_idx``, and are 12 bytes long: .. 
list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, and are also 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -174,7 +174,7 @@ including) the checksum itself. ``struct ext4_extent_tail`` is 4 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst index 655ce898f3f5..6bd35e506b6f 100644 --- a/Documentation/filesystems/ext4/ondisk/inodes.rst +++ b/Documentation/filesystems/ext4/ondisk/inodes.rst @@ -29,8 +29,9 @@ and the inode structure itself. The inode table entry is laid out in ``struct ext4_inode``. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 + :class: longtable * - Offset - Size @@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``. The ``i_mode`` value is a combination of the following flags: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags: The ``i_flags`` field is a combination of these values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator: Linux: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -331,7 +332,7 @@ Linux: Hurd: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -346,7 +347,7 @@ Hurd: Masix: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator: Linux: .. 
list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -402,7 +403,7 @@ Linux: Hurd: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -433,7 +434,7 @@ Hurd: Masix: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst index e7031af86876..ea613ee701f5 100644 --- a/Documentation/filesystems/ext4/ondisk/journal.rst +++ b/Documentation/filesystems/ext4/ondisk/journal.rst @@ -48,7 +48,7 @@ Layout Generally speaking, the journal has this format: .. list-table:: - :widths: 1 1 78 + :widths: 16 48 16 :header-rows: 1 * - Superblock @@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the superblock. .. list-table:: - :widths: 1 1 1 1 76 + :widths: 12 12 12 32 12 :header-rows: 1 * - 1024 bytes of padding @@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header ``struct journal_header_s``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header The journal block type can be any one of: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``, which is 1024 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -264,7 +264,7 @@ which is 1024 bytes long: The journal compat features are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -278,7 +278,7 @@ The journal compat features are any combination of the following: The journal incompat features are any combination of the following: .. 
list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the most likely choices. .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway. Descriptor blocks consume at least 36 bytes, but use a full block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the following. The size is 16 or 32 bytes. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes. The journal tag flags are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the following. The size is 8, 12, 24, or 28 bytes: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a ``struct jbd2_journal_block_tail``, which looks like this: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -513,7 +513,7 @@ Revocation blocks are described in length, but use a full block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation block is a ``struct jbd2_journal_revoke_tail``, which has this format: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32 bytes long (but uses a full block): .. 
list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst index b7d7a3137f80..25660981d93c 100644 --- a/Documentation/filesystems/ext4/ondisk/mmp.rst +++ b/Documentation/filesystems/ext4/ondisk/mmp.rst @@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure. The MMP structure (``struct mmp_struct``) is as follows: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 12 20 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst index a82f70c9baeb..9061aabba827 100644 --- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst +++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst @@ -6,7 +6,7 @@ Special inodes ext4 reserves some inode for special features, as follows: .. list-table:: - :widths: 1 79 + :widths: 6 70 :header-rows: 1 * - inode Number diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst index 5f81dd87e0b9..04ff079a2acf 100644 --- a/Documentation/filesystems/ext4/ondisk/super.rst +++ b/Documentation/filesystems/ext4/ondisk/super.rst @@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in ``struct ext4_super_block``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in The superblock state is some combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -500,7 +500,7 @@ The superblock state is some combination of the following: The superblock error policy is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -517,7 +517,7 @@ The superblock error policy is one of the following: The filesystem creator is one of the following: .. 
list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -538,7 +538,7 @@ The filesystem creator is one of the following: The superblock revision is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -702,7 +702,7 @@ the following: The ``s_def_hash_version`` field is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following: The ``s_default_mount_opts`` field is any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following: The ``s_flags`` field is any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following: The ``s_encrypt_algos`` list can contain any of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value -- cgit v1.2.3 From c0e3e0406a0c39044c7dc25f3386694542d50fcc Mon Sep 17 00:00:00 2001 From: Darrick J. Wong Date: Tue, 2 Oct 2018 22:45:25 -0400 Subject: docs: make ext4 readme tables readable The tables in the ext4 readme are not particularly space efficient in the text or html outputs, and they're totally broken in the pdf output. 
Convert them into titled paragraphs so that they render more nicely. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/ext4.rst | 821 +++++++++++++++----------------- 1 file changed, 391 insertions(+), 430 deletions(-) diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst index 9d4368d591fa..e2b6bb7c2730 100644 --- a/Documentation/filesystems/ext4/ext4.rst +++ b/Documentation/filesystems/ext4/ext4.rst @@ -101,269 +101,256 @@ Options When mounting an ext4 filesystem, the following option are accepted: (*) == default -======================= ======================================================= -Mount Option Description -======================= ======================================================= -ro Mount filesystem read only. Note that ext4 will - replay the journal (and thus write to the - partition) even when mounted "read only". The - mount options "ro,noload" can be used to prevent - writes to the filesystem. - -journal_checksum Enable checksumming of the journal transactions. - This will allow the recovery code in e2fsck and the - kernel to detect corruption in the kernel. It is a - compatible change and will be ignored by older kernels. - -journal_async_commit Commit block can be written to disk without waiting - for descriptor blocks. If enabled older kernels cannot - mount the device. This will enable 'journal_checksum' - internally. - -journal_path=path -journal_dev=devnum When the external journal device's major/minor numbers - have changed, these options allow the user to specify - the new journal location. The journal device is - identified through either its new major/minor numbers - encoded in devnum, or via a path to the device. - -norecovery Don't load the journal on mounting. 
Note that -noload if the filesystem was not unmounted cleanly, - skipping the journal replay will lead to the - filesystem containing inconsistencies that can - lead to any number of problems. - -data=journal All data are committed into the journal prior to being - written into the main file system. Enabling - this mode will disable delayed allocation and - O_DIRECT support. - -data=ordered (*) All data are forced directly out to the main file - system prior to its metadata being committed to the - journal. - -data=writeback Data ordering is not preserved, data may be written - into the main file system after its metadata has been - committed to the journal. - -commit=nrsec (*) Ext4 can be told to sync all its data and metadata - every 'nrsec' seconds. The default value is 5 seconds. - This means that if you lose your power, you will lose - as much as the latest 5 seconds of work (your - filesystem will not be damaged though, thanks to the - journaling). This default value (or any low value) - will hurt performance, but it's good for data-safety. - Setting it to 0 will have the same effect as leaving - it at the default (5 seconds). - Setting it to very large values will improve - performance. - -barrier=<0|1(*)> This enables/disables the use of write barriers in -barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. -nobarrier This also requires an IO stack which can support - barriers, and if jbd gets an error on a barrier - write, it will disable again with a warning. - Write barriers enforce proper on-disk ordering - of journal commits, making volatile disk write caches - safe to use, at some performance penalty. If - your disks are battery-backed in one way or another, - disabling barriers may safely improve performance. - The mount options "barrier" and "nobarrier" can - also be used to enable or disable barriers, for - consistency with other ext4 mount options. 
- -inode_readahead_blks=n This tuning parameter controls the maximum - number of inode table blocks that ext4's inode - table readahead algorithm will pre-read into - the buffer cache. The default value is 32 blocks. - -nouser_xattr Disables Extended User Attributes. See the - attr(5) manual page for more information about - extended attributes. - -noacl This option disables POSIX Access Control List - support. If ACL support is enabled in the kernel - configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is - enabled by default on mount. See the acl(5) manual - page for more information about acl. - -bsddf (*) Make 'df' act like BSD. -minixdf Make 'df' act like Minix. - -debug Extra debugging information is sent to syslog. - -abort Simulate the effects of calling ext4_abort() for - debugging purposes. This is normally used while - remounting a filesystem which is already mounted. - -errors=remount-ro Remount the filesystem read-only on an error. -errors=continue Keep going on a filesystem error. -errors=panic Panic and halt the machine if an error occurs. - (These mount options override the errors behavior - specified in the superblock, which can be configured - using tune2fs) - -data_err=ignore(*) Just print an error message if an error occurs - in a file data buffer in ordered mode. -data_err=abort Abort the journal if an error occurs in a file - data buffer in ordered mode. - -grpid New objects have the group ID of their parent. -bsdgroups - -nogrpid (*) New objects have the group ID of their creator. -sysvgroups - -resgid=n The group ID which may use the reserved blocks. - -resuid=n The user ID which may use the reserved blocks. - -sb=n Use alternate superblock at this location. - -quota These options are ignored by the filesystem. They -noquota are used only by quota tools to recognize volumes -grpquota where quota should be turned on. See documentation -usrquota in the quota-tools package for more details - (http://sourceforge.net/projects/linuxquota). 
- -jqfmt= These options tell filesystem details about quota -usrjquota= so that quota information can be properly updated -grpjquota= during journal replay. They replace the above - quota options. See documentation in the quota-tools - package for more details - (http://sourceforge.net/projects/linuxquota). - -stripe=n Number of filesystem blocks that mballoc will try - to use for allocation size and alignment. For RAID5/6 - systems this should be the number of data - disks * RAID chunk size in file system blocks. - -delalloc (*) Defer block allocation until just before ext4 - writes out the block(s) in question. This - allows ext4 to better allocation decisions - more efficiently. -nodelalloc Disable delayed allocation. Blocks are allocated - when the data is copied from userspace to the - page cache, either via the write(2) system call - or when an mmap'ed page which was previously - unallocated is written for the first time. - -max_batch_time=usec Maximum amount of time ext4 should wait for - additional filesystem operations to be batch - together with a synchronous write operation. - Since a synchronous write operation is going to - force a commit and then a wait for the I/O - complete, it doesn't cost much, and can be a - huge throughput win, we wait for a small amount - of time to see if any other transactions can - piggyback on the synchronous write. The - algorithm used is designed to automatically tune - for the speed of the disk, by measuring the - amount of time (on average) that it takes to - finish committing a transaction. Call this time - the "commit time". If the time that the - transaction has been running is less than the - commit time, ext4 will try sleeping for the - commit time to see if other operations will join - the transaction. The commit time is capped by - the max_batch_time, which defaults to 15000us - (15ms). This optimization can be turned off - entirely by setting max_batch_time to 0. 
- -min_batch_time=usec This parameter sets the commit time (as - described above) to be at least min_batch_time. - It defaults to zero microseconds. Increasing - this parameter may improve the throughput of - multi-threaded, synchronous workloads on very - fast disks, at the cost of increasing latency. - -journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the - highest priority) which should be used for I/O - operations submitted by kjournald2 during a - commit operation. This defaults to 3, which is - a slightly higher priority than the default I/O - priority. - -auto_da_alloc(*) Many broken applications don't use fsync() when -noauto_da_alloc replacing existing files via patterns such as - fd = open("foo.new")/write(fd,..)/close(fd)/ - rename("foo.new", "foo"), or worse yet, - fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). - If auto_da_alloc is enabled, ext4 will detect - the replace-via-rename and replace-via-truncate - patterns and force that any delayed allocation - blocks are allocated such that at the next - journal commit, in the default data=ordered - mode, the data blocks of the new file are forced - to disk before the rename() operation is - committed. This provides roughly the same level - of guarantees as ext3, and avoids the - "zero-length" problem that can happen when a - system crashes before the delayed allocation - blocks are forced to disk. - -noinit_itable Do not initialize any uninitialized inode table - blocks in the background. This feature may be - used by installation CD's so that the install - process can complete as quickly as possible; the - inode table initialization process would then be - deferred until the next time the file system - is unmounted. - -init_itable=n The lazy itable init code will wait n times the - number of milliseconds it took to zero out the - previous block group's inode table. This - minimizes the impact on the system performance - while file system's inode table is being initialized. 
- -discard Controls whether ext4 should issue discard/TRIM -nodiscard(*) commands to the underlying block device when - blocks are freed. This is useful for SSD devices - and sparse/thinly-provisioned LUNs, but it is off - by default until sufficient testing has been done. - -nouid32 Disables 32-bit UIDs and GIDs. This is for - interoperability with older kernels which only - store and expect 16-bit values. - -block_validity(*) These options enable or disable the in-kernel -noblock_validity facility for tracking filesystem metadata blocks - within internal data structures. This allows multi- - block allocator and other routines to notice - bugs or corrupted allocation bitmaps which cause - blocks to be allocated which overlap with - filesystem metadata blocks. - -dioread_lock Controls whether or not ext4 should use the DIO read -dioread_nolock locking. If the dioread_nolock option is specified - ext4 will allocate uninitialized extent before buffer - write and convert the extent to initialized after IO - completes. This approach allows ext4 code to avoid - using inode mutex, which improves scalability on high - speed storages. However this does not work with - data journaling and dioread_nolock option will be - ignored with kernel warning. Note that dioread_nolock - code path is only used for extent-based files. - Because of the restrictions this options comprises - it is off by default (e.g. dioread_lock). - -max_dir_size_kb=n This limits the size of directories so that any - attempt to expand them beyond the specified - limit in kilobytes will cause an ENOSPC error. - This is useful in memory constrained - environments, where a very large directory can - cause severe performance problems or even - provoke the Out Of Memory killer. (For example, - if there is only 512mb memory available, a 176mb - directory may seriously cramp the system's style.) - -i_version Enable 64-bit inode version support. This option is - off by default. 
- -dax Use direct access (no page cache). See - Documentation/filesystems/dax.txt. Note that - this option is incompatible with data=journal. -======================= ======================================================= + ro + Mount filesystem read only. Note that ext4 will replay the journal (and + thus write to the partition) even when mounted "read only". The mount + options "ro,noload" can be used to prevent writes to the filesystem. + + journal_checksum + Enable checksumming of the journal transactions. This will allow the + recovery code in e2fsck and the kernel to detect corruption in the + kernel. It is a compatible change and will be ignored by older + kernels. + + journal_async_commit + Commit block can be written to disk without waiting for descriptor + blocks. If enabled older kernels cannot mount the device. This will + enable 'journal_checksum' internally. + + journal_path=path, journal_dev=devnum + When the external journal device's major/minor numbers have changed, + these options allow the user to specify the new journal location. The + journal device is identified through either its new major/minor numbers + encoded in devnum, or via a path to the device. + + norecovery, noload + Don't load the journal on mounting. Note that if the filesystem was + not unmounted cleanly, skipping the journal replay will lead to the + filesystem containing inconsistencies that can lead to any number of + problems. + + data=journal + All data are committed into the journal prior to being written into the + main file system. Enabling this mode will disable delayed allocation + and O_DIRECT support. + + data=ordered (*) + All data are forced directly out to the main file system prior to its + metadata being committed to the journal. + + data=writeback + Data ordering is not preserved, data may be written into the main file + system after its metadata has been committed to the journal. 
+ + commit=nrsec (*) + Ext4 can be told to sync all its data and metadata every 'nrsec' + seconds. The default value is 5 seconds. This means that if you lose + your power, you will lose as much as the latest 5 seconds of work (your + filesystem will not be damaged though, thanks to the journaling). This + default value (or any low value) will hurt performance, but it's good + for data-safety. Setting it to 0 will have the same effect as leaving + it at the default (5 seconds). Setting it to very large values will + improve performance. + + barrier=<0|1(*)>, barrier(*), nobarrier + This enables/disables the use of write barriers in the jbd code. + barrier=0 disables, barrier=1 enables. This also requires an IO stack + which can support barriers, and if jbd gets an error on a barrier + write, it will disable again with a warning. Write barriers enforce + proper on-disk ordering of journal commits, making volatile disk write + caches safe to use, at some performance penalty. If your disks are + battery-backed in one way or another, disabling barriers may safely + improve performance. The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for consistency with other + ext4 mount options. + + inode_readahead_blks=n + This tuning parameter controls the maximum number of inode table blocks + that ext4's inode table readahead algorithm will pre-read into the + buffer cache. The default value is 32 blocks. + + nouser_xattr + Disables Extended User Attributes. See the attr(5) manual page for + more information about extended attributes. + + noacl + This option disables POSIX Access Control List support. If ACL support + is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL + is enabled by default on mount. See the acl(5) manual page for more + information about acl. + + bsddf (*) + Make 'df' act like BSD. + + minixdf + Make 'df' act like Minix. + + debug + Extra debugging information is sent to syslog. 
+ + abort + Simulate the effects of calling ext4_abort() for debugging purposes. + This is normally used while remounting a filesystem which is already + mounted. + + errors=remount-ro + Remount the filesystem read-only on an error. + + errors=continue + Keep going on a filesystem error. + + errors=panic + Panic and halt the machine if an error occurs. (These mount options + override the errors behavior specified in the superblock, which can be + configured using tune2fs) + + data_err=ignore(*) + Just print an error message if an error occurs in a file data buffer in + ordered mode. + data_err=abort + Abort the journal if an error occurs in a file data buffer in ordered + mode. + + grpid | bsdgroups + New objects have the group ID of their parent. + + nogrpid (*) | sysvgroups + New objects have the group ID of their creator. + + resgid=n + The group ID which may use the reserved blocks. + + resuid=n + The user ID which may use the reserved blocks. + + sb= + Use alternate superblock at this location. + + quota, noquota, grpquota, usrquota + These options are ignored by the filesystem. They are used only by + quota tools to recognize volumes where quota should be turned on. See + documentation in the quota-tools package for more details + (http://sourceforge.net/projects/linuxquota). + + jqfmt=, usrjquota=, grpjquota= + These options tell filesystem details about quota so that quota + information can be properly updated during journal replay. They replace + the above quota options. See documentation in the quota-tools package + for more details (http://sourceforge.net/projects/linuxquota). + + stripe=n + Number of filesystem blocks that mballoc will try to use for allocation + size and alignment. For RAID5/6 systems this should be the number of + data disks * RAID chunk size in file system blocks. + + delalloc (*) + Defer block allocation until just before ext4 writes out the block(s) + in question. This allows ext4 to make better allocation decisions more + efficiently.
+ + nodelalloc + Disable delayed allocation. Blocks are allocated when the data is + copied from userspace to the page cache, either via the write(2) system + call or when an mmap'ed page which was previously unallocated is + written for the first time. + + max_batch_time=usec + Maximum amount of time ext4 should wait for additional filesystem + operations to be batched together with a synchronous write operation. + Since a synchronous write operation is going to force a commit and then + a wait for the I/O to complete, it doesn't cost much, and can be a huge + throughput win, we wait for a small amount of time to see if any other + transactions can piggyback on the synchronous write. The algorithm + used is designed to automatically tune for the speed of the disk, by + measuring the amount of time (on average) that it takes to finish + committing a transaction. Call this time the "commit time". If the + time that the transaction has been running is less than the commit + time, ext4 will try sleeping for the commit time to see if other + operations will join the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us (15ms). This + optimization can be turned off entirely by setting max_batch_time to 0. + + min_batch_time=usec + This parameter sets the commit time (as described above) to be at least + min_batch_time. It defaults to zero microseconds. Increasing this + parameter may improve the throughput of multi-threaded, synchronous + workloads on very fast disks, at the cost of increasing latency. + + journal_ioprio=prio + The I/O priority (from 0 to 7, where 0 is the highest priority) which + should be used for I/O operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is a slightly higher + priority than the default I/O priority.
+ + auto_da_alloc(*), noauto_da_alloc + Many broken applications don't use fsync() when replacing existing + files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, fd = open("foo", + O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 + will detect the replace-via-rename and replace-via-truncate patterns + and force that any delayed allocation blocks are allocated such that at + the next journal commit, in the default data=ordered mode, the data + blocks of the new file are forced to disk before the rename() operation + is committed. This provides roughly the same level of guarantees as + ext3, and avoids the "zero-length" problem that can happen when a + system crashes before the delayed allocation blocks are forced to disk. + + noinit_itable + Do not initialize any uninitialized inode table blocks in the + background. This feature may be used by installation CD's so that the + install process can complete as quickly as possible; the inode table + initialization process would then be deferred until the next time the + file system is unmounted. + + init_itable=n + The lazy itable init code will wait n times the number of milliseconds + it took to zero out the previous block group's inode table. This + minimizes the impact on the system performance while file system's + inode table is being initialized. + + discard, nodiscard(*) + Controls whether ext4 should issue discard/TRIM commands to the + underlying block device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs, but it is off by default + until sufficient testing has been done. + + nouid32 + Disables 32-bit UIDs and GIDs. This is for interoperability with + older kernels which only store and expect 16-bit values. + + block_validity(*), noblock_validity + These options enable or disable the in-kernel facility for tracking + filesystem metadata blocks within internal data structures. 
This + allows multi-block allocator and other routines to notice bugs or + corrupted allocation bitmaps which cause blocks to be allocated which + overlap with filesystem metadata blocks. + + dioread_lock, dioread_nolock + Controls whether or not ext4 should use the DIO read locking. If the + dioread_nolock option is specified ext4 will allocate uninitialized + extent before buffer write and convert the extent to initialized after + IO completes. This approach allows ext4 code to avoid using inode + mutex, which improves scalability on high speed storages. However this + does not work with data journaling and dioread_nolock option will be + ignored with kernel warning. Note that dioread_nolock code path is only + used for extent-based files. Because of the restrictions this option + comprises it is off by default (i.e. dioread_lock). + + max_dir_size_kb=n + This limits the size of directories so that any attempt to expand them + beyond the specified limit in kilobytes will cause an ENOSPC error. + This is useful in memory constrained environments, where a very large + directory can cause severe performance problems or even provoke the Out + Of Memory killer. (For example, if there is only 512mb memory + available, a 176mb directory may seriously cramp the system's style.) + + i_version + Enable 64-bit inode version support. This option is off by default. + + dax + Use direct access (no page cache). See + Documentation/filesystems/dax.txt. Note that this option is + incompatible with data=journal. Data Mode ========= @@ -407,11 +394,8 @@ in table below.
Files in /proc/fs/ext4/ -================ ======= - File Content -================ ======= - mb_groups details of multiblock allocator buddy cache of free blocks -================ ======= + mb_groups + details of multiblock allocator buddy cache of free blocks /sys entries ============ @@ -426,74 +410,71 @@ Files in /sys/fs/ext4/: (see also Documentation/ABI/testing/sysfs-fs-ext4) -============================= ================================================= -File Content -============================= ================================================= - delayed_allocation_blocks This file is read-only and shows the number of - blocks that are dirty in the page cache, but - which do not have their location in the - filesystem allocated yet. - -inode_goal Tuning parameter which (if non-zero) controls - the goal inode used by the inode allocator in - preference to all other allocation heuristics. - This is intended for debugging use only, and - should be 0 on production systems. - -inode_readahead_blks Tuning parameter which controls the maximum - number of inode table blocks that ext4's inode - table readahead algorithm will pre-read into - the buffer cache - -lifetime_write_kbytes This file is read-only and shows the number of - kilobytes of data that have been written to this - filesystem since it was created. - - max_writeback_mb_bump The maximum number of megabytes the writeback - code will try to write out before move on to - another inode. 
- - mb_group_prealloc The multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if - the stripe size is not set in the ext4 superblock - - mb_max_to_scan The maximum number of extents the multiblock - allocator will search to find the best extent - - mb_min_to_scan The minimum number of extents the multiblock - allocator will search to find the best extent - - mb_order2_req Tuning parameter which controls the minimum size - for requests (as a power of 2) where the buddy - cache is used - - mb_stats Controls whether the multiblock allocator should - collect statistics, which are shown during the - unmount. 1 means to collect statistics, 0 means - not to collect statistics - - mb_stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out - of a block group specific preallocation pool, so - that small files are packed closely together. - Each large file will have its blocks allocated - out of its own unique preallocation pool. - - session_write_kbytes This file is read-only and shows the number of - kilobytes of data that have been written to this - filesystem since it was mounted. - - reserved_clusters This is RW file and contains number of reserved - clusters in the file system which will be used - in the specific situations to avoid costly - zeroout, unexpected ENOSPC, or possible data - loss. The default is 2% or 4096 clusters, - whichever is smaller and this can be changed - however it can never exceed number of clusters - in the file system. If there is not enough space - for the reserved space when mounting the file - mount will _not_ fail. -============================= ================================================= + delayed_allocation_blocks + This file is read-only and shows the number of blocks that are dirty in + the page cache, but which do not have their location in the filesystem + allocated yet. 
+ + inode_goal + Tuning parameter which (if non-zero) controls the goal inode used by + the inode allocator in preference to all other allocation heuristics. + This is intended for debugging use only, and should be 0 on production + systems. + + inode_readahead_blks + Tuning parameter which controls the maximum number of inode table + blocks that ext4's inode table readahead algorithm will pre-read into + the buffer cache. + + lifetime_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was created. + + max_writeback_mb_bump + The maximum number of megabytes the writeback code will try to write + out before moving on to another inode. + + mb_group_prealloc + The multiblock allocator will round up allocation requests to a + multiple of this tuning parameter if the stripe size is not set in the + ext4 superblock. + + mb_max_to_scan + The maximum number of extents the multiblock allocator will search to + find the best extent. + + mb_min_to_scan + The minimum number of extents the multiblock allocator will search to + find the best extent. + + mb_order2_req + Tuning parameter which controls the minimum size for requests (as a + power of 2) where the buddy cache is used. + + mb_stats + Controls whether the multiblock allocator should collect statistics, + which are shown during the unmount. 1 means to collect statistics, 0 + means not to collect statistics. + + mb_stream_req + Files which have fewer blocks than this tunable parameter will have + their blocks allocated out of a block group specific preallocation + pool, so that small files are packed closely together. Each large file + will have its blocks allocated out of its own unique preallocation + pool. + + session_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was mounted.
+ + reserved_clusters + This is RW file and contains number of reserved clusters in the file + system which will be used in the specific situations to avoid costly + zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or + 4096 clusters, whichever is smaller and this can be changed however it + can never exceed number of clusters in the file system. If there is not + enough space for the reserved space when mounting the file mount will + _not_ fail. Ioctls ====== @@ -504,100 +485,80 @@ shown in the table below. Table of Ext4 specific ioctls -============================= ================================================= -Ioctl Description -============================= ================================================= - EXT4_IOC_GETFLAGS Get additional attributes associated with inode. - The ioctl argument is an integer bitfield, with - bit values described in ext4.h. This ioctl is an - alias for FS_IOC_GETFLAGS. - - EXT4_IOC_SETFLAGS Set additional attributes associated with inode. - The ioctl argument is an integer bitfield, with - bit values described in ext4.h. This ioctl is an - alias for FS_IOC_SETFLAGS. - - EXT4_IOC_GETVERSION - EXT4_IOC_GETVERSION_OLD - Get the inode i_generation number stored for - each inode. The i_generation number is normally - changed only when new inode is created and it is - particularly useful for network filesystems. The - '_OLD' version of this ioctl is an alias for - FS_IOC_GETVERSION. - - EXT4_IOC_SETVERSION - EXT4_IOC_SETVERSION_OLD - Set the inode i_generation number stored for - each inode. The '_OLD' version of this ioctl - is an alias for FS_IOC_SETVERSION. - - EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize - mount option. It allows to resize filesystem - to the end of the last existing block group, - further resize has to be done with resize2fs, - either online, or offline. The argument points - to the unsigned logn number representing the - filesystem new block count. 
- - EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one - this ioctl is pointing to) to the donor_fd (the - one specified in move_extent structure passed - as an argument to this ioctl). Then, exchange - inode metadata between orig_fd and donor_fd. - This is especially useful for online - defragmentation, because the allocator has the - opportunity to allocate moved blocks better, - ideally into one contiguous extent. - - EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or - new group descriptor block. The new group - descriptor is described by ext4_new_group_input - structure, which is passed as an argument to - this ioctl. This is especially useful in - conjunction with EXT4_IOC_GROUP_EXTEND, - which allows online resize of the filesystem - to the end of the last existing block group. - Those two ioctls combined is used in userspace - online resize tool (e.g. resize2fs). - - EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. - It converts (migrates) ext3 indirect block mapped - inode to ext4 extent mapped inode by walking - through indirect block mapping of the original - inode and converting contiguous block ranges - into ext4 extents of the temporary inode. Then, - inodes are swapped. This ioctl might help, when - migrating from ext3 to ext4 filesystem, however - suggestion is to create fresh ext4 filesystem - and copy data from the backup. Note, that - filesystem has to support extents for this ioctl - to work. - - EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be - allocated to preserve application-expected ext3 - behaviour. Note that this will also start - triggering a write of the data blocks, but this - behaviour may change in the future as it is - not necessary and has been done this way only - for sake of simplicity. - - EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number - of blocks of resized filesystem is passed in via - 64 bit integer argument. 
The kernel allocates - bitmaps and inode table, the userspace tool thus - just passes the new number of blocks. - - EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes - (like i_blocks, i_size, i_flags, ...) from - the specified inode with inode - EXT4_BOOT_LOADER_INO (#5). This is typically - used to store a boot loader in a secure part of - the filesystem, where it can't be changed by a - normal user by accident. - The data blocks of the previous boot loader - will be associated with the given inode. -============================= ================================================= + EXT4_IOC_GETFLAGS + Get additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_GETFLAGS. + + EXT4_IOC_SETFLAGS + Set additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_SETFLAGS. + + EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD + Get the inode i_generation number stored for each inode. The + i_generation number is normally changed only when new inode is created + and it is particularly useful for network filesystems. The '_OLD' + version of this ioctl is an alias for FS_IOC_GETVERSION. + + EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD + Set the inode i_generation number stored for each inode. The '_OLD' + version of this ioctl is an alias for FS_IOC_SETVERSION. + + EXT4_IOC_GROUP_EXTEND + This ioctl has the same purpose as the resize mount option. It allows + to resize filesystem to the end of the last existing block group, + further resize has to be done with resize2fs, either online, or + offline. The argument points to the unsigned long number representing + the filesystem new block count.
+ + EXT4_IOC_MOVE_EXT + Move the block extents from orig_fd (the one this ioctl is pointing to) + to the donor_fd (the one specified in move_extent structure passed as + an argument to this ioctl). Then, exchange inode metadata between + orig_fd and donor_fd. This is especially useful for online + defragmentation, because the allocator has the opportunity to allocate + moved blocks better, ideally into one contiguous extent. + + EXT4_IOC_GROUP_ADD + Add a new group descriptor to an existing or new group descriptor + block. The new group descriptor is described by ext4_new_group_input + structure, which is passed as an argument to this ioctl. This is + especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which + allows online resize of the filesystem to the end of the last existing + block group. Those two ioctls combined are used in userspace online + resize tool (e.g. resize2fs). + + EXT4_IOC_MIGRATE + This ioctl operates on the filesystem itself. It converts (migrates) + ext3 indirect block mapped inode to ext4 extent mapped inode by walking + through indirect block mapping of the original inode and converting + contiguous block ranges into ext4 extents of the temporary inode. Then, + inodes are swapped. This ioctl might help, when migrating from ext3 to + ext4 filesystem, however suggestion is to create fresh ext4 filesystem + and copy data from the backup. Note, that filesystem has to support + extents for this ioctl to work. + + EXT4_IOC_ALLOC_DA_BLKS + Force all of the delay allocated blocks to be allocated to preserve + application-expected ext3 behaviour. Note that this will also start + triggering a write of the data blocks, but this behaviour may change in + the future as it is not necessary and has been done this way only for + sake of simplicity. + + EXT4_IOC_RESIZE_FS + Resize the filesystem to a new size. The number of blocks of resized + filesystem is passed in via 64 bit integer argument.
The kernel + allocates bitmaps and inode table, the userspace tool thus just passes + the new number of blocks. + + EXT4_IOC_SWAP_BOOT + Swap i_blocks and associated attributes (like i_blocks, i_size, + i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO + (#5). This is typically used to store a boot loader in a secure part of + the filesystem, where it can't be changed by a normal user by accident. + The data blocks of the previous boot loader will be associated with the + given inode. References ========== -- cgit v1.2.3 From dc7ac6c4cae3b58724c2f1e21a7c05ce19ecd5a8 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 3 Oct 2018 10:33:32 -0400 Subject: ext4: fix setattr project check in fssetxattr ioctl Currently, project quota could be changed by fssetxattr ioctl, and existed permission check inode_owner_or_capable() is obviously not enough, just think that common users could change project id of file, that could make users to break project quota easily. This patch try to follow same regular of xfs project quota: "Project Quota ID state is only allowed to change from within the init namespace. Enforce that restriction only if we are trying to change the quota ID state. Everything else is allowed in user namespaces." Besides that, check and set project id'state should be an atomic operation, protect whole operation with inode lock, ext4_ioctl_setproject() is only used for ioctl EXT4_IOC_FSSETXATTR, we have held mnt_want_write_file() before ext4_ioctl_setflags(), and ext4_ioctl_setproject() is called after ext4_ioctl_setflags(), we could share codes, so remove it inside ext4_ioctl_setproject(). 
Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@kernel.org --- fs/ext4/ioctl.c | 60 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d7ed7487e630..0b3e2486f988 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -360,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; - err = mnt_want_write_file(filp); - if (err) - return err; - err = -EPERM; - inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) - goto out_unlock; + return err; err = ext4_get_inode_loc(inode, &iloc); if (err) - goto out_unlock; + return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { @@ -380,7 +375,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) - goto out_unlock; + return err; } else { brelse(iloc.bh); } @@ -390,10 +385,8 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_unlock; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) @@ -421,9 +414,6 @@ out_dirty: err = rc; out_stop: ext4_journal_stop(handle); -out_unlock: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } #else @@ -647,6 +637,30 @@ group_add_out: return err; } +static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. 
+ */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1046,19 +1060,19 @@ resizefs_out: return err; inode_lock(inode); + err = ext4_ioctl_check_project(inode, &fa); + if (err) + goto out; flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); err = ext4_ioctl_setflags(inode, flags); - inode_unlock(inode); - mnt_drop_write_file(filp); if (err) - return err; - + goto out; err = ext4_ioctl_setproject(filp, fa.fsx_projid); - if (err) - return err; - - return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); -- cgit v1.2.3 From 182a79e0c17147d2c2d3990a9a7b6b58a1561c7a Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 3 Oct 2018 12:19:21 -0400 Subject: ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR We return most failure of dquota_initialize() except inode evict, this could make a bit sense, for example we allow file removal even quota files are broken? But it dosen't make sense to allow setting project if quota files etc are broken. 
Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0b3e2486f988..0edee31913d1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -380,7 +380,9 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) brelse(iloc.bh); } - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + return err; handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + -- cgit v1.2.3 From ccd3c4373eacb044eb3832966299d13d2631f66f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Oct 2018 18:44:40 -0400 Subject: jbd2: fix use after free in jbd2_log_do_checkpoint() The code cleaning transaction's lists of checkpoint buffers has a bug where it increases bh refcount only after releasing journal->j_list_lock. Thus the following race is possible: CPU0 CPU1 jbd2_log_do_checkpoint() jbd2_journal_try_to_free_buffers() __journal_try_to_free_buffer(bh) ... while (transaction->t_checkpoint_io_list) ... if (buffer_locked(bh)) { <-- IO completes now, buffer gets unlocked --> spin_unlock(&journal->j_list_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); try_to_free_buffers(page); get_bh(bh) <-- accesses freed bh Fix the problem by grabbing bh reference before unlocking journal->j_list_lock. 
Fixes: dc6e8d669cf5 ("jbd2: don't call get_bh() before calling __jbd2_journal_remove_checkpoint()") Fixes: be1158cc615f ("jbd2: fold __process_buffer() into jbd2_log_do_checkpoint()") Reported-by: syzbot+7f4a27091759e2fe7453@syzkaller.appspotmail.com CC: stable@vger.kernel.org Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c125d662777c..26f8d7e46462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -251,8 +251,8 @@ restart: bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -333,8 +333,8 @@ restart2: jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); -- cgit v1.2.3 From d3091215921bd4b8fdf3129bf8f733b8ca48dc80 Mon Sep 17 00:00:00 2001 From: Darrick J. Wong Date: Fri, 5 Oct 2018 19:11:59 -0400 Subject: docs: move ext4 administrative docs to admin-guide/ Move the ext4 mount option and other administrative stuff to the Linux administrator's guide. Signed-off-by: Darrick J. 
Wong Signed-off-by: Theodore Ts'o --- Documentation/admin-guide/ext4.rst | 574 +++++++++++++++++++++++++++++++ Documentation/admin-guide/index.rst | 1 + Documentation/conf.py | 2 + Documentation/filesystems/ext4/ext4.rst | 574 ------------------------------- Documentation/filesystems/ext4/index.rst | 1 - 5 files changed, 577 insertions(+), 575 deletions(-) create mode 100644 Documentation/admin-guide/ext4.rst delete mode 100644 Documentation/filesystems/ext4/ext4.rst diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst new file mode 100644 index 000000000000..e506d3dae510 --- /dev/null +++ b/Documentation/admin-guide/ext4.rst @@ -0,0 +1,574 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +ext4 General Information +======================== + +Ext4 is an advanced level of the ext3 filesystem which incorporates +scalability and reliability enhancements for supporting large filesystems +(64 bit) in keeping with increasing disk capacities and state-of-the-art +feature requirements. 
+ +Mailing list: linux-ext4@vger.kernel.org +Web site: http://ext4.wiki.kernel.org + + +Quick usage instructions +======================== + +Note: More extensive information for getting started with ext4 can be +found at the ext4 wiki site at the URL: +http://ext4.wiki.kernel.org/index.php/Ext4_Howto + + - The latest version of e2fsprogs can be found at: + + https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ + + or + + http://sourceforge.net/project/showfiles.php?group_id=2406 + + or grab the latest git repository from: + + https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git + + - Create a new filesystem using the ext4 filesystem type: + + # mke2fs -t ext4 /dev/hda1 + + Or to configure an existing ext3 filesystem to support extents: + + # tune2fs -O extents /dev/hda1 + + If the filesystem was created with 128 byte inodes, it can be + converted to use 256 byte for greater efficiency via: + + # tune2fs -I 256 /dev/hda1 + + - Mounting: + + # mount -t ext4 /dev/hda1 /wherever + + - When comparing performance with other filesystems, it's always + important to try multiple workloads; very often a subtle change in a + workload parameter can completely change the ranking of which + filesystems do well compared to others. When comparing versus ext3, + note that ext4 enables write barriers by default, while ext3 does + not enable write barriers by default. So it is useful to + explicitly specify whether barriers are enabled or not via the + '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems + for a fair comparison. When tuning ext3 for best benchmark numbers, + it is often worthwhile to try changing the data journaling mode; '-o + data=writeback' can be faster for some workloads. (Note however that + running mounted with data=writeback can potentially leave stale data + exposed in recently written files in case of an unclean shutdown, + which could be a security exposure in some situations.)
Configuring + the filesystem with a large journal can also be helpful for + metadata-intensive workloads. + +Features +======== + +Currently Available +------------------- + +* ability to use filesystems > 16TB (e2fsprogs support not available yet) +* extent format reduces metadata overhead (RAM, IO for access, transactions) +* extent format more robust in face of on-disk corruption due to magics, +* internal redundancy in tree +* improved file allocation (multi-block alloc) +* lift 32000 subdirectory limit imposed by i_links_count[1] +* nsec timestamps for mtime, atime, ctime, create time +* inode version field on disk (NFSv4, Lustre) +* reduced e2fsck time via uninit_bg feature +* journal checksumming for robustness, performance +* persistent file preallocation (e.g. for streaming media, databases) +* ability to pack bitmaps and inode tables into larger virtual groups via the + flex_bg feature +* large file support +* inode allocation using large virtual block groups via flex_bg +* delayed allocation +* large block (up to pagesize) support +* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force + the ordering) + +[1] Filesystems with a block size of 1k may see a limit imposed by the +directory hash tree having a maximum depth of two. + +Options +======= + +When mounting an ext4 filesystem, the following options are accepted: +(*) == default + + ro + Mount filesystem read only. Note that ext4 will replay the journal (and + thus write to the partition) even when mounted "read only". The mount + options "ro,noload" can be used to prevent writes to the filesystem. + + journal_checksum + Enable checksumming of the journal transactions. This will allow the + recovery code in e2fsck and the kernel to detect corruption in the + kernel. It is a compatible change and will be ignored by older + kernels. + + journal_async_commit + Commit block can be written to disk without waiting for descriptor + blocks.
If enabled older kernels cannot mount the device. This will + enable 'journal_checksum' internally. + + journal_path=path, journal_dev=devnum + When the external journal device's major/minor numbers have changed, + these options allow the user to specify the new journal location. The + journal device is identified through either its new major/minor numbers + encoded in devnum, or via a path to the device. + + norecovery, noload + Don't load the journal on mounting. Note that if the filesystem was + not unmounted cleanly, skipping the journal replay will lead to the + filesystem containing inconsistencies that can lead to any number of + problems. + + data=journal + All data are committed into the journal prior to being written into the + main file system. Enabling this mode will disable delayed allocation + and O_DIRECT support. + + data=ordered (*) + All data are forced directly out to the main file system prior to its + metadata being committed to the journal. + + data=writeback + Data ordering is not preserved, data may be written into the main file + system after its metadata has been committed to the journal. + + commit=nrsec (*) + Ext4 can be told to sync all its data and metadata every 'nrsec' + seconds. The default value is 5 seconds. This means that if you lose + your power, you will lose as much as the latest 5 seconds of work (your + filesystem will not be damaged though, thanks to the journaling). This + default value (or any low value) will hurt performance, but it's good + for data-safety. Setting it to 0 will have the same effect as leaving + it at the default (5 seconds). Setting it to very large values will + improve performance. + + barrier=<0|1(*)>, barrier(*), nobarrier + This enables/disables the use of write barriers in the jbd code. + barrier=0 disables, barrier=1 enables. This also requires an IO stack + which can support barriers, and if jbd gets an error on a barrier + write, it will disable again with a warning. 
Write barriers enforce + proper on-disk ordering of journal commits, making volatile disk write + caches safe to use, at some performance penalty. If your disks are + battery-backed in one way or another, disabling barriers may safely + improve performance. The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for consistency with other + ext4 mount options. + + inode_readahead_blks=n + This tuning parameter controls the maximum number of inode table blocks + that ext4's inode table readahead algorithm will pre-read into the + buffer cache. The default value is 32 blocks. + + nouser_xattr + Disables Extended User Attributes. See the attr(5) manual page for + more information about extended attributes. + + noacl + This option disables POSIX Access Control List support. If ACL support + is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL + is enabled by default on mount. See the acl(5) manual page for more + information about acl. + + bsddf (*) + Make 'df' act like BSD. + + minixdf + Make 'df' act like Minix. + + debug + Extra debugging information is sent to syslog. + + abort + Simulate the effects of calling ext4_abort() for debugging purposes. + This is normally used while remounting a filesystem which is already + mounted. + + errors=remount-ro + Remount the filesystem read-only on an error. + + errors=continue + Keep going on a filesystem error. + + errors=panic + Panic and halt the machine if an error occurs. (These mount options + override the errors behavior specified in the superblock, which can be + configured using tune2fs) + + data_err=ignore(*) + Just print an error message if an error occurs in a file data buffer in + ordered mode. + data_err=abort + Abort the journal if an error occurs in a file data buffer in ordered + mode. + + grpid | bsdgroups + New objects have the group ID of their parent. + + nogrpid (*) | sysvgroups + New objects have the group ID of their creator. 
+ + resgid=n + The group ID which may use the reserved blocks. + + resuid=n + The user ID which may use the reserved blocks. + + sb= + Use alternate superblock at this location. + + quota, noquota, grpquota, usrquota + These options are ignored by the filesystem. They are used only by + quota tools to recognize volumes where quota should be turned on. See + documentation in the quota-tools package for more details + (http://sourceforge.net/projects/linuxquota). + + jqfmt=, usrjquota=, grpjquota= + These options tell filesystem details about quota so that quota + information can be properly updated during journal replay. They replace + the above quota options. See documentation in the quota-tools package + for more details (http://sourceforge.net/projects/linuxquota). + + stripe=n + Number of filesystem blocks that mballoc will try to use for allocation + size and alignment. For RAID5/6 systems this should be the number of + data disks * RAID chunk size in file system blocks. + + delalloc (*) + Defer block allocation until just before ext4 writes out the block(s) + in question. This allows ext4 to make better allocation decisions more + efficiently. + + nodelalloc + Disable delayed allocation. Blocks are allocated when the data is + copied from userspace to the page cache, either via the write(2) system + call or when an mmap'ed page which was previously unallocated is + written for the first time. + + max_batch_time=usec + Maximum amount of time ext4 should wait for additional filesystem + operations to be batched together with a synchronous write operation. + Since a synchronous write operation is going to force a commit and then + a wait for the I/O to complete, it doesn't cost much, and can be a huge + throughput win, we wait for a small amount of time to see if any other + transactions can piggyback on the synchronous write.
The algorithm + used is designed to automatically tune for the speed of the disk, by + measuring the amount of time (on average) that it takes to finish + committing a transaction. Call this time the "commit time". If the + time that the transaction has been running is less than the commit + time, ext4 will try sleeping for the commit time to see if other + operations will join the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us (15ms). This + optimization can be turned off entirely by setting max_batch_time to 0. + + min_batch_time=usec + This parameter sets the commit time (as described above) to be at least + min_batch_time. It defaults to zero microseconds. Increasing this + parameter may improve the throughput of multi-threaded, synchronous + workloads on very fast disks, at the cost of increasing latency. + + journal_ioprio=prio + The I/O priority (from 0 to 7, where 0 is the highest priority) which + should be used for I/O operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is a slightly higher + priority than the default I/O priority. + + auto_da_alloc(*), noauto_da_alloc + Many broken applications don't use fsync() when replacing existing + files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, fd = open("foo", + O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 + will detect the replace-via-rename and replace-via-truncate patterns + and force that any delayed allocation blocks are allocated such that at + the next journal commit, in the default data=ordered mode, the data + blocks of the new file are forced to disk before the rename() operation + is committed. This provides roughly the same level of guarantees as + ext3, and avoids the "zero-length" problem that can happen when a + system crashes before the delayed allocation blocks are forced to disk. 
+ + noinit_itable + Do not initialize any uninitialized inode table blocks in the + background. This feature may be used by installation CD's so that the + install process can complete as quickly as possible; the inode table + initialization process would then be deferred until the next time the + file system is unmounted. + + init_itable=n + The lazy itable init code will wait n times the number of milliseconds + it took to zero out the previous block group's inode table. This + minimizes the impact on the system performance while file system's + inode table is being initialized. + + discard, nodiscard(*) + Controls whether ext4 should issue discard/TRIM commands to the + underlying block device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs, but it is off by default + until sufficient testing has been done. + + nouid32 + Disables 32-bit UIDs and GIDs. This is for interoperability with + older kernels which only store and expect 16-bit values. + + block_validity(*), noblock_validity + These options enable or disable the in-kernel facility for tracking + filesystem metadata blocks within internal data structures. This + allows multi- block allocator and other routines to notice bugs or + corrupted allocation bitmaps which cause blocks to be allocated which + overlap with filesystem metadata blocks. + + dioread_lock, dioread_nolock + Controls whether or not ext4 should use the DIO read locking. If the + dioread_nolock option is specified ext4 will allocate uninitialized + extent before buffer write and convert the extent to initialized after + IO completes. This approach allows ext4 code to avoid using inode + mutex, which improves scalability on high speed storages. However this + does not work with data journaling and dioread_nolock option will be + ignored with kernel warning. Note that dioread_nolock code path is only + used for extent-based files. 
Because of the restrictions this option + comprises it is off by default (i.e. dioread_lock). + + max_dir_size_kb=n + This limits the size of directories so that any attempt to expand them + beyond the specified limit in kilobytes will cause an ENOSPC error. + This is useful in memory constrained environments, where a very large + directory can cause severe performance problems or even provoke the Out + Of Memory killer. (For example, if there is only 512mb memory + available, a 176mb directory may seriously cramp the system's style.) + + i_version + Enable 64-bit inode version support. This option is off by default. + + dax + Use direct access (no page cache). See + Documentation/filesystems/dax.txt. Note that this option is + incompatible with data=journal. + +Data Mode +========= +There are 3 different data modes: + +* writeback mode + + In data=writeback mode, ext4 does not journal data at all. This mode provides + a similar level of journaling as that of XFS, JFS, and ReiserFS in their default + mode - metadata journaling. A crash+recovery can cause incorrect data to + appear in files which were written shortly before the crash. This mode will + typically provide the best ext4 performance. + +* ordered mode + + In data=ordered mode, ext4 only officially journals metadata, but it logically + groups metadata information related to data changes with the data blocks into + a single unit called a transaction. When it's time to write the new metadata + out to disk, the associated data blocks are written first. In general, this + mode performs slightly slower than writeback but significantly faster than + journal mode. + +* journal mode + + data=journal mode provides full data and metadata journaling. All new data is + written to the journal first, and then to its final location. In the event of + a crash, the journal can be replayed, bringing both data and metadata into a + consistent state.
This mode is the slowest except when data needs to be read + from and written to disk at the same time where it outperforms all other + modes. Enabling this mode will disable delayed allocation and O_DIRECT + support. + +/proc entries +============= + +Information about mounted ext4 file systems can be found in +/proc/fs/ext4. Each mounted filesystem will have a directory in +/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or +/proc/fs/ext4/dm-0). The files in each per-device directory are shown +in the table below. + +Files in /proc/fs/ext4/ + + mb_groups + details of multiblock allocator buddy cache of free blocks + +/sys entries +============ + +Information about mounted ext4 file systems can be found in +/sys/fs/ext4. Each mounted filesystem will have a directory in +/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or +/sys/fs/ext4/dm-0). The files in each per-device directory are shown +in the table below. + +Files in /sys/fs/ext4/: + +(see also Documentation/ABI/testing/sysfs-fs-ext4) + + delayed_allocation_blocks + This file is read-only and shows the number of blocks that are dirty in + the page cache, but which do not have their location in the filesystem + allocated yet. + + inode_goal + Tuning parameter which (if non-zero) controls the goal inode used by + the inode allocator in preference to all other allocation heuristics. + This is intended for debugging use only, and should be 0 on production + systems. + + inode_readahead_blks + Tuning parameter which controls the maximum number of inode table + blocks that ext4's inode table readahead algorithm will pre-read into + the buffer cache. + + lifetime_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was created. + + max_writeback_mb_bump + The maximum number of megabytes the writeback code will try to write + out before moving on to another inode.
+ + mb_group_prealloc + The multiblock allocator will round up allocation requests to a + multiple of this tuning parameter if the stripe size is not set in the + ext4 superblock + + mb_max_to_scan + The maximum number of extents the multiblock allocator will search to + find the best extent. + + mb_min_to_scan + The minimum number of extents the multiblock allocator will search to + find the best extent. + + mb_order2_req + Tuning parameter which controls the minimum size for requests (as a + power of 2) where the buddy cache is used. + + mb_stats + Controls whether the multiblock allocator should collect statistics, + which are shown during the unmount. 1 means to collect statistics, 0 + means not to collect statistics. + + mb_stream_req + Files which have fewer blocks than this tunable parameter will have + their blocks allocated out of a block group specific preallocation + pool, so that small files are packed closely together. Each large file + will have its blocks allocated out of its own unique preallocation + pool. + + session_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was mounted. + + reserved_clusters + This is RW file and contains number of reserved clusters in the file + system which will be used in the specific situations to avoid costly + zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or + 4096 clusters, whichever is smaller and this can be changed however it + can never exceed number of clusters in the file system. If there is not + enough space for the reserved space when mounting the file mount will + _not_ fail. + +Ioctls +====== + +There is some Ext4 specific functionality which can be accessed by applications +through the system call interfaces. The list of all Ext4 specific ioctls are +shown in the table below. + +Table of Ext4 specific ioctls + + EXT4_IOC_GETFLAGS + Get additional attributes associated with inode. 
The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_GETFLAGS. + + EXT4_IOC_SETFLAGS + Set additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_SETFLAGS. + + EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD + Get the inode i_generation number stored for each inode. The + i_generation number is normally changed only when a new inode is created + and it is particularly useful for network filesystems. The '_OLD' + version of this ioctl is an alias for FS_IOC_GETVERSION. + + EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD + Set the inode i_generation number stored for each inode. The '_OLD' + version of this ioctl is an alias for FS_IOC_SETVERSION. + + EXT4_IOC_GROUP_EXTEND + This ioctl has the same purpose as the resize mount option. It allows + resizing the filesystem to the end of the last existing block group; + further resize has to be done with resize2fs, either online, or + offline. The argument points to the unsigned long number representing + the filesystem's new block count. + + EXT4_IOC_MOVE_EXT + Move the block extents from orig_fd (the one this ioctl is pointing to) + to the donor_fd (the one specified in move_extent structure passed as + an argument to this ioctl). Then, exchange inode metadata between + orig_fd and donor_fd. This is especially useful for online + defragmentation, because the allocator has the opportunity to allocate + moved blocks better, ideally into one contiguous extent. + + EXT4_IOC_GROUP_ADD + Add a new group descriptor to an existing or new group descriptor + block. The new group descriptor is described by ext4_new_group_input + structure, which is passed as an argument to this ioctl. This is + especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which + allows online resize of the filesystem to the end of the last existing + block group.
Those two ioctls combined are used in userspace online + resize tools (e.g. resize2fs). + + EXT4_IOC_MIGRATE + This ioctl operates on the filesystem itself. It converts (migrates) + ext3 indirect block mapped inode to ext4 extent mapped inode by walking + through indirect block mapping of the original inode and converting + contiguous block ranges into ext4 extents of the temporary inode. Then, + inodes are swapped. This ioctl might help when migrating from ext3 to + ext4 filesystem, however the suggestion is to create a fresh ext4 filesystem + and copy data from the backup. Note that the filesystem has to support + extents for this ioctl to work. + + EXT4_IOC_ALLOC_DA_BLKS + Force all of the delay allocated blocks to be allocated to preserve + application-expected ext3 behaviour. Note that this will also start + triggering a write of the data blocks, but this behaviour may change in + the future as it is not necessary and has been done this way only for + the sake of simplicity. + + EXT4_IOC_RESIZE_FS + Resize the filesystem to a new size. The number of blocks of resized + filesystem is passed in via 64 bit integer argument. The kernel + allocates bitmaps and inode table, the userspace tool thus just passes + the new number of blocks. + + EXT4_IOC_SWAP_BOOT + Swap i_blocks and associated attributes (like i_blocks, i_size, + i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO + (#5). This is typically used to store a boot loader in a secure part of + the filesystem, where it can't be changed by a normal user by accident. + The data blocks of the previous boot loader will be associated with the + given inode.
+ +References +========== + +kernel source: + + +programs: http://e2fsprogs.sourceforge.net/ + +useful links: http://fedoraproject.org/wiki/ext3-devel + http://www.bullopensource.org/ext4/ + http://ext4.wiki.kernel.org/index.php/Main_Page + http://fedoraproject.org/wiki/Features/Ext4 diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0873685bab0f..965745d5fb9a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking. java ras bcache + ext4 pm/index thunderbolt LSM/index diff --git a/Documentation/conf.py b/Documentation/conf.py index 05dad6bda787..4d32c01e1e16 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -383,6 +383,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), + ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', + 'ext4 Community', 'manual'), ('filesystems/ext4/index', 'ext4.tex', 'ext4 Filesystem', 'ext4 Filesystem Developers', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst deleted file mode 100644 index e2b6bb7c2730..000000000000 --- a/Documentation/filesystems/ext4/ext4.rst +++ /dev/null @@ -1,574 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -======================== -General Information -======================== - -Ext4 is an advanced level of the ext3 filesystem which incorporates -scalability and reliability enhancements for supporting large filesystems -(64 bit) in keeping with increasing disk capacities and state-of-the-art -feature requirements. 
- -Mailing list: linux-ext4@vger.kernel.org -Web site: http://ext4.wiki.kernel.org - - -Quick usage instructions -======================== - -Note: More extensive information for getting started with ext4 can be -found at the ext4 wiki site at the URL: -http://ext4.wiki.kernel.org/index.php/Ext4_Howto - - - The latest version of e2fsprogs can be found at: - - https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ - - or - - http://sourceforge.net/project/showfiles.php?group_id=2406 - - or grab the latest git repository from: - - https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git - - - Create a new filesystem using the ext4 filesystem type: - - # mke2fs -t ext4 /dev/hda1 - - Or to configure an existing ext3 filesystem to support extents: - - # tune2fs -O extents /dev/hda1 - - If the filesystem was created with 128 byte inodes, it can be - converted to use 256 byte for greater efficiency via: - - # tune2fs -I 256 /dev/hda1 - - - Mounting: - - # mount -t ext4 /dev/hda1 /wherever - - - When comparing performance with other filesystems, it's always - important to try multiple workloads; very often a subtle change in a - workload parameter can completely change the ranking of which - filesystems do well compared to others. When comparing versus ext3, - note that ext4 enables write barriers by default, while ext3 does - not enable write barriers by default. So it is useful to use - explicitly specify whether barriers are enabled or not when via the - '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems - for a fair comparison. When tuning ext3 for best benchmark numbers, - it is often worthwhile to try changing the data journaling mode; '-o - data=writeback' can be faster for some workloads. (Note however that - running mounted with data=writeback can potentially leave stale data - exposed in recently written files in case of an unclean shutdown, - which could be a security exposure in some situations.) 
Configuring - the filesystem with a large journal can also be helpful for - metadata-intensive workloads. - -Features -======== - -Currently Available -------------------- - -* ability to use filesystems > 16TB (e2fsprogs support not available yet) -* extent format reduces metadata overhead (RAM, IO for access, transactions) -* extent format more robust in face of on-disk corruption due to magics, -* internal redundancy in tree -* improved file allocation (multi-block alloc) -* lift 32000 subdirectory limit imposed by i_links_count[1] -* nsec timestamps for mtime, atime, ctime, create time -* inode version field on disk (NFSv4, Lustre) -* reduced e2fsck time via uninit_bg feature -* journal checksumming for robustness, performance -* persistent file preallocation (e.g for streaming media, databases) -* ability to pack bitmaps and inode tables into larger virtual groups via the - flex_bg feature -* large file support -* inode allocation using large virtual block groups via flex_bg -* delayed allocation -* large block (up to pagesize) support -* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force - the ordering) - -[1] Filesystems with a block size of 1k may see a limit imposed by the -directory hash tree having a maximum depth of two. - -Options -======= - -When mounting an ext4 filesystem, the following option are accepted: -(*) == default - - ro - Mount filesystem read only. Note that ext4 will replay the journal (and - thus write to the partition) even when mounted "read only". The mount - options "ro,noload" can be used to prevent writes to the filesystem. - - journal_checksum - Enable checksumming of the journal transactions. This will allow the - recovery code in e2fsck and the kernel to detect corruption in the - kernel. It is a compatible change and will be ignored by older - kernels. - - journal_async_commit - Commit block can be written to disk without waiting for descriptor - blocks. 
If enabled older kernels cannot mount the device. This will - enable 'journal_checksum' internally. - - journal_path=path, journal_dev=devnum - When the external journal device's major/minor numbers have changed, - these options allow the user to specify the new journal location. The - journal device is identified through either its new major/minor numbers - encoded in devnum, or via a path to the device. - - norecovery, noload - Don't load the journal on mounting. Note that if the filesystem was - not unmounted cleanly, skipping the journal replay will lead to the - filesystem containing inconsistencies that can lead to any number of - problems. - - data=journal - All data are committed into the journal prior to being written into the - main file system. Enabling this mode will disable delayed allocation - and O_DIRECT support. - - data=ordered (*) - All data are forced directly out to the main file system prior to its - metadata being committed to the journal. - - data=writeback - Data ordering is not preserved, data may be written into the main file - system after its metadata has been committed to the journal. - - commit=nrsec (*) - Ext4 can be told to sync all its data and metadata every 'nrsec' - seconds. The default value is 5 seconds. This means that if you lose - your power, you will lose as much as the latest 5 seconds of work (your - filesystem will not be damaged though, thanks to the journaling). This - default value (or any low value) will hurt performance, but it's good - for data-safety. Setting it to 0 will have the same effect as leaving - it at the default (5 seconds). Setting it to very large values will - improve performance. - - barrier=<0|1(*)>, barrier(*), nobarrier - This enables/disables the use of write barriers in the jbd code. - barrier=0 disables, barrier=1 enables. This also requires an IO stack - which can support barriers, and if jbd gets an error on a barrier - write, it will disable again with a warning. 
Write barriers enforce - proper on-disk ordering of journal commits, making volatile disk write - caches safe to use, at some performance penalty. If your disks are - battery-backed in one way or another, disabling barriers may safely - improve performance. The mount options "barrier" and "nobarrier" can - also be used to enable or disable barriers, for consistency with other - ext4 mount options. - - inode_readahead_blks=n - This tuning parameter controls the maximum number of inode table blocks - that ext4's inode table readahead algorithm will pre-read into the - buffer cache. The default value is 32 blocks. - - nouser_xattr - Disables Extended User Attributes. See the attr(5) manual page for - more information about extended attributes. - - noacl - This option disables POSIX Access Control List support. If ACL support - is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL - is enabled by default on mount. See the acl(5) manual page for more - information about acl. - - bsddf (*) - Make 'df' act like BSD. - - minixdf - Make 'df' act like Minix. - - debug - Extra debugging information is sent to syslog. - - abort - Simulate the effects of calling ext4_abort() for debugging purposes. - This is normally used while remounting a filesystem which is already - mounted. - - errors=remount-ro - Remount the filesystem read-only on an error. - - errors=continue - Keep going on a filesystem error. - - errors=panic - Panic and halt the machine if an error occurs. (These mount options - override the errors behavior specified in the superblock, which can be - configured using tune2fs) - - data_err=ignore(*) - Just print an error message if an error occurs in a file data buffer in - ordered mode. - data_err=abort - Abort the journal if an error occurs in a file data buffer in ordered - mode. - - grpid | bsdgroups - New objects have the group ID of their parent. - - nogrpid (*) | sysvgroups - New objects have the group ID of their creator. 
- - resgid=n - The group ID which may use the reserved blocks. - - resuid=n - The user ID which may use the reserved blocks. - - sb= - Use alternate superblock at this location. - - quota, noquota, grpquota, usrquota - These options are ignored by the filesystem. They are used only by - quota tools to recognize volumes where quota should be turned on. See - documentation in the quota-tools package for more details - (http://sourceforge.net/projects/linuxquota). - - jqfmt=, usrjquota=, grpjquota= - These options tell filesystem details about quota so that quota - information can be properly updated during journal replay. They replace - the above quota options. See documentation in the quota-tools package - for more details (http://sourceforge.net/projects/linuxquota). - - stripe=n - Number of filesystem blocks that mballoc will try to use for allocation - size and alignment. For RAID5/6 systems this should be the number of - data disks * RAID chunk size in file system blocks. - - delalloc (*) - Defer block allocation until just before ext4 writes out the block(s) - in question. This allows ext4 to better allocation decisions more - efficiently. - - nodelalloc - Disable delayed allocation. Blocks are allocated when the data is - copied from userspace to the page cache, either via the write(2) system - call or when an mmap'ed page which was previously unallocated is - written for the first time. - - max_batch_time=usec - Maximum amount of time ext4 should wait for additional filesystem - operations to be batch together with a synchronous write operation. - Since a synchronous write operation is going to force a commit and then - a wait for the I/O complete, it doesn't cost much, and can be a huge - throughput win, we wait for a small amount of time to see if any other - transactions can piggyback on the synchronous write. 
The algorithm - used is designed to automatically tune for the speed of the disk, by - measuring the amount of time (on average) that it takes to finish - committing a transaction. Call this time the "commit time". If the - time that the transaction has been running is less than the commit - time, ext4 will try sleeping for the commit time to see if other - operations will join the transaction. The commit time is capped by - the max_batch_time, which defaults to 15000us (15ms). This - optimization can be turned off entirely by setting max_batch_time to 0. - - min_batch_time=usec - This parameter sets the commit time (as described above) to be at least - min_batch_time. It defaults to zero microseconds. Increasing this - parameter may improve the throughput of multi-threaded, synchronous - workloads on very fast disks, at the cost of increasing latency. - - journal_ioprio=prio - The I/O priority (from 0 to 7, where 0 is the highest priority) which - should be used for I/O operations submitted by kjournald2 during a - commit operation. This defaults to 3, which is a slightly higher - priority than the default I/O priority. - - auto_da_alloc(*), noauto_da_alloc - Many broken applications don't use fsync() when replacing existing - files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ - rename("foo.new", "foo"), or worse yet, fd = open("foo", - O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 - will detect the replace-via-rename and replace-via-truncate patterns - and force that any delayed allocation blocks are allocated such that at - the next journal commit, in the default data=ordered mode, the data - blocks of the new file are forced to disk before the rename() operation - is committed. This provides roughly the same level of guarantees as - ext3, and avoids the "zero-length" problem that can happen when a - system crashes before the delayed allocation blocks are forced to disk. 
- - noinit_itable - Do not initialize any uninitialized inode table blocks in the - background. This feature may be used by installation CD's so that the - install process can complete as quickly as possible; the inode table - initialization process would then be deferred until the next time the - file system is unmounted. - - init_itable=n - The lazy itable init code will wait n times the number of milliseconds - it took to zero out the previous block group's inode table. This - minimizes the impact on the system performance while file system's - inode table is being initialized. - - discard, nodiscard(*) - Controls whether ext4 should issue discard/TRIM commands to the - underlying block device when blocks are freed. This is useful for SSD - devices and sparse/thinly-provisioned LUNs, but it is off by default - until sufficient testing has been done. - - nouid32 - Disables 32-bit UIDs and GIDs. This is for interoperability with - older kernels which only store and expect 16-bit values. - - block_validity(*), noblock_validity - These options enable or disable the in-kernel facility for tracking - filesystem metadata blocks within internal data structures. This - allows multi- block allocator and other routines to notice bugs or - corrupted allocation bitmaps which cause blocks to be allocated which - overlap with filesystem metadata blocks. - - dioread_lock, dioread_nolock - Controls whether or not ext4 should use the DIO read locking. If the - dioread_nolock option is specified ext4 will allocate uninitialized - extent before buffer write and convert the extent to initialized after - IO completes. This approach allows ext4 code to avoid using inode - mutex, which improves scalability on high speed storages. However this - does not work with data journaling and dioread_nolock option will be - ignored with kernel warning. Note that dioread_nolock code path is only - used for extent-based files. 
Because of the restrictions this options - comprises it is off by default (e.g. dioread_lock). - - max_dir_size_kb=n - This limits the size of directories so that any attempt to expand them - beyond the specified limit in kilobytes will cause an ENOSPC error. - This is useful in memory constrained environments, where a very large - directory can cause severe performance problems or even provoke the Out - Of Memory killer. (For example, if there is only 512mb memory - available, a 176mb directory may seriously cramp the system's style.) - - i_version - Enable 64-bit inode version support. This option is off by default. - - dax - Use direct access (no page cache). See - Documentation/filesystems/dax.txt. Note that this option is - incompatible with data=journal. - -Data Mode -========= -There are 3 different data modes: - -* writeback mode - - In data=writeback mode, ext4 does not journal data at all. This mode provides - a similar level of journaling as that of XFS, JFS, and ReiserFS in its default - mode - metadata journaling. A crash+recovery can cause incorrect data to - appear in files which were written shortly before the crash. This mode will - typically provide the best ext4 performance. - -* ordered mode - - In data=ordered mode, ext4 only officially journals metadata, but it logically - groups metadata information related to data changes with the data blocks into - a single unit called a transaction. When it's time to write the new metadata - out to disk, the associated data blocks are written first. In general, this - mode performs slightly slower than writeback but significantly faster than - journal mode. - -* journal mode - - data=journal mode provides full data and metadata journaling. All new data is - written to the journal first, and then to its final location. In the event of - a crash, the journal can be replayed, bringing both data and metadata into a - consistent state. 
This mode is the slowest except when data needs to be read - from and written to disk at the same time where it outperforms all others - modes. Enabling this mode will disable delayed allocation and O_DIRECT - support. - -/proc entries -============= - -Information about mounted ext4 file systems can be found in -/proc/fs/ext4. Each mounted filesystem will have a directory in -/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or -/proc/fs/ext4/dm-0). The files in each per-device directory are shown -in table below. - -Files in /proc/fs/ext4/ - - mb_groups - details of multiblock allocator buddy cache of free blocks - -/sys entries -============ - -Information about mounted ext4 file systems can be found in -/sys/fs/ext4. Each mounted filesystem will have a directory in -/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or -/sys/fs/ext4/dm-0). The files in each per-device directory are shown -in table below. - -Files in /sys/fs/ext4/: - -(see also Documentation/ABI/testing/sysfs-fs-ext4) - - delayed_allocation_blocks - This file is read-only and shows the number of blocks that are dirty in - the page cache, but which do not have their location in the filesystem - allocated yet. - - inode_goal - Tuning parameter which (if non-zero) controls the goal inode used by - the inode allocator in preference to all other allocation heuristics. - This is intended for debugging use only, and should be 0 on production - systems. - - inode_readahead_blks - Tuning parameter which controls the maximum number of inode table - blocks that ext4's inode table readahead algorithm will pre-read into - the buffer cache. - - lifetime_write_kbytes - This file is read-only and shows the number of kilobytes of data that - have been written to this filesystem since it was created. - - max_writeback_mb_bump - The maximum number of megabytes the writeback code will try to write - out before move on to another inode. 
- - mb_group_prealloc - The multiblock allocator will round up allocation requests to a - multiple of this tuning parameter if the stripe size is not set in the - ext4 superblock - - mb_max_to_scan - The maximum number of extents the multiblock allocator will search to - find the best extent. - - mb_min_to_scan - The minimum number of extents the multiblock allocator will search to - find the best extent. - - mb_order2_req - Tuning parameter which controls the minimum size for requests (as a - power of 2) where the buddy cache is used. - - mb_stats - Controls whether the multiblock allocator should collect statistics, - which are shown during the unmount. 1 means to collect statistics, 0 - means not to collect statistics. - - mb_stream_req - Files which have fewer blocks than this tunable parameter will have - their blocks allocated out of a block group specific preallocation - pool, so that small files are packed closely together. Each large file - will have its blocks allocated out of its own unique preallocation - pool. - - session_write_kbytes - This file is read-only and shows the number of kilobytes of data that - have been written to this filesystem since it was mounted. - - reserved_clusters - This is RW file and contains number of reserved clusters in the file - system which will be used in the specific situations to avoid costly - zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or - 4096 clusters, whichever is smaller and this can be changed however it - can never exceed number of clusters in the file system. If there is not - enough space for the reserved space when mounting the file mount will - _not_ fail. - -Ioctls -====== - -There is some Ext4 specific functionality which can be accessed by applications -through the system call interfaces. The list of all Ext4 specific ioctls are -shown in the table below. - -Table of Ext4 specific ioctls - - EXT4_IOC_GETFLAGS - Get additional attributes associated with inode. 
The ioctl argument is - an integer bitfield, with bit values described in ext4.h. This ioctl is - an alias for FS_IOC_GETFLAGS. - - EXT4_IOC_SETFLAGS - Set additional attributes associated with inode. The ioctl argument is - an integer bitfield, with bit values described in ext4.h. This ioctl is - an alias for FS_IOC_SETFLAGS. - - EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD - Get the inode i_generation number stored for each inode. The - i_generation number is normally changed only when new inode is created - and it is particularly useful for network filesystems. The '_OLD' - version of this ioctl is an alias for FS_IOC_GETVERSION. - - EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD - Set the inode i_generation number stored for each inode. The '_OLD' - version of this ioctl is an alias for FS_IOC_SETVERSION. - - EXT4_IOC_GROUP_EXTEND - This ioctl has the same purpose as the resize mount option. It allows - to resize filesystem to the end of the last existing block group, - further resize has to be done with resize2fs, either online, or - offline. The argument points to the unsigned logn number representing - the filesystem new block count. - - EXT4_IOC_MOVE_EXT - Move the block extents from orig_fd (the one this ioctl is pointing to) - to the donor_fd (the one specified in move_extent structure passed as - an argument to this ioctl). Then, exchange inode metadata between - orig_fd and donor_fd. This is especially useful for online - defragmentation, because the allocator has the opportunity to allocate - moved blocks better, ideally into one contiguous extent. - - EXT4_IOC_GROUP_ADD - Add a new group descriptor to an existing or new group descriptor - block. The new group descriptor is described by ext4_new_group_input - structure, which is passed as an argument to this ioctl. This is - especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which - allows online resize of the filesystem to the end of the last existing - block group. 
Those two ioctls combined is used in userspace online - resize tool (e.g. resize2fs). - - EXT4_IOC_MIGRATE - This ioctl operates on the filesystem itself. It converts (migrates) - ext3 indirect block mapped inode to ext4 extent mapped inode by walking - through indirect block mapping of the original inode and converting - contiguous block ranges into ext4 extents of the temporary inode. Then, - inodes are swapped. This ioctl might help, when migrating from ext3 to - ext4 filesystem, however suggestion is to create fresh ext4 filesystem - and copy data from the backup. Note, that filesystem has to support - extents for this ioctl to work. - - EXT4_IOC_ALLOC_DA_BLKS - Force all of the delay allocated blocks to be allocated to preserve - application-expected ext3 behaviour. Note that this will also start - triggering a write of the data blocks, but this behaviour may change in - the future as it is not necessary and has been done this way only for - sake of simplicity. - - EXT4_IOC_RESIZE_FS - Resize the filesystem to a new size. The number of blocks of resized - filesystem is passed in via 64 bit integer argument. The kernel - allocates bitmaps and inode table, the userspace tool thus just passes - the new number of blocks. - - EXT4_IOC_SWAP_BOOT - Swap i_blocks and associated attributes (like i_blocks, i_size, - i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO - (#5). This is typically used to store a boot loader in a secure part of - the filesystem, where it can't be changed by a normal user by accident. - The data blocks of the previous boot loader will be associated with the - given inode. 
- -References -========== - -kernel source: - - -programs: http://e2fsprogs.sourceforge.net/ - -useful links: http://fedoraproject.org/wiki/ext3-devel - http://www.bullopensource.org/ext4/ - http://ext4.wiki.kernel.org/index.php/Main_Page - http://fedoraproject.org/wiki/Features/Ext4 diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 71121605558c..427bc115012e 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst @@ -13,5 +13,4 @@ the ext4 community. :maxdepth: 5 :numbered: - ext4 ondisk/index -- cgit v1.2.3 From 8a98ec7c7b3901330a036af0f62f523c31d763da Mon Sep 17 00:00:00 2001 From: Darrick J. Wong Date: Fri, 5 Oct 2018 19:20:08 -0400 Subject: docs: promote the ext4 data structures book to top level Move the ext4 data structures book to Documentation/filesystems/ext4/ since the administrative information moved elsewhere. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- Documentation/conf.py | 4 +- Documentation/filesystems/ext4/about.rst | 44 ++ Documentation/filesystems/ext4/allocators.rst | 56 ++ Documentation/filesystems/ext4/attributes.rst | 191 +++++ Documentation/filesystems/ext4/bigalloc.rst | 22 + Documentation/filesystems/ext4/bitmaps.rst | 28 + Documentation/filesystems/ext4/blockgroup.rst | 135 ++++ Documentation/filesystems/ext4/blockmap.rst | 49 ++ Documentation/filesystems/ext4/blocks.rst | 142 ++++ Documentation/filesystems/ext4/checksums.rst | 73 ++ Documentation/filesystems/ext4/directory.rst | 426 +++++++++++ Documentation/filesystems/ext4/dynamic.rst | 12 + Documentation/filesystems/ext4/eainode.rst | 18 + Documentation/filesystems/ext4/globals.rst | 13 + Documentation/filesystems/ext4/group_descr.rst | 170 +++++ Documentation/filesystems/ext4/ifork.rst | 194 +++++ Documentation/filesystems/ext4/index.rst | 18 +- Documentation/filesystems/ext4/inlinedata.rst | 37 + Documentation/filesystems/ext4/inodes.rst | 576 +++++++++++++++ 
Documentation/filesystems/ext4/journal.rst | 611 ++++++++++++++++ Documentation/filesystems/ext4/mmp.rst | 77 ++ Documentation/filesystems/ext4/ondisk/about.rst | 44 -- .../filesystems/ext4/ondisk/allocators.rst | 56 -- .../filesystems/ext4/ondisk/attributes.rst | 191 ----- Documentation/filesystems/ext4/ondisk/bigalloc.rst | 22 - Documentation/filesystems/ext4/ondisk/bitmaps.rst | 28 - .../filesystems/ext4/ondisk/blockgroup.rst | 135 ---- Documentation/filesystems/ext4/ondisk/blockmap.rst | 49 -- Documentation/filesystems/ext4/ondisk/blocks.rst | 142 ---- .../filesystems/ext4/ondisk/checksums.rst | 73 -- .../filesystems/ext4/ondisk/directory.rst | 426 ----------- Documentation/filesystems/ext4/ondisk/dynamic.rst | 12 - Documentation/filesystems/ext4/ondisk/eainode.rst | 18 - Documentation/filesystems/ext4/ondisk/globals.rst | 13 - .../filesystems/ext4/ondisk/group_descr.rst | 170 ----- Documentation/filesystems/ext4/ondisk/ifork.rst | 194 ----- Documentation/filesystems/ext4/ondisk/index.rst | 9 - .../filesystems/ext4/ondisk/inlinedata.rst | 37 - Documentation/filesystems/ext4/ondisk/inodes.rst | 576 --------------- Documentation/filesystems/ext4/ondisk/journal.rst | 611 ---------------- Documentation/filesystems/ext4/ondisk/mmp.rst | 77 -- Documentation/filesystems/ext4/ondisk/overview.rst | 26 - .../filesystems/ext4/ondisk/special_inodes.rst | 38 - Documentation/filesystems/ext4/ondisk/super.rst | 801 --------------------- Documentation/filesystems/ext4/overview.rst | 26 + Documentation/filesystems/ext4/special_inodes.rst | 38 + Documentation/filesystems/ext4/super.rst | 801 +++++++++++++++++++++ 47 files changed, 3749 insertions(+), 3760 deletions(-) create mode 100644 Documentation/filesystems/ext4/about.rst create mode 100644 Documentation/filesystems/ext4/allocators.rst create mode 100644 Documentation/filesystems/ext4/attributes.rst create mode 100644 Documentation/filesystems/ext4/bigalloc.rst create mode 100644 Documentation/filesystems/ext4/bitmaps.rst 
create mode 100644 Documentation/filesystems/ext4/blockgroup.rst create mode 100644 Documentation/filesystems/ext4/blockmap.rst create mode 100644 Documentation/filesystems/ext4/blocks.rst create mode 100644 Documentation/filesystems/ext4/checksums.rst create mode 100644 Documentation/filesystems/ext4/directory.rst create mode 100644 Documentation/filesystems/ext4/dynamic.rst create mode 100644 Documentation/filesystems/ext4/eainode.rst create mode 100644 Documentation/filesystems/ext4/globals.rst create mode 100644 Documentation/filesystems/ext4/group_descr.rst create mode 100644 Documentation/filesystems/ext4/ifork.rst create mode 100644 Documentation/filesystems/ext4/inlinedata.rst create mode 100644 Documentation/filesystems/ext4/inodes.rst create mode 100644 Documentation/filesystems/ext4/journal.rst create mode 100644 Documentation/filesystems/ext4/mmp.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/about.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/allocators.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/attributes.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/bigalloc.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/bitmaps.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/blockgroup.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/blockmap.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/blocks.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/checksums.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/directory.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/dynamic.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/eainode.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/globals.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/group_descr.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/ifork.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/index.rst delete 
mode 100644 Documentation/filesystems/ext4/ondisk/inlinedata.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/inodes.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/journal.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/mmp.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/overview.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/special_inodes.rst delete mode 100644 Documentation/filesystems/ext4/ondisk/super.rst create mode 100644 Documentation/filesystems/ext4/overview.rst create mode 100644 Documentation/filesystems/ext4/special_inodes.rst create mode 100644 Documentation/filesystems/ext4/super.rst diff --git a/Documentation/conf.py b/Documentation/conf.py index 4d32c01e1e16..ede67ccafc29 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -385,8 +385,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', 'ext4 Community', 'manual'), - ('filesystems/ext4/index', 'ext4.tex', 'ext4 Filesystem', - 'ext4 Filesystem Developers', 'manual'), + ('filesystems/ext4/index', 'ext4-data-structures.tex', + 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', diff --git a/Documentation/filesystems/ext4/about.rst b/Documentation/filesystems/ext4/about.rst new file mode 100644 index 000000000000..0aadba052264 --- /dev/null +++ b/Documentation/filesystems/ext4/about.rst @@ -0,0 +1,44 @@ +.. SPDX-License-Identifier: GPL-2.0 + +About this Book +=============== + +This document attempts to describe the on-disk format for ext4 +filesystems. The same general ideas should apply to ext2/3 filesystems +as well, though they do not support all the features that ext4 supports, +and the fields will be shorter. 
+ +**NOTE**: This is a work in progress, based on notes that the author +(djwong) made while picking apart a filesystem by hand. The data +structure definitions should be current as of Linux 4.18 and +e2fsprogs-1.44. All comments and corrections are welcome, since there is +undoubtedly plenty of lore that might not be reflected in freshly +created demonstration filesystems. + +License +------- +This book is licensed under the terms of the GNU Public License, v2. + +Terminology +----------- + +ext4 divides a storage device into an array of logical blocks both to +reduce bookkeeping overhead and to increase throughput by forcing larger +transfer sizes. Generally, the block size will be 4KiB (the same size as +pages on x86 and the block layer's default block size), though the +actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes. +Throughout this document, disk locations are given in terms of these +logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of +convenience, the logical block size will be referred to as +``$block_size`` throughout the rest of the document. + +When referenced in ``preformatted text`` blocks, ``sb`` refers to fields +in the super block, and ``inode`` refers to fields in an inode table +entry. + +Other References +---------------- + +Also see http://www.nongnu.org/ext2-doc/ for quite a collection of +information about ext2/3. Here's another old reference: +http://wiki.osdev.org/Ext2 diff --git a/Documentation/filesystems/ext4/allocators.rst b/Documentation/filesystems/ext4/allocators.rst new file mode 100644 index 000000000000..7aa85152ace3 --- /dev/null +++ b/Documentation/filesystems/ext4/allocators.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block and Inode Allocation Policy +--------------------------------- + +ext4 recognizes (better than ext3, anyway) that data locality is +generally a desirable quality of a filesystem.
On a spinning disk, +keeping related blocks near each other reduces the amount of movement +that the head actuator and disk must perform to access a data block, +thus speeding up disk IO. On an SSD there of course are no moving parts, +but locality can increase the size of each transfer request while +reducing the total number of requests. This locality may also have the +effect of concentrating writes on a single erase block, which can speed +up file rewrites significantly. Therefore, it is useful to reduce +fragmentation whenever possible. + +The first tool that ext4 uses to combat fragmentation is the multi-block +allocator. When a file is first created, the block allocator +speculatively allocates 8KiB of disk space to the file on the assumption +that the space will get written soon. When the file is closed, the +unused speculative allocations are of course freed, but if the +speculation is correct (typically the case for full writes of small +files) then the file data gets written out in a single multi-block +extent. A second related trick that ext4 uses is delayed allocation. +Under this scheme, when a file needs more blocks to absorb file writes, +the filesystem defers deciding the exact placement on the disk until all +the dirty buffers are being written out to disk. By not committing to a +particular placement until it's absolutely necessary (the commit timeout +is hit, or sync() is called, or the kernel runs out of memory), the hope +is that the filesystem can make better location decisions. + +The third trick that ext4 (and ext3) uses is that it tries to keep a +file's data blocks in the same block group as its inode. This cuts down +on the seek penalty when the filesystem first has to read a file's inode +to learn where the file's data blocks live and then seek over to the +file's data blocks to begin I/O operations. + +The fourth trick is that all the inodes in a directory are placed in the +same block group as the directory, when feasible. 
The working assumption +here is that all the files in a directory might be related, therefore it +is useful to try to keep them all together. + +The fifth trick is that the disk volume is cut up into 128MB block +groups; these mini-containers are used as outlined above to try to +maintain data locality. However, there is a deliberate quirk -- when a +directory is created in the root directory, the inode allocator scans +the block groups and puts that directory into the least heavily loaded +block group that it can find. This encourages directories to spread out +over a disk; as the top-level directory/file blobs fill up one block +group, the allocators simply move on to the next block group. Allegedly +this scheme evens out the loading on the block groups, though the author +suspects that the directories which are so unlucky as to land towards +the end of a spinning drive get a raw deal performance-wise. + +Of course if all of these mechanisms fail, one can always use e4defrag +to defragment files. diff --git a/Documentation/filesystems/ext4/attributes.rst b/Documentation/filesystems/ext4/attributes.rst new file mode 100644 index 000000000000..54386a010a8d --- /dev/null +++ b/Documentation/filesystems/ext4/attributes.rst @@ -0,0 +1,191 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Extended Attributes +------------------- + +Extended attributes (xattrs) are typically stored in a separate data +block on the disk and referenced from inodes via ``inode.i_file_acl*``. +The first use of extended attributes seems to have been for storing file +ACLs and other security data (selinux). With the ``user_xattr`` mount +option it is possible for users to store extended attributes so long as +all attribute names begin with “user”; this restriction seems to have +disappeared as of Linux 3.0. + +There are two places where extended attributes can be found. The first +place is between the end of each inode entry and the beginning of the +next inode entry. 
For example, if inode.i\_extra\_isize = 28 and +sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes +available for in-inode extended attribute storage. The second place +where extended attributes can be found is in the block pointed to by +``inode.i_file_acl``. As of Linux 3.11, it is not possible for this +block to contain a pointer to a second extended attribute block (or even +the remaining blocks of a cluster). In theory it is possible for each +attribute's value to be stored in a separate data block, though as of +Linux 3.11 the code does not permit this. + +Keys are generally assumed to be ASCIIZ strings, whereas values can be +strings or binary data. + +Extended attributes, when stored after the inode, have a header +``ext4_xattr_ibody_header`` that is 4 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_magic + - Magic number for identification, 0xEA020000. This value is set by the + Linux driver, though e2fsprogs doesn't seem to check it(?) + +The beginning of an extended attribute block is in +``struct ext4_xattr_header``, which is 32 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_magic + - Magic number for identification, 0xEA020000. + * - 0x4 + - \_\_le32 + - h\_refcount + - Reference count. + * - 0x8 + - \_\_le32 + - h\_blocks + - Number of disk blocks used. + * - 0xC + - \_\_le32 + - h\_hash + - Hash value of all attributes. + * - 0x10 + - \_\_le32 + - h\_checksum + - Checksum of the extended attribute block. + * - 0x14 + - \_\_u32 + - h\_reserved[2] + - Zero. + +The checksum is calculated against the FS UUID, the 64-bit block number +of the extended attribute block, and the entire block (header + +entries). 
+ +Following the ``struct ext4_xattr_header`` or +``struct ext4_xattr_ibody_header`` is an array of +``struct ext4_xattr_entry``; each of these entries is at least 16 bytes +long. When stored in an external block, the ``struct ext4_xattr_entry`` +entries must be stored in sorted order. The sort order is +``e_name_index``, then ``e_name_len``, and finally ``e_name``. +Attributes stored inside an inode do not need to be stored in sorted order. + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_u8 + - e\_name\_len + - Length of name. + * - 0x1 + - \_\_u8 + - e\_name\_index + - Attribute name index. There is a discussion of this below. + * - 0x2 + - \_\_le16 + - e\_value\_offs + - Location of this attribute's value on the disk block where it is stored. + Multiple attributes can share the same value. For an inode attribute + this value is relative to the start of the first entry; for a block this + value is relative to the start of the block (i.e. the header). + * - 0x4 + - \_\_le32 + - e\_value\_inum + - The inode where the value is stored. Zero indicates the value is in the + same block as this entry. This field is only used if the + INCOMPAT\_EA\_INODE feature is enabled. + * - 0x8 + - \_\_le32 + - e\_value\_size + - Length of attribute value. + * - 0xC + - \_\_le32 + - e\_hash + - Hash value of attribute name and attribute value. The kernel doesn't + update the hash for in-inode attributes, so for that case this value + must be zero, because e2fsck validates any non-zero hash regardless of + where the xattr lives. + * - 0x10 + - char + - e\_name[e\_name\_len] + - Attribute name. Does not include trailing NULL. + +Attribute values can follow the end of the entry table. There appears to +be a requirement that they be aligned to 4-byte boundaries. The values +are stored starting at the end of the block and grow towards the +xattr\_header/xattr\_entry table.
When the two collide, the overflow is +put into a separate disk block. If the disk block fills up, the +filesystem returns -ENOSPC. + +The first four fields of the ``ext4_xattr_entry`` are set to zero to +mark the end of the key list. + +Attribute Name Indices +~~~~~~~~~~~~~~~~~~~~~~ + +Logically speaking, extended attributes are a series of key=value pairs. +The keys are assumed to be NULL-terminated strings. To reduce the amount +of on-disk space that the keys consume, the beginning of the key string +is matched against the attribute name index. If a match is found, the +attribute name index field is set, and matching string is removed from +the key name. Here is a map of name index values to key prefixes: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Name Index + - Key Prefix + * - 0 + - (no prefix) + * - 1 + - “user.” + * - 2 + - “system.posix\_acl\_access” + * - 3 + - “system.posix\_acl\_default” + * - 4 + - “trusted.” + * - 6 + - “security.” + * - 7 + - “system.” (inline\_data only?) + * - 8 + - “system.richacl” (SuSE kernels only?) + +For example, if the attribute key is “user.fubar”, the attribute name +index is set to 1 and the “fubar” name is recorded on disk. + +POSIX ACLs +~~~~~~~~~~ + +POSIX ACLs are stored in a reduced version of the Linux kernel (and +libacl's) internal ACL format. The key difference is that the version +number is different (1) and the ``e_id`` field is only stored for named +user and group ACLs. diff --git a/Documentation/filesystems/ext4/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst new file mode 100644 index 000000000000..c6d88557553c --- /dev/null +++ b/Documentation/filesystems/ext4/bigalloc.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Bigalloc +-------- + +At the moment, the default size of a block is 4KiB, which is a commonly +supported page size on most MMU-capable hardware. 
This is fortunate, as +ext4 code is not prepared to handle the case where the block size +exceeds the page size. However, for a filesystem of mostly huge files, +it is desirable to be able to allocate disk blocks in units of multiple +blocks to reduce both fragmentation and metadata overhead. The +`bigalloc `__ feature provides exactly this ability. The +administrator can set a block cluster size at mkfs time (which is stored +in the s\_log\_cluster\_size field in the superblock); from then on, the +block bitmaps track clusters, not individual blocks. This means that +block groups can be several gigabytes in size (instead of just 128MiB); +however, the minimum allocation unit becomes a cluster, not a block, +even for directories. TaoBao had a patchset to extend the “use units of +clusters instead of blocks” to the extent tree, though it is not clear +where those patches went-- they eventually morphed into “extent tree v2” +but that code has not landed as of May 2015. + diff --git a/Documentation/filesystems/ext4/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst new file mode 100644 index 000000000000..c7546dbc197a --- /dev/null +++ b/Documentation/filesystems/ext4/bitmaps.rst @@ -0,0 +1,28 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block and inode Bitmaps +----------------------- + +The data block bitmap tracks the usage of data blocks within the block +group. + +The inode bitmap records which entries in the inode table are in use. + +As with most bitmaps, one bit represents the usage status of one data +block or inode table entry. This implies a block group size of 8 \* +number\_of\_bytes\_in\_a\_logical\_block. + +NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts +of the kernel and e2fsprogs code pretend that the block bitmap contains +zeros (i.e. all blocks in the group are free). However, it is not +necessarily the case that no blocks are in use -- if ``meta_bg`` is set, +the bitmaps and group descriptor live inside the group.
Unfortunately, +ext2fs\_test\_block\_bitmap2() will return '0' for those locations, +which produces confusing debugfs output. + +Inode Table +----------- +Inode tables are statically allocated at mkfs time. Each block group +descriptor points to the start of the table, and the superblock records +the number of inodes per group. See the section on inodes for more +information. diff --git a/Documentation/filesystems/ext4/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst new file mode 100644 index 000000000000..baf888e4c06a --- /dev/null +++ b/Documentation/filesystems/ext4/blockgroup.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Layout +------ + +The layout of a standard block group is approximately as follows (each +of these fields is discussed in a separate section below): + +.. list-table:: + :widths: 1 1 1 1 1 1 1 1 + :header-rows: 1 + + * - Group 0 Padding + - ext4 Super Block + - Group Descriptors + - Reserved GDT Blocks + - Data Block Bitmap + - inode Bitmap + - inode Table + - Data Blocks + * - 1024 bytes + - 1 block + - many blocks + - many blocks + - 1 block + - 1 block + - many blocks + - many more blocks + +For the special case of block group 0, the first 1024 bytes are unused, +to allow for the installation of x86 boot sectors and other oddities. +The superblock will start at offset 1024 bytes, whichever block that +happens to be (usually 0). However, if for some reason the block size = +1024, then block 0 is marked in use and the superblock goes in block 1. +For all other block groups, there is no padding. + +The ext4 driver primarily works with the superblock and the group +descriptors that are found in block group 0. Redundant copies of the +superblock and group descriptors are written to some of the block groups +across the disk in case the beginning of the disk gets trashed, though +not all block groups necessarily host a redundant copy (see following +paragraph for more details). 
If the group does not have a redundant +copy, the block group begins with the data block bitmap. Note also that +when the filesystem is freshly formatted, mkfs will allocate “reserve +GDT block” space after the block group descriptors and before the start +of the block bitmaps to allow for future expansion of the filesystem. By +default, a filesystem is allowed to increase in size by a factor of +1024x over the original filesystem size. + +The location of the inode table is given by ``grp.bg_inode_table_*``. It +is a continuous range of blocks large enough to contain +``sb.s_inodes_per_group * sb.s_inode_size`` bytes. + +As for the ordering of items in a block group, it is generally +established that the super block and the group descriptor table, if +present, will be at the beginning of the block group. The bitmaps and +the inode table can be anywhere, and it is quite possible for the +bitmaps to come after the inode table, or for both to be in different +groups (flex\_bg). Leftover space is used for file data blocks, indirect +block maps, extent tree blocks, and extended attributes. + +Flexible Block Groups +--------------------- + +Starting in ext4, there is a new feature called flexible block groups +(flex\_bg). In a flex\_bg, several block groups are tied together as one +logical block group; the bitmap spaces and the inode table space in the +first block group of the flex\_bg are expanded to include the bitmaps +and inode tables of all other block groups in the flex\_bg. For example, +if the flex\_bg size is 4, then group 0 will contain (in order) the +superblock, group descriptors, data block bitmaps for groups 0-3, inode +bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining +space in group 0 is for file data. The effect of this is to group the +block metadata close together for faster loading, and to enable large +files to be continuous on disk.
Backup copies of the superblock and +group descriptors are always at the beginning of block groups, even if +flex\_bg is enabled. The number of block groups that make up a flex\_bg +is given by 2 ^ ``sb.s_log_groups_per_flex``. + +Meta Block Groups +----------------- + +Without the option META\_BG, for safety concerns, all block group +descriptor copies are kept in the first block group. Given the default +128MiB (2^27 bytes) block group size and 64-byte group descriptors, ext4 +can have at most 2^27/64 = 2^21 block groups. This limits the entire +filesystem size to 2^21 ∗ 2^27 = 2^48 bytes or 256TiB. + +The solution to this problem is to use the metablock group feature +(META\_BG), which is already in ext3 for all 2.6 releases. With the +META\_BG feature, ext4 filesystems are partitioned into many metablock +groups. Each metablock group is a cluster of block groups whose group +descriptor structures can be stored in a single disk block. For ext4 +filesystems with 4 KB block size, a single metablock group partition +includes 64 block groups, or 8 GiB of disk space. The metablock group +feature moves the location of the group descriptors from the congested +first block group of the whole filesystem into the first group of each +metablock group itself. The backups are in the second and last group of +each metablock group. This increases the 2^21 maximum block groups limit +to the hard limit 2^32, allowing support for a 512PiB filesystem. + +The change in the filesystem format replaces the current scheme where +the superblock is followed by a variable-length set of block group +descriptors. Instead, the superblock and a single block group descriptor +block are placed at the beginning of the first, second, and last block +groups in a meta-block group. A meta-block group is a collection of +block groups which can be described by a single block group descriptor +block.
Since the size of the block group descriptor structure is 32 +bytes, a meta-block group contains 32 block groups for filesystems with +a 1KB block size, and 128 block groups for filesystems with a 4KB +blocksize. Filesystems can either be created using this new block group +descriptor layout, or existing filesystems can be resized on-line, and +the field s\_first\_meta\_bg in the superblock will indicate the first +block group using this new layout. + +Please see an important note about ``BLOCK_UNINIT`` in the section about +block and inode bitmaps. + +Lazy Block Group Initialization +------------------------------- + +New in ext4 are three block group descriptor flags that +enable mkfs to skip initializing other parts of the block group +metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean +that the inode and block bitmaps for that group can be calculated and +therefore the on-disk bitmap blocks are not initialized. This is +generally the case for an empty block group or a block group containing +only fixed-location block group metadata. The INODE\_ZEROED flag means +that the inode table has been initialized; mkfs will unset this flag and +rely on the kernel to initialize the inode tables in the background. + +By not writing zeroes to the bitmaps and inode table, mkfs time is +reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, +but the dumpe2fs output prints this as “uninit\_bg”. They are the same +thing. diff --git a/Documentation/filesystems/ext4/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst new file mode 100644 index 000000000000..30e25750d88a --- /dev/null +++ b/Documentation/filesystems/ext4/blockmap.rst @@ -0,0 +1,49 @@ +..
SPDX-License-Identifier: GPL-2.0 + ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| i.i\_block Offset | Where It Points | ++=====================+==============================================================================================================================================================================================================================+ +| 0 to 11 | Direct map to file blocks 0 to 11. | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) | +| | | +| | +------------------------------+--------------------------------------------------------------------+ | +| | | Indirect Block Offset | Where It Points | | +| | +==============================+====================================================================+ | +| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | +| | +------------------------------+--------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) | +| | | +| | 
+--------------------------------+---------------------------------------------------------------------------------------------------------+ | +| | | Double Indirect Block Offset | Where It Points | | +| | +================================+=========================================================================================================+ | +| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | +| | | | | | +| | | | +------------------------------+--------------------------------------------------------------------+ | | +| | | | | Indirect Block Offset | Where It Points | | | +| | | | +==============================+====================================================================+ | | +| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | +| | | | +------------------------------+--------------------------------------------------------------------+ | | +| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) | +| | | +| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | +| | | Triple Indirect Block Offset | Where It Points | | +| | 
+================================+================================================================================================================================================+ | +| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | | +| | | | | | +| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | +| | | | | Double Indirect Block Offset | Where It Points | | | +| | | | +================================+=========================================================================================================+ | | +| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | | +| | | | | | | | | +| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | +| | | | | | | Indirect Block Offset | Where It Points | | | | +| | | | | | +==============================+====================================================================+ | | | +| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | +| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | +| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | +| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | ++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/Documentation/filesystems/ext4/blocks.rst 
b/Documentation/filesystems/ext4/blocks.rst new file mode 100644 index 000000000000..73d4dc0f7bda --- /dev/null +++ b/Documentation/filesystems/ext4/blocks.rst @@ -0,0 +1,142 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Blocks +------ + +ext4 allocates storage space in units of “blocks”. A block is a group of +sectors between 1KiB and 64KiB, and the number of sectors must be an +integral power of 2. Blocks are in turn grouped into larger units called +block groups. Block size is specified at mkfs time and typically is +4KiB. You may experience mounting problems if block size is greater than +page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory +pages). By default a filesystem can contain 2^32 blocks; if the '64bit' +feature is enabled, then a filesystem can have 2^64 blocks. + +For 32-bit filesystems, limits are as follows: + +.. list-table:: + :widths: 1 1 1 1 1 + :header-rows: 1 + + * - Item + - 1KiB + - 2KiB + - 4KiB + - 64KiB + * - Blocks + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Inodes + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - File System Size + - 4TiB + - 8TiB + - 16TiB + - 256PiB + * - Blocks Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Inodes Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Block Group Size + - 8MiB + - 32MiB + - 128MiB + - 32GiB + * - Blocks Per File, Extents + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Blocks Per File, Block Maps + - 16,843,020 + - 134,480,396 + - 1,074,791,436 + - 4,398,314,962,956 (really 2^32 due to field size limitations) + * - File Size, Extents + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - File Size, Block Maps + - 16GiB + - 256GiB + - 4TiB + - 256TiB + +For 64-bit filesystems, limits are as follows: + +.. 
list-table:: + :widths: 1 1 1 1 1 + :header-rows: 1 + + * - Item + - 1KiB + - 2KiB + - 4KiB + - 64KiB + * - Blocks + - 2^64 + - 2^64 + - 2^64 + - 2^64 + * - Inodes + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - File System Size + - 16ZiB + - 32ZiB + - 64ZiB + - 1YiB + * - Blocks Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Inodes Per Block Group + - 8,192 + - 16,384 + - 32,768 + - 524,288 + * - Block Group Size + - 8MiB + - 32MiB + - 128MiB + - 32GiB + * - Blocks Per File, Extents + - 2^32 + - 2^32 + - 2^32 + - 2^32 + * - Blocks Per File, Block Maps + - 16,843,020 + - 134,480,396 + - 1,074,791,436 + - 4,398,314,962,956 (really 2^32 due to field size limitations) + * - File Size, Extents + - 4TiB + - 8TiB + - 16TiB + - 256TiB + * - File Size, Block Maps + - 16GiB + - 256GiB + - 4TiB + - 256TiB + +Note: Files not using extents (i.e. files using block maps) must be +placed within the first 2^32 blocks of a filesystem. Files with extents +must be placed within the first 2^48 blocks of a filesystem. It's not +clear what happens with larger filesystems. diff --git a/Documentation/filesystems/ext4/checksums.rst b/Documentation/filesystems/ext4/checksums.rst new file mode 100644 index 000000000000..5519e253810d --- /dev/null +++ b/Documentation/filesystems/ext4/checksums.rst @@ -0,0 +1,73 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Checksums +--------- + +Starting in early 2012, metadata checksums were added to all major ext4 +and jbd2 data structures. The associated feature flag is metadata\_csum. +The desired checksum algorithm is indicated in the superblock, though as +of October 2012 the only supported algorithm is crc32c. Some data +structures did not have space to fit a full 32-bit checksum, so only the +lower 16 bits are stored. Enabling the 64bit feature increases the data +structure size so that full 32-bit checksums can be stored for many data +structures. 
However, existing 32-bit filesystems cannot be extended to +enable 64bit mode, at least not without the experimental resize2fs +patches to do so. + +Existing filesystems can have checksumming added by running +``tune2fs -O metadata_csum`` against the underlying device. If tune2fs +encounters directory blocks that lack sufficient empty space to add a +checksum, it will request that you run ``e2fsck -D`` to have the +directories rebuilt with checksums. This has the added benefit of +removing slack space from the directory files and rebalancing the htree +indexes. If you \_ignore\_ this step, your directories will not be +protected by a checksum! + +The following table describes the data elements that go into each type +of checksum. The checksum function is whatever the superblock describes +(crc32c as of October 2013) unless noted otherwise. + +.. list-table:: + :widths: 20 8 50 + :header-rows: 1 + + * - Metadata + - Length + - Ingredients + * - Superblock + - \_\_le32 + - The entire superblock up to the checksum field. The UUID lives inside + the superblock. + * - MMP + - \_\_le32 + - UUID + the entire MMP block up to the checksum field. + * - Extended Attributes + - \_\_le32 + - UUID + the entire extended attribute block. The checksum field is set to + zero. + * - Directory Entries + - \_\_le32 + - UUID + inode number + inode generation + the directory block up to the + fake entry enclosing the checksum field. + * - HTREE Nodes + - \_\_le32 + - UUID + inode number + inode generation + all valid extents + HTREE tail. + The checksum field is set to zero. + * - Extents + - \_\_le32 + - UUID + inode number + inode generation + the entire extent block up to + the checksum field. + * - Bitmaps + - \_\_le32 or \_\_le16 + - UUID + the entire bitmap. Checksums are stored in the group descriptor, + and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) + * - Inodes + - \_\_le32 + - UUID + inode number + inode generation + the entire inode. 
The checksum + field is set to zero. Each inode has its own checksum. + * - Group Descriptors + - \_\_le16 + - If metadata\_csum, then UUID + group number + the entire descriptor; + else if gdt\_csum, then crc16(UUID + group number + the entire + descriptor). In all cases, only the lower 16 bits are stored. + diff --git a/Documentation/filesystems/ext4/directory.rst b/Documentation/filesystems/ext4/directory.rst new file mode 100644 index 000000000000..614034e24669 --- /dev/null +++ b/Documentation/filesystems/ext4/directory.rst @@ -0,0 +1,426 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Directory Entries +----------------- + +In an ext4 filesystem, a directory is more or less a flat file that maps +an arbitrary byte string (usually ASCII) to an inode number on the +filesystem. There can be many directory entries across the filesystem +that reference the same inode number--these are known as hard links, and +that is why hard links cannot reference files on other filesystems. As +such, directory entries are found by reading the data block(s) +associated with a directory file for the particular directory entry that +is desired. + +Linear (Classic) Directories +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, each directory lists its entries in an “almost-linear” +array. I write “almost” because it's not a linear array in the memory +sense because directory entries are not split across filesystem blocks. +Therefore, it is more accurate to say that a directory is a series of +data blocks and that each block contains a linear array of directory +entries. The end of each per-block array is signified by reaching the +end of the block; the last entry in the block has a record length that +takes it all the way to the end of the block. The end of the entire +directory is of course signified by reaching the end of the file. Unused +directory entries are signified by inode = 0. 
By default the filesystem +uses ``struct ext4_dir_entry_2`` for directory entries unless the +“filetype” feature flag is not set, in which case it uses +``struct ext4_dir_entry``. + +The original directory entry format is ``struct ext4_dir_entry``, which +is at most 263 bytes long, though on disk you'll need to reference +``dirent.rec_len`` to know for sure. + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - inode + - Number of the inode that this directory entry points to. + * - 0x4 + - \_\_le16 + - rec\_len + - Length of this directory entry. Must be a multiple of 4. + * - 0x6 + - \_\_le16 + - name\_len + - Length of the file name. + * - 0x8 + - char + - name[EXT4\_NAME\_LEN] + - File name. + +Since file names cannot be longer than 255 bytes, the new directory +entry format shortens the name\_len field and uses the space for a file +type flag, probably to avoid having to load every inode during directory +tree traversal. This format is ``ext4_dir_entry_2``, which is at most +263 bytes long, though on disk you'll need to reference +``dirent.rec_len`` to know for sure. + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - inode + - Number of the inode that this directory entry points to. + * - 0x4 + - \_\_le16 + - rec\_len + - Length of this directory entry. + * - 0x6 + - \_\_u8 + - name\_len + - Length of the file name. + * - 0x7 + - \_\_u8 + - file\_type + - File type code, see ftype_ table below. + * - 0x8 + - char + - name[EXT4\_NAME\_LEN] + - File name. + +.. _ftype: + +The directory file type is one of the following values: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Unknown. + * - 0x1 + - Regular file. + * - 0x2 + - Directory. + * - 0x3 + - Character device file. + * - 0x4 + - Block device file. + * - 0x5 + - FIFO. + * - 0x6 + - Socket.
+ * - 0x7 + - Symbolic link. + +In order to add checksums to these classic directory blocks, a phony +``struct ext4_dir_entry`` is placed at the end of each leaf block to +hold the checksum. The directory entry is 12 bytes long. The inode +number and name\_len fields are set to zero to fool old software into +ignoring an apparently empty directory entry, and the checksum is stored +in the place where the name normally goes. The structure is +``struct ext4_dir_entry_tail``: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - det\_reserved\_zero1 + - Inode number, which must be zero. + * - 0x4 + - \_\_le16 + - det\_rec\_len + - Length of this directory entry, which must be 12. + * - 0x6 + - \_\_u8 + - det\_reserved\_zero2 + - Length of the file name, which must be zero. + * - 0x7 + - \_\_u8 + - det\_reserved\_ft + - File type, which must be 0xDE. + * - 0x8 + - \_\_le32 + - det\_checksum + - Directory leaf block checksum. + +The leaf directory block checksum is calculated against the FS UUID, the +directory's inode number, the directory's inode generation number, and +the entire directory entry block up to (but not including) the fake +directory entry. + +Hash Tree Directories +~~~~~~~~~~~~~~~~~~~~~ + +A linear array of directory entries isn't great for performance, so a +new feature was added to ext3 to provide a faster (but peculiar) +balanced tree keyed off a hash of the directory entry name. If the +EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a +hashed btree (htree) to organize and find directory entries. For +backwards read-only compatibility with ext2, this tree is actually +hidden inside the directory file, masquerading as “empty” directory data +blocks! 
It was stated previously that the end of the linear directory +entry table was signified with an entry pointing to inode 0; this is +(ab)used to fool the old linear-scan algorithm into thinking that the +rest of the directory block is empty so that it moves on. + +The root of the tree always lives in the first data block of the +directory. By ext2 custom, the '.' and '..' entries must appear at the +beginning of this first block, so they are put here as two +``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of +the root node contains metadata about the tree and finally a hash->block +map to find nodes that are lower in the htree. If +``dx_root.info.indirect_levels`` is non-zero then the htree has two +levels; the data block pointed to by the root node's map is an interior +node, which is indexed by a minor hash. Interior nodes in this tree +contain a zeroed out ``struct ext4_dir_entry_2`` followed by a +minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear +array of all ``struct ext4_dir_entry_2``; all of these entries +(presumably) hash to the same value. If there is an overflow, the +entries simply overflow into the next leaf node, and the +least-significant bit of the hash (in the interior node map) that gets +us to this next leaf node is set. + +To traverse the directory as an htree, the code calculates the hash of +the desired file name and uses it to find the corresponding block +number. If the tree is flat, the block is a linear array of directory +entries that can be searched; otherwise, the minor hash of the file name +is computed and used against this second block to find the corresponding +third block number. That third block will be a linear array of +directory entries. + +To traverse the directory as a linear array (such as the old code does), +the code simply reads every data block in the directory. The blocks used +for the htree will appear to have no entries (aside from '.'
and '..') +and so only the leaf nodes will appear to have any interesting content. + +The root of the htree is in ``struct dx_root``, which is the full length +of a data block: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - dot.inode + - inode number of this directory. + * - 0x4 + - \_\_le16 + - dot.rec\_len + - Length of this record, 12. + * - 0x6 + - u8 + - dot.name\_len + - Length of the name, 1. + * - 0x7 + - u8 + - dot.file\_type + - File type of this entry, 0x2 (directory) (if the feature flag is set). + * - 0x8 + - char + - dot.name[4] + - “.\\0\\0\\0” + * - 0xC + - \_\_le32 + - dotdot.inode + - inode number of parent directory. + * - 0x10 + - \_\_le16 + - dotdot.rec\_len + - block\_size - 12. The record length is long enough to cover all htree + data. + * - 0x12 + - u8 + - dotdot.name\_len + - Length of the name, 2. + * - 0x13 + - u8 + - dotdot.file\_type + - File type of this entry, 0x2 (directory) (if the feature flag is set). + * - 0x14 + - char + - dotdot\_name[4] + - “..\\0\\0” + * - 0x18 + - \_\_le32 + - struct dx\_root\_info.reserved\_zero + - Zero. + * - 0x1C + - u8 + - struct dx\_root\_info.hash\_version + - Hash type, see dirhash_ table below. + * - 0x1D + - u8 + - struct dx\_root\_info.info\_length + - Length of the tree information, 0x8. + * - 0x1E + - u8 + - struct dx\_root\_info.indirect\_levels + - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR + feature is set; cannot be larger than 2 otherwise. + * - 0x1F + - u8 + - struct dx\_root\_info.unused\_flags + - + * - 0x20 + - \_\_le16 + - limit + - Maximum number of dx\_entries that can follow this header, plus 1 for + the header itself. + * - 0x22 + - \_\_le16 + - count + - Actual number of dx\_entries that follow this header, plus 1 for the + header itself. + * - 0x24 + - \_\_le32 + - block + - The block number (within the directory file) that goes with hash=0. 
+ * - 0x28 + - struct dx\_entry + - entries[0] + - As many 8-byte ``struct dx_entry`` as fit in the rest of the data block. + +.. _dirhash: + +The directory hash is one of the following values: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Legacy. + * - 0x1 + - Half MD4. + * - 0x2 + - Tea. + * - 0x3 + - Legacy, unsigned. + * - 0x4 + - Half MD4, unsigned. + * - 0x5 + - Tea, unsigned. + +Interior nodes of an htree are recorded as ``struct dx_node``, which is +also the full length of a data block: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - fake.inode + - Zero, to make it look like this entry is not in use. + * - 0x4 + - \_\_le16 + - fake.rec\_len + - The size of the block, in order to hide all of the dx\_node data. + * - 0x6 + - u8 + - name\_len + - Zero. There is no name for this “unused” directory entry. + * - 0x7 + - u8 + - file\_type + - Zero. There is no file type for this “unused” directory entry. + * - 0x8 + - \_\_le16 + - limit + - Maximum number of dx\_entries that can follow this header, plus 1 for + the header itself. + * - 0xA + - \_\_le16 + - count + - Actual number of dx\_entries that follow this header, plus 1 for the + header itself. + * - 0xC + - \_\_le32 + - block + - The block number (within the directory file) that goes with the lowest + hash value of this block. This value is stored in the parent block. + * - 0x10 + - struct dx\_entry + - entries[0] + - As many 8-byte ``struct dx_entry`` as fit in the rest of the data block. + +The hash maps that exist in both ``struct dx_root`` and +``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes +long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - hash + - Hash code.
+ * - 0x4 + - \_\_le32 + - block + - Block number (within the directory file, not filesystem blocks) of the + next node in the htree. + +(If you think this is all quite clever and peculiar, so does the +author.) + +If metadata checksums are enabled, the last 8 bytes of the directory +block (precisely the length of one dx\_entry) are used to store a +``struct dx_tail``, which contains the checksum. The ``limit`` and +``count`` entries in the dx\_root/dx\_node structures are adjusted as +necessary to fit the dx\_tail into the block. If there is no space for +the dx\_tail, the user is notified to run e2fsck -D to rebuild the +directory index (which will ensure that there's space for the checksum). +The dx\_tail structure is 8 bytes long and looks like this: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - u32 + - dt\_reserved + - Zero. + * - 0x4 + - \_\_le32 + - dt\_checksum + - Checksum of the htree directory block. + +The checksum is calculated against the FS UUID, the htree index header +(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in +use, and the tail block (dx\_tail). diff --git a/Documentation/filesystems/ext4/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst new file mode 100644 index 000000000000..bb0c84333341 --- /dev/null +++ b/Documentation/filesystems/ext4/dynamic.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Dynamic Structures +================== + +Dynamic metadata are created on the fly when files and blocks are +allocated to files. + +.. include:: inodes.rst +.. include:: ifork.rst +.. include:: directory.rst +.. include:: attributes.rst diff --git a/Documentation/filesystems/ext4/eainode.rst b/Documentation/filesystems/ext4/eainode.rst new file mode 100644 index 000000000000..ecc0d01a0a72 --- /dev/null +++ b/Documentation/filesystems/ext4/eainode.rst @@ -0,0 +1,18 @@ +..
SPDX-License-Identifier: GPL-2.0 + +Large Extended Attribute Values +------------------------------- + +To enable ext4 to store extended attribute values that do not fit in the +inode or in the single extended attribute block attached to an inode, +the EA\_INODE feature allows us to store the value in the data blocks of +a regular file inode. This “EA inode” is linked only from the extended +attribute name index and must not appear in a directory entry. The +inode's i\_atime field is used to store a checksum of the xattr value; +and i\_ctime/i\_version store a 64-bit reference count, which enables +sharing of large xattr values between multiple owning inodes. For +backward compatibility with older versions of this feature, the +i\_mtime/i\_generation *may* store a back-reference to the inode number +and i\_generation of the **one** owning inode (in cases where the EA +inode is not referenced by multiple inodes) to verify that the EA inode +is the correct one being accessed. diff --git a/Documentation/filesystems/ext4/globals.rst b/Documentation/filesystems/ext4/globals.rst new file mode 100644 index 000000000000..368bf7662b96 --- /dev/null +++ b/Documentation/filesystems/ext4/globals.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Global Structures +================= + +The filesystem is sharded into a number of block groups, each of which +has static metadata at fixed locations. + +.. include:: super.rst +.. include:: group_descr.rst +.. include:: bitmaps.rst +.. include:: mmp.rst +.. include:: journal.rst diff --git a/Documentation/filesystems/ext4/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst new file mode 100644 index 000000000000..0f783ed88592 --- /dev/null +++ b/Documentation/filesystems/ext4/group_descr.rst @@ -0,0 +1,170 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Block Group Descriptors +----------------------- + +Each block group on the filesystem has one of these descriptors +associated with it.
As noted in the Layout section above, the group +descriptors (if present) are the second item in the block group. The +standard configuration is for each block group to contain a full copy of +the block group descriptor table unless the sparse\_super feature flag +is set. + +Notice how the group descriptor records the location of both bitmaps and +the inode table (i.e. they can float). This means that within a block +group, the only data structures with fixed locations are the superblock +and the group descriptor table. The flex\_bg mechanism uses this +property to group several block groups into a flex group and lay out all +of the groups' bitmaps and inode tables into one long run in the first +group of the flex group. + +If the meta\_bg feature flag is set, then several block groups are +grouped together into a meta group. Note that in the meta\_bg case, +however, the first and last two block groups within the larger meta +group contain only group descriptors for the groups inside the meta +group. + +flex\_bg and meta\_bg do not appear to be mutually exclusive features. + +In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the +block group descriptor was only 32 bytes long and therefore ends at +bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the +block group descriptor expands to at least the 64 bytes described below; +the size is stored in the superblock. + +If gdt\_csum is set and metadata\_csum is not set, the block group +checksum is the crc16 of the FS UUID, the group number, and the group +descriptor structure. If metadata\_csum is set, then the block group +checksum is the lower 16 bits of the checksum of the FS UUID, the group +number, and the group descriptor structure. Both block and inode bitmap +checksums are calculated against the FS UUID, the group number, and the +entire bitmap. + +The block group descriptor is laid out in ``struct ext4_group_desc``. + +.. 
list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - bg\_block\_bitmap\_lo + - Lower 32-bits of location of block bitmap. + * - 0x4 + - \_\_le32 + - bg\_inode\_bitmap\_lo + - Lower 32-bits of location of inode bitmap. + * - 0x8 + - \_\_le32 + - bg\_inode\_table\_lo + - Lower 32-bits of location of inode table. + * - 0xC + - \_\_le16 + - bg\_free\_blocks\_count\_lo + - Lower 16-bits of free block count. + * - 0xE + - \_\_le16 + - bg\_free\_inodes\_count\_lo + - Lower 16-bits of free inode count. + * - 0x10 + - \_\_le16 + - bg\_used\_dirs\_count\_lo + - Lower 16-bits of directory count. + * - 0x12 + - \_\_le16 + - bg\_flags + - Block group flags. See the bgflags_ table below. + * - 0x14 + - \_\_le32 + - bg\_exclude\_bitmap\_lo + - Lower 32-bits of location of snapshot exclusion bitmap. + * - 0x18 + - \_\_le16 + - bg\_block\_bitmap\_csum\_lo + - Lower 16-bits of the block bitmap checksum. + * - 0x1A + - \_\_le16 + - bg\_inode\_bitmap\_csum\_lo + - Lower 16-bits of the inode bitmap checksum. + * - 0x1C + - \_\_le16 + - bg\_itable\_unused\_lo + - Lower 16-bits of unused inode count. If set, we needn't scan past the + ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the + inode table for this group. + * - 0x1E + - \_\_le16 + - bg\_checksum + - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the + RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & + 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. + * - + - + - + - These fields only exist if the 64bit feature is enabled and s_desc_size + > 32. + * - 0x20 + - \_\_le32 + - bg\_block\_bitmap\_hi + - Upper 32-bits of location of block bitmap. + * - 0x24 + - \_\_le32 + - bg\_inode\_bitmap\_hi + - Upper 32-bits of location of inodes bitmap. + * - 0x28 + - \_\_le32 + - bg\_inode\_table\_hi + - Upper 32-bits of location of inodes table. 
+ * - 0x2C + - \_\_le16 + - bg\_free\_blocks\_count\_hi + - Upper 16-bits of free block count. + * - 0x2E + - \_\_le16 + - bg\_free\_inodes\_count\_hi + - Upper 16-bits of free inode count. + * - 0x30 + - \_\_le16 + - bg\_used\_dirs\_count\_hi + - Upper 16-bits of directory count. + * - 0x32 + - \_\_le16 + - bg\_itable\_unused\_hi + - Upper 16-bits of unused inode count. + * - 0x34 + - \_\_le32 + - bg\_exclude\_bitmap\_hi + - Upper 32-bits of location of snapshot exclusion bitmap. + * - 0x38 + - \_\_le16 + - bg\_block\_bitmap\_csum\_hi + - Upper 16-bits of the block bitmap checksum. + * - 0x3A + - \_\_le16 + - bg\_inode\_bitmap\_csum\_hi + - Upper 16-bits of the inode bitmap checksum. + * - 0x3C + - \_\_u32 + - bg\_reserved + - Padding to 64 bytes. + +.. _bgflags: + +Block group flags can be any combination of the following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). + * - 0x2 + - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT). + * - 0x4 + - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED). diff --git a/Documentation/filesystems/ext4/ifork.rst b/Documentation/filesystems/ext4/ifork.rst new file mode 100644 index 000000000000..b9816d5a896b --- /dev/null +++ b/Documentation/filesystems/ext4/ifork.rst @@ -0,0 +1,194 @@ +.. SPDX-License-Identifier: GPL-2.0 + +The Contents of inode.i\_block +------------------------------ + +Depending on the type of file an inode describes, the 60 bytes of +storage in ``inode.i_block`` can be used in different ways. In general, +regular files and directories will use it for file block indexing +information, and special files will use it for special purposes. + +Symbolic Links +~~~~~~~~~~~~~~ + +The target of a symbolic link will be stored in this field if the target +string is less than 60 bytes long. Otherwise, either extents or block +maps will be used to allocate data blocks to store the link target. 
+ +Direct/Indirect Block Addressing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In ext2/3, file block numbers were mapped to logical block numbers by +means of an (up to) three level 1-1 block map. To find the logical block +that stores a particular file block, the code would navigate through +this increasingly complicated structure. Notice that there is neither a +magic number nor a checksum to provide any level of confidence that the +block isn't full of garbage. + +.. ifconfig:: builder != 'latex' + + .. include:: blockmap.rst + +.. ifconfig:: builder == 'latex' + + [Table omitted because LaTeX doesn't support nested tables.] + +Note that with this block mapping scheme, it is necessary to fill out a +lot of mapping data even for a large contiguous file! This inefficiency +led to the creation of the extent mapping scheme, discussed below. + +Notice also that a file using this mapping scheme cannot be placed +higher than 2^32 blocks. + +Extent Tree +~~~~~~~~~~~ + +In ext4, the file to logical block map has been replaced with an extent +tree. Under the old scheme, allocating a contiguous run of 1,000 blocks +requires an indirect block to map all 1,000 entries; with extents, the +mapping is reduced to a single ``struct ext4_extent`` with +``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate +very large files with a single extent, at a considerable reduction in +metadata block use, and some improvement in disk efficiency. The inode +must have the extents flag (0x80000) set for this feature to be in +use. + +Extents are arranged as a tree. Each node of the tree begins with a +``struct ext4_extent_header``. If the node is an interior node +(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries`` +instances of ``struct ext4_extent_idx``; each of these index entries +points to a block containing more nodes in the extent tree.
If the node +is a leaf node (``eh.eh_depth == 0``), then the header is followed by +``eh.eh_entries`` instances of ``struct ext4_extent``; these instances +point to the file's data blocks. The root node of the extent tree is +stored in ``inode.i_block``, which allows for the first four extents to +be recorded without the use of extra metadata blocks. + +The extent tree header is recorded in ``struct ext4_extent_header``, +which is 12 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - eh\_magic + - Magic number, 0xF30A. + * - 0x2 + - \_\_le16 + - eh\_entries + - Number of valid entries following the header. + * - 0x4 + - \_\_le16 + - eh\_max + - Maximum number of entries that could follow the header. + * - 0x6 + - \_\_le16 + - eh\_depth + - Depth of this extent node in the extent tree. 0 = this extent node + points to data blocks; otherwise, this extent node points to other + extent nodes. The extent tree can be at most 5 levels deep: a logical + block number can be at most ``2^32``, and the smallest ``n`` that + satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. + * - 0x8 + - \_\_le32 + - eh\_generation + - Generation of the tree. (Used by Lustre, but not standard ext4). + +Internal nodes of the extent tree, also known as index nodes, are +recorded as ``struct ext4_extent_idx``, and are 12 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - ei\_block + - This index node covers file blocks from 'block' onward. + * - 0x4 + - \_\_le32 + - ei\_leaf\_lo + - Lower 32-bits of the block number of the extent node that is the next + level lower in the tree. The tree node pointed to can be either another + internal node or a leaf node, described below. + * - 0x8 + - \_\_le16 + - ei\_leaf\_hi + - Upper 16-bits of the previous field. 
+ * - 0xA + - \_\_u16 + - ei\_unused + - + +Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, +and are also 12 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - ee\_block + - First file block number that this extent covers. + * - 0x4 + - \_\_le16 + - ee\_len + - Number of blocks covered by extent. If the value of this field is <= + 32768, the extent is initialized. If the value of the field is > 32768, + the extent is uninitialized and the actual extent length is ``ee_len`` - + 32768. Therefore, the maximum length of an initialized extent is 32768 + blocks, and the maximum length of an uninitialized extent is 32767. + * - 0x6 + - \_\_le16 + - ee\_start\_hi + - Upper 16-bits of the block number to which this extent points. + * - 0x8 + - \_\_le32 + - ee\_start\_lo + - Lower 32-bits of the block number to which this extent points. + +Prior to the introduction of metadata checksums, the extent header + +extent entries always left at least 4 bytes of unallocated space at the +end of each extent tree data block (because (2^x % 12) >= 4). Therefore, +the 32-bit checksum is inserted into this space. The 4 extents in the +inode do not need checksumming, since the inode is already checksummed. +The checksum is calculated against the FS UUID, the inode number, the +inode generation, and the entire extent block leading up to (but not +including) the checksum itself. + +``struct ext4_extent_tail`` is 4 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - eb\_checksum + - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) + +Inline Data +~~~~~~~~~~~ + +If the inline data feature is enabled for the filesystem and the flag is +set for the inode, it is possible that the first 60 bytes of the file +data are stored here.
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 427bc115012e..3be3e54d480d 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst @@ -1,16 +1,14 @@ .. SPDX-License-Identifier: GPL-2.0 -=============== -ext4 Filesystem -=============== - -General usage and on-disk artifacts writen by ext4. More documentation may -be ported from the wiki as time permits. This should be considered the -canonical source of information as the details here have been reviewed by -the ext4 community. +=================================== +ext4 Data Structures and Algorithms +=================================== .. toctree:: - :maxdepth: 5 + :maxdepth: 6 :numbered: - ondisk/index + about.rst + overview.rst + globals.rst + dynamic.rst diff --git a/Documentation/filesystems/ext4/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst new file mode 100644 index 000000000000..d1075178ce0b --- /dev/null +++ b/Documentation/filesystems/ext4/inlinedata.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Inline Data +----------- + +The inline data feature was designed to handle the case that a file's +data is so tiny that it readily fits inside the inode, which +(theoretically) reduces disk block consumption and reduces seeks. If the +file is smaller than 60 bytes, then the data are stored inline in +``inode.i_block``. If the rest of the file would fit inside the extended +attribute space, then it might be found as an extended attribute +“system.data” within the inode body (“ibody EA”). This of course +constrains the amount of extended attributes one can attach to an inode. +If the data size increases beyond i\_block + ibody EA, a regular block +is allocated and the contents moved to that block. 
+ +Pending a change to compact the extended attribute key used to store +inline data, one ought to be able to store 160 bytes of data in a +256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to +that, the limit was 156 bytes due to inefficient use of inode space. + +The inline data feature requires the presence of an extended attribute +for “system.data”, even if the attribute value is zero length. + +Inline Directories +~~~~~~~~~~~~~~~~~~ + +The first four bytes of i\_block are the inode number of the parent +directory. Following that is a 56-byte space for an array of directory +entries; see ``struct ext4_dir_entry``. If there is a “system.data” +attribute in the inode body, the EA value is an array of +``struct ext4_dir_entry`` as well. Note that for inline directories, the +i\_block and EA space are treated as separate dirent blocks; directory +entries cannot span the two. + +Inline directory entries are not checksummed, as the inode checksum +should protect all inline data contents. diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst new file mode 100644 index 000000000000..6bd35e506b6f --- /dev/null +++ b/Documentation/filesystems/ext4/inodes.rst @@ -0,0 +1,576 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Index Nodes +----------- + +In a regular UNIX filesystem, the inode stores all the metadata +pertaining to the file (time stamps, block maps, extended attributes, +etc), not the directory entry. To find the information associated with a +file, one must traverse the directory files to find the directory entry +associated with a file, then load the inode to find the metadata for +that file. ext4 appears to cheat (for performance reasons) a little bit +by storing a copy of the file type (normally stored in the inode) in the +directory entry. 
(Compare all this to FAT, which stores all the file +information directly in the directory entry, but does not support hard +links and is in general more seek-happy than ext4 due to its simpler +block allocator and extensive use of linked lists.) + +The inode table is a linear array of ``struct ext4_inode``. The table is +sized to have enough blocks to store at least +``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the +block group containing an inode can be calculated as +``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the +group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There +is no inode 0. + +The inode checksum is calculated against the FS UUID, the inode number, +and the inode structure itself. + +The inode table entry is laid out in ``struct ext4_inode``. + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + :class: longtable + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - i\_mode + - File mode. See the table i_mode_ below. + * - 0x2 + - \_\_le16 + - i\_uid + - Lower 16-bits of Owner UID. + * - 0x4 + - \_\_le32 + - i\_size\_lo + - Lower 32-bits of size in bytes. + * - 0x8 + - \_\_le32 + - i\_atime + - Last access time, in seconds since the epoch. However, if the EA\_INODE + inode flag is set, this inode stores an extended attribute value and + this field contains the checksum of the value. + * - 0xC + - \_\_le32 + - i\_ctime + - Last inode change time, in seconds since the epoch. However, if the + EA\_INODE inode flag is set, this inode stores an extended attribute + value and this field contains the lower 32 bits of the attribute value's + reference count. + * - 0x10 + - \_\_le32 + - i\_mtime + - Last data modification time, in seconds since the epoch. However, if the + EA\_INODE inode flag is set, this inode stores an extended attribute + value and this field contains the number of the inode that owns the + extended attribute. 
+ * - 0x14 + - \_\_le32 + - i\_dtime + - Deletion Time, in seconds since the epoch. + * - 0x18 + - \_\_le16 + - i\_gid + - Lower 16-bits of GID. + * - 0x1A + - \_\_le16 + - i\_links\_count + - Hard link count. Normally, ext4 does not permit an inode to have more + than 65,000 hard links. This applies to files as well as directories, + which means that there cannot be more than 64,998 subdirectories in a + directory (each subdirectory's '..' entry counts as a hard link, as does + the '.' entry in the directory itself). With the DIR\_NLINK feature + enabled, ext4 supports more than 64,998 subdirectories by setting this + field to 1 to indicate that the number of hard links is not known. + * - 0x1C + - \_\_le32 + - i\_blocks\_lo + - Lower 32-bits of “block” count. If the huge\_file feature flag is not + set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks + on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in + ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi + << 32)`` 512-byte blocks on disk. If huge\_file is set and + EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file + consumes (``i_blocks_lo + i_blocks_hi`` << 32) filesystem blocks on + disk. + * - 0x20 + - \_\_le32 + - i\_flags + - Inode flags. See the table i_flags_ below. + * - 0x24 + - 4 bytes + - i\_osd1 + - See the table i_osd1_ for more details. + * - 0x28 + - 60 bytes + - i\_block[EXT4\_N\_BLOCKS=15] + - Block map or extent tree. See the section “The Contents of inode.i\_block”. + * - 0x64 + - \_\_le32 + - i\_generation + - File version (for NFS). + * - 0x68 + - \_\_le32 + - i\_file\_acl\_lo + - Lower 32-bits of extended attribute block. ACLs are of course one of + many possible extended attributes; I think the name of this field is a + result of the first use of extended attributes being for ACLs. + * - 0x6C + - \_\_le32 + - i\_size\_high / i\_dir\_acl + - Upper 32-bits of file/directory size. 
In ext2/3 this field was named + i\_dir\_acl, though it was usually set to zero and never used. + * - 0x70 + - \_\_le32 + - i\_obso\_faddr + - (Obsolete) fragment address. + * - 0x74 + - 12 bytes + - i\_osd2 + - See the table i_osd2_ for more details. + * - 0x80 + - \_\_le16 + - i\_extra\_isize + - Size of this inode - 128. Alternately, the size of the extended inode + fields beyond the original ext2 inode, including this field. + * - 0x82 + - \_\_le16 + - i\_checksum\_hi + - Upper 16-bits of the inode checksum. + * - 0x84 + - \_\_le32 + - i\_ctime\_extra + - Extra change time bits. This provides sub-second precision. See Inode + Timestamps section. + * - 0x88 + - \_\_le32 + - i\_mtime\_extra + - Extra modification time bits. This provides sub-second precision. + * - 0x8C + - \_\_le32 + - i\_atime\_extra + - Extra access time bits. This provides sub-second precision. + * - 0x90 + - \_\_le32 + - i\_crtime + - File creation time, in seconds since the epoch. + * - 0x94 + - \_\_le32 + - i\_crtime\_extra + - Extra file creation time bits. This provides sub-second precision. + * - 0x98 + - \_\_le32 + - i\_version\_hi + - Upper 32-bits for version number. + * - 0x9C + - \_\_le32 + - i\_projid + - Project ID. + +.. _i_mode: + +The ``i_mode`` value is a combination of the following flags: + +.. 
list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - S\_IXOTH (Others may execute) + * - 0x2 + - S\_IWOTH (Others may write) + * - 0x4 + - S\_IROTH (Others may read) + * - 0x8 + - S\_IXGRP (Group members may execute) + * - 0x10 + - S\_IWGRP (Group members may write) + * - 0x20 + - S\_IRGRP (Group members may read) + * - 0x40 + - S\_IXUSR (Owner may execute) + * - 0x80 + - S\_IWUSR (Owner may write) + * - 0x100 + - S\_IRUSR (Owner may read) + * - 0x200 + - S\_ISVTX (Sticky bit) + * - 0x400 + - S\_ISGID (Set GID) + * - 0x800 + - S\_ISUID (Set UID) + * - + - These are mutually-exclusive file types: + * - 0x1000 + - S\_IFIFO (FIFO) + * - 0x2000 + - S\_IFCHR (Character device) + * - 0x4000 + - S\_IFDIR (Directory) + * - 0x6000 + - S\_IFBLK (Block device) + * - 0x8000 + - S\_IFREG (Regular file) + * - 0xA000 + - S\_IFLNK (Symbolic link) + * - 0xC000 + - S\_IFSOCK (Socket) + +.. _i_flags: + +The ``i_flags`` field is a combination of these values: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented) + * - 0x2 + - This file should be preserved, should undeletion be desired + (EXT4\_UNRM\_FL). (not implemented) + * - 0x4 + - File is compressed (EXT4\_COMPR\_FL). (not really implemented) + * - 0x8 + - All writes to the file must be synchronous (EXT4\_SYNC\_FL). + * - 0x10 + - File is immutable (EXT4\_IMMUTABLE\_FL). + * - 0x20 + - File can only be appended (EXT4\_APPEND\_FL). + * - 0x40 + - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). + * - 0x80 + - Do not update access time (EXT4\_NOATIME\_FL). + * - 0x100 + - Dirty compressed file (EXT4\_DIRTY\_FL). (not used) + * - 0x200 + - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) + * - 0x400 + - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) + * - 0x800 + - Encrypted inode (EXT4\_ENCRYPT\_FL). 
This bit value previously was + EXT4\_ECOMPR\_FL (compression error), which was never used. + * - 0x1000 + - Directory has hashed indexes (EXT4\_INDEX\_FL). + * - 0x2000 + - AFS magic directory (EXT4\_IMAGIC\_FL). + * - 0x4000 + - File data must always be written through the journal + (EXT4\_JOURNAL\_DATA\_FL). + * - 0x8000 + - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) + * - 0x10000 + - All directory entry data should be written synchronously (see + ``dirsync``) (EXT4\_DIRSYNC\_FL). + * - 0x20000 + - Top of directory hierarchy (EXT4\_TOPDIR\_FL). + * - 0x40000 + - This is a huge file (EXT4\_HUGE\_FILE\_FL). + * - 0x80000 + - Inode uses extents (EXT4\_EXTENTS\_FL). + * - 0x200000 + - Inode stores a large extended attribute value in its data blocks + (EXT4\_EA\_INODE\_FL). + * - 0x400000 + - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). + (deprecated) + * - 0x01000000 + - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline) + * - 0x04000000 + - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in + mainline) + * - 0x08000000 + - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in + mainline) + * - 0x10000000 + - Inode has inline data (EXT4\_INLINE\_DATA\_FL). + * - 0x20000000 + - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). + * - 0x80000000 + - Reserved for ext4 library (EXT4\_RESERVED\_FL). + * - + - Aggregate flags: + * - 0x4BDFFF + - User-visible flags. + * - 0x4B80FF + - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and + EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's + EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of + these flags in a special manner and they are masked out of the set of + flags that are saved directly to i\_flags. + +.. _i_osd1: + +The ``osd1`` field has multiple meanings depending on the creator: + +Linux: + +.. 
list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - l\_i\_version + - Inode version. However, if the EA\_INODE inode flag is set, this inode + stores an extended attribute value and this field contains the upper 32 + bits of the attribute value's reference count. + +Hurd: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - h\_i\_translator + - ?? + +Masix: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - m\_i\_reserved + - ?? + +.. _i_osd2: + +The ``osd2`` field has multiple meanings depending on the filesystem creator: + +Linux: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - l\_i\_blocks\_high + - Upper 16-bits of the block count. Please see the note attached to + i\_blocks\_lo. + * - 0x2 + - \_\_le16 + - l\_i\_file\_acl\_high + - Upper 16-bits of the extended attribute block (historically, the file + ACL location). See the Extended Attributes section below. + * - 0x4 + - \_\_le16 + - l\_i\_uid\_high + - Upper 16-bits of the Owner UID. + * - 0x6 + - \_\_le16 + - l\_i\_gid\_high + - Upper 16-bits of the GID. + * - 0x8 + - \_\_le16 + - l\_i\_checksum\_lo + - Lower 16-bits of the inode checksum. + * - 0xA + - \_\_le16 + - l\_i\_reserved + - Unused. + +Hurd: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - h\_i\_reserved1 + - ?? + * - 0x2 + - \_\_u16 + - h\_i\_mode\_high + - Upper 16-bits of the file mode. + * - 0x4 + - \_\_le16 + - h\_i\_uid\_high + - Upper 16-bits of the Owner UID. + * - 0x6 + - \_\_le16 + - h\_i\_gid\_high + - Upper 16-bits of the GID. + * - 0x8 + - \_\_u32 + - h\_i\_author + - Author code? + +Masix: + +.. 
list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le16 + - h\_i\_reserved1 + - ?? + * - 0x2 + - \_\_u16 + - m\_i\_file\_acl\_high + - Upper 16-bits of the extended attribute block (historically, the file + ACL location). + * - 0x4 + - \_\_u32 + - m\_i\_reserved2[2] + - ?? + +Inode Size +~~~~~~~~~~ + +In ext2 and ext3, the inode structure size was fixed at 128 bytes +(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of +128 bytes. Starting with ext4, it is possible to allocate a larger +on-disk inode at format time for all inodes in the filesystem to provide +space beyond the end of the original ext2 inode. The on-disk inode +record size is recorded in the superblock as ``s_inode_size``. The +number of bytes actually used by struct ext4\_inode beyond the original +128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each +inode, which allows struct ext4\_inode to grow for a new kernel without +having to upgrade all of the on-disk inodes. Access to fields beyond +EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within +``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as +of October 2013) the inode structure is 156 bytes +(``i_extra_isize = 28``). The extra space between the end of the inode +structure and the end of the inode record can be used to store extended +attributes. Each inode record can be as large as the filesystem block +size, though this is not terribly efficient. + +Finding an Inode +~~~~~~~~~~~~~~~~ + +Each block group contains ``sb->s_inodes_per_group`` inodes. Because +inode 0 is defined not to exist, this formula can be used to find the +block group that an inode lives in: +``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode +can be found within the block group's inode table at +``index = (inode_num - 1) % sb->s_inodes_per_group``. 
To get the byte +address within the inode table, use +``offset = index * sb->s_inode_size``. + +Inode Timestamps +~~~~~~~~~~~~~~~~ + +Four timestamps are recorded in the lower 128 bytes of the inode +structure -- inode change time (ctime), access time (atime), data +modification time (mtime), and deletion time (dtime). The four fields +are 32-bit signed integers that represent seconds since the Unix epoch +(1970-01-01 00:00:00 GMT), which means that the fields will overflow in +January 2038. For inodes that are not linked from any directory but are +still open (orphan inodes), the dtime field is overloaded for use with +the orphan list. The superblock field ``s_last_orphan`` points to the +first inode in the orphan list; dtime is then the number of the next +orphaned inode, or zero if there are no more orphans. + +If the inode structure size ``sb->s_inode_size`` is larger than 128 +bytes and the ``i_extra_isize`` field is large enough to encompass the +respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime +inode fields are widened to 64 bits. Within this “extra” 32-bit field, +the lower two bits are used to extend the 32-bit seconds field to be 34 +bits wide; the upper 30 bits are used to provide nanosecond timestamp +accuracy. Therefore, timestamps should not overflow until May 2446. +dtime was not widened. There is also a fifth timestamp to record inode +creation time (crtime); this field is 64-bits wide and decoded in the +same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible +through the regular stat() interface, though debugfs will report them. + +We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). +In other words: + +..
list-table:: + :widths: 20 20 20 20 20 + :header-rows: 1 + + * - Extra epoch bits + - MSB of 32-bit time + - Adjustment for signed 32-bit to 64-bit tv\_sec + - Decoded 64-bit tv\_sec + - valid time range + * - 0 0 + - 1 + - 0 + - ``-0x80000000 - -0x00000001`` + - 1901-12-13 to 1969-12-31 + * - 0 0 + - 0 + - 0 + - ``0x000000000 - 0x07fffffff`` + - 1970-01-01 to 2038-01-19 + * - 0 1 + - 1 + - 0x100000000 + - ``0x080000000 - 0x0ffffffff`` + - 2038-01-19 to 2106-02-07 + * - 0 1 + - 0 + - 0x100000000 + - ``0x100000000 - 0x17fffffff`` + - 2106-02-07 to 2174-02-25 + * - 1 0 + - 1 + - 0x200000000 + - ``0x180000000 - 0x1ffffffff`` + - 2174-02-25 to 2242-03-16 + * - 1 0 + - 0 + - 0x200000000 + - ``0x200000000 - 0x27fffffff`` + - 2242-03-16 to 2310-04-04 + * - 1 1 + - 1 + - 0x300000000 + - ``0x280000000 - 0x2ffffffff`` + - 2310-04-04 to 2378-04-22 + * - 1 1 + - 0 + - 0x300000000 + - ``0x300000000 - 0x37fffffff`` + - 2378-04-22 to 2446-05-10 + +This is a somewhat odd encoding since there are effectively seven times +as many positive values as negative values. There have also been +long-standing bugs decoding and encoding dates beyond 2038, which don't +seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels +incorrectly use the extra epoch bits 1,1 for dates between 1901 and +1970. At some point the kernel will be fixed and e2fsck will fix this +situation, assuming that it is run before 2310. diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst new file mode 100644 index 000000000000..ea613ee701f5 --- /dev/null +++ b/Documentation/filesystems/ext4/journal.rst @@ -0,0 +1,611 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Journal (jbd2) +-------------- + +Introduced in ext3, the ext4 filesystem employs a journal to protect the +filesystem against corruption in the case of a system crash. 
A small +contiguous region of disk (default 128MiB) is reserved inside the +filesystem as a place to land “important” data writes on-disk as quickly +as possible. Once the important data transaction is fully written to the +disk and flushed from the disk write cache, a record of the data being +committed is also written to the journal. At some later point in time, +the journal code writes the transactions to their final locations on +disk (this could involve a lot of seeking or a lot of small +read-write-erases) before erasing the commit record. Should the system +crash during the second slow write, the journal can be replayed all the +way to the latest commit record, guaranteeing the atomicity of whatever +gets written through the journal to the disk. The effect of this is to +guarantee that the filesystem does not become stuck midway through a +metadata update. + +For performance reasons, ext4 by default only writes filesystem metadata +through the journal. This means that file data blocks are *not* +guaranteed to be in any consistent state after a crash. If this default +guarantee level (``data=ordered``) is not satisfactory, there is a mount +option to control journal behavior. If ``data=journal``, all data and +metadata are written to disk through the journal. This is slower but +safest. If ``data=writeback``, dirty data blocks are not flushed to the +disk before the metadata are written to disk through the journal. + +The journal inode is typically inode 8. The first 68 bytes of the +journal inode are replicated in the ext4 superblock. The journal itself +is a normal (but hidden) file within the filesystem. The file usually +consumes an entire block group, though mke2fs tries to put it in the +middle of the disk. + +All fields in jbd2 are written to disk in big-endian order. This is the +opposite of ext4. + +NOTE: Both ext4 and ocfs2 use jbd2. + +The maximum size of a journal embedded in an ext4 filesystem is 2^32 +blocks. jbd2 itself does not seem to care. 
+ +Layout +~~~~~~ + +Generally speaking, the journal has this format: + +.. list-table:: + :widths: 16 48 16 + :header-rows: 1 + + * - Superblock + - descriptor\_block (data\_blocks or revocation\_block) [more data or + revocations] commit\_block + - [more transactions...] + * - + - One transaction + - + +Notice that a transaction begins with either a descriptor and some data, +or a block revocation list. A finished transaction always ends with a +commit. If there is no commit record (or the checksums don't match), the +transaction will be discarded during replay. + +External Journal +~~~~~~~~~~~~~~~~ + +Optionally, an ext4 filesystem can be created with an external journal +device (as opposed to an internal journal, which uses a reserved inode). +In this case, on the filesystem device, ``s_journal_inum`` should be +zero and ``s_journal_uuid`` should be set. On the journal device there +will be an ext4 super block in the usual place, with a matching UUID. +The journal superblock will be in the next full block after the +superblock. + +.. list-table:: + :widths: 12 12 12 32 12 + :header-rows: 1 + + * - 1024 bytes of padding + - ext4 Superblock + - Journal Superblock + - descriptor\_block (data\_blocks or revocation\_block) [more data or + revocations] commit\_block + - [more transactions...] + * - + - + - + - One transaction + - + +Block Header +~~~~~~~~~~~~ + +Every block in the journal starts with a common 12-byte header +``struct journal_header_s``: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_be32 + - h\_magic + - jbd2 magic number, 0xC03B3998. + * - 0x4 + - \_\_be32 + - h\_blocktype + - Description of what this block contains. See the jbd2_blocktype_ table + below. + * - 0x8 + - \_\_be32 + - h\_sequence + - The transaction ID that goes with this block. + +.. _jbd2_blocktype: + +The journal block type can be any one of: + +.. 
list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 1 + - Descriptor. This block precedes a series of data blocks that were + written through the journal during a transaction. + * - 2 + - Block commit record. This block signifies the completion of a + transaction. + * - 3 + - Journal superblock, v1. + * - 4 + - Journal superblock, v2. + * - 5 + - Block revocation records. This speeds up recovery by enabling the + journal to skip writing blocks that were subsequently rewritten. + +Super Block +~~~~~~~~~~~ + +The super block for the journal is much simpler as compared to ext4's. +The key data kept within are the size of the journal, and where to find the +start of the log of transactions. + +The journal superblock is recorded as ``struct journal_superblock_s``, +which is 1024 bytes long: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - + - + - + - Static information describing the journal. + * - 0x0 + - journal\_header\_t (12 bytes) + - s\_header + - Common header identifying this as a superblock. + * - 0xC + - \_\_be32 + - s\_blocksize + - Journal device block size. + * - 0x10 + - \_\_be32 + - s\_maxlen + - Total number of blocks in this journal. + * - 0x14 + - \_\_be32 + - s\_first + - First block of log information. + * - + - + - + - Dynamic information describing the current state of the log. + * - 0x18 + - \_\_be32 + - s\_sequence + - First commit ID expected in log. + * - 0x1C + - \_\_be32 + - s\_start + - Block number of the start of log. Contrary to the comments, this field + being zero does not imply that the journal is clean! + * - 0x20 + - \_\_be32 + - s\_errno + - Error value, as set by jbd2\_journal\_abort(). + * - + - + - + - The remaining fields are only valid in a v2 superblock. + * - 0x24 + - \_\_be32 + - s\_feature\_compat + - Compatible feature set. See the table jbd2_compat_ below. 
+ * - 0x28 + - \_\_be32 + - s\_feature\_incompat + - Incompatible feature set. See the table jbd2_incompat_ below. + * - 0x2C + - \_\_be32 + - s\_feature\_ro\_compat + - Read-only compatible feature set. There aren't any of these currently. + * - 0x30 + - \_\_u8 + - s\_uuid[16] + - 128-bit uuid for journal. This is compared against the copy in the ext4 + super block at mount time. + * - 0x40 + - \_\_be32 + - s\_nr\_users + - Number of file systems sharing this journal. + * - 0x44 + - \_\_be32 + - s\_dynsuper + - Location of dynamic super block copy. (Not used?) + * - 0x48 + - \_\_be32 + - s\_max\_transaction + - Limit of journal blocks per transaction. (Not used?) + * - 0x4C + - \_\_be32 + - s\_max\_trans\_data + - Limit of data blocks per transaction. (Not used?) + * - 0x50 + - \_\_u8 + - s\_checksum\_type + - Checksum algorithm used for the journal. See jbd2_checksum_type_ for + more info. + * - 0x51 + - \_\_u8[3] + - s\_padding2 + - + * - 0x54 + - \_\_u32 + - s\_padding[42] + - + * - 0xFC + - \_\_be32 + - s\_checksum + - Checksum of the entire superblock, with this field set to zero. + * - 0x100 + - \_\_u8 + - s\_users[16\*48] + - ids of all file systems sharing the log. e2fsprogs/Linux don't allow + shared external journals, but I imagine Lustre (or ocfs2?), which use + the jbd2 code, might. + +.. _jbd2_compat: + +The journal compat features are any combination of the following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Journal maintains checksums on the data blocks. + (JBD2\_FEATURE\_COMPAT\_CHECKSUM) + +.. _jbd2_incompat: + +The journal incompat features are any combination of the following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) + * - 0x2 + - Journal can deal with 64-bit block numbers. + (JBD2\_FEATURE\_INCOMPAT\_64BIT) + * - 0x4 + - Journal commits asynchronously. 
(JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) + * - 0x8 + - This journal uses v2 of the checksum on-disk format. Each journal + metadata block gets its own checksum, and the block tags in the + descriptor table contain checksums for each of the data blocks in the + journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) + * - 0x10 + - This journal uses v3 of the checksum on-disk format. This is the same as + v2, but the journal block tag size is fixed regardless of the size of + block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) + +.. _jbd2_checksum_type: + +Journal checksum type codes are one of the following. crc32 or crc32c are the +most likely choices. + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 1 + - CRC32 + * - 2 + - MD5 + * - 3 + - SHA1 + * - 4 + - CRC32C + +Descriptor Block +~~~~~~~~~~~~~~~~ + +The descriptor block contains an array of journal block tags that +describe the final locations of the data blocks that follow in the +journal. Descriptor blocks are open-coded instead of being completely +described by a data structure, but here is the block structure anyway. +Descriptor blocks consume at least 36 bytes, but use a full block: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - journal\_header\_t + - (open coded) + - Common block header. + * - 0xC + - struct journal\_block\_tag\_s + - open coded array[] + - Enough tags either to fill up the block or to describe all the data + blocks that follow this descriptor block. + +Journal block tags have any of the following formats, depending on which +journal feature and block tag flags are set. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is +defined as ``struct journal_block_tag3_s``, which looks like the +following. The size is 16 or 32 bytes. + +.. 
list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_blocknr + - Lower 32-bits of the location of where the corresponding data block + should end up on disk. + * - 0x4 + - \_\_be32 + - t\_flags + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for + more info. + * - 0x8 + - \_\_be32 + - t\_blocknr\_high + - Upper 32-bits of the location of where the corresponding data block + should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is + not enabled. + * - 0xC + - \_\_be32 + - t\_checksum + - Checksum of the journal UUID, the sequence number, and the data block. + * - + - + - + - This field appears to be open coded. It always comes at the end of the + tag, after t_checksum. This field is not present if the "same UUID" flag + is set. + * - 0x8 or 0xC + - char + - uuid[16] + - A UUID to go with this tag. This field appears to be copied from the + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that + field. + +.. _jbd2_tag_flags: + +The journal tag flags are any combination of the following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - On-disk block is escaped. The first four bytes of the data block just + happened to match the jbd2 magic number. + * - 0x2 + - This block has the same UUID as previous, therefore the UUID field is + omitted. + * - 0x4 + - The data block was deleted by the transaction. (Not used?) + * - 0x8 + - This is the last tag in this descriptor block. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag +is defined as ``struct journal_block_tag_s``, which looks like the +following. The size is 8, 12, 24, or 28 bytes: + +.. 
list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_blocknr + - Lower 32-bits of the location of where the corresponding data block + should end up on disk. + * - 0x4 + - \_\_be16 + - t\_checksum + - Checksum of the journal UUID, the sequence number, and the data block. + Note that only the lower 16 bits are stored. + * - 0x6 + - \_\_be16 + - t\_flags + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for + more info. + * - + - + - + - This next field is only present if the super block indicates support for + 64-bit block numbers. + * - 0x8 + - \_\_be32 + - t\_blocknr\_high + - Upper 32-bits of the location of where the corresponding data block + should end up on disk. + * - + - + - + - This field appears to be open coded. It always comes at the end of the + tag, after t_flags or t_blocknr_high. This field is not present if the + "same UUID" flag is set. + * - 0x8 or 0xC + - char + - uuid[16] + - A UUID to go with this tag. This field appears to be copied from the + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that + field. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or +JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a +``struct jbd2_journal_block_tail``, which looks like this: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - \_\_be32 + - t\_checksum + - Checksum of the journal UUID + the descriptor block, with this field set + to zero. + +Data Block +~~~~~~~~~~ + +In general, the data blocks being written to disk through the journal +are written verbatim into the journal file after the descriptor block. +However, if the first four bytes of the block match the jbd2 magic +number then those four bytes are replaced with zeroes and the “escaped” +flag is set in the descriptor block tag. 
+ +Revocation Block +~~~~~~~~~~~~~~~~ + +A revocation block is used to prevent replay of a block in an earlier +transaction. This is used to mark blocks that were journalled at one +time but are no longer journalled. Typically this happens if a metadata +block is freed and re-allocated as a file data block; in this case, a +journal replay after the file block was written to disk will cause +corruption. + +**NOTE**: This mechanism is NOT used to express “this journal block is +superseded by this other journal block”, as the author (djwong) +mistakenly thought. Any block being added to a transaction will cause +the removal of all existing revocation records for that block. + +Revocation blocks are described in +``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in +length, but use a full block: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - journal\_header\_t + - r\_header + - Common block header. + * - 0xC + - \_\_be32 + - r\_count + - Number of bytes used in this block. + * - 0x10 + - \_\_be32 or \_\_be64 + - blocks[0] + - Blocks to revoke. + +After r\_count is a linear array of block numbers that are effectively +revoked by this transaction. The size of each block number is 8 bytes if +the superblock advertises 64-bit block number support, or 4 bytes +otherwise. + +If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or +JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation +block is a ``struct jbd2_journal_revoke_tail``, which has this format: + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_be32 + - r\_checksum + - Checksum of the journal UUID + revocation block + +Commit Block +~~~~~~~~~~~~ + +The commit block is a sentry that indicates that a transaction has been +completely written to the journal. 
Once this commit block reaches the +journal, the data stored with this transaction can be written to their +final locations on disk. + +The commit block is described by ``struct commit_header``, which is 32 +bytes long (but uses a full block): + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Descriptor + * - 0x0 + - journal\_header\_s + - (open coded) + - Common block header. + * - 0xC + - unsigned char + - h\_chksum\_type + - The type of checksum to use to verify the integrity of the data blocks + in the transaction. See jbd2_checksum_type_ for more info. + * - 0xD + - unsigned char + - h\_chksum\_size + - The number of bytes used by the checksum. Most likely 4. + * - 0xE + - unsigned char + - h\_padding[2] + - + * - 0x10 + - \_\_be32 + - h\_chksum[JBD2\_CHECKSUM\_BYTES] + - 32 bytes of space to store checksums. If + JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 + are set, the first ``__be32`` is the checksum of the journal UUID and + the entire commit block, with this field zeroed. If + JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the + crc32 of all the blocks already written to the transaction. + * - 0x30 + - \_\_be64 + - h\_commit\_sec + - The time that the transaction was committed, in seconds since the epoch. + * - 0x38 + - \_\_be32 + - h\_commit\_nsec + - Nanoseconds component of the above timestamp. + diff --git a/Documentation/filesystems/ext4/mmp.rst b/Documentation/filesystems/ext4/mmp.rst new file mode 100644 index 000000000000..25660981d93c --- /dev/null +++ b/Documentation/filesystems/ext4/mmp.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Multiple Mount Protection +------------------------- + +Multiple mount protection (MMP) is a feature that protects the +filesystem against multiple hosts trying to use the filesystem +simultaneously. 
When a filesystem is opened (for mounting, or fsck, +etc.), the MMP code running on the node (call it node A) checks a +sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the +open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then +fsck is (hopefully) running, and open fails immediately. Otherwise, the +open code will wait for twice the specified MMP check interval and check +the sequence number again. If the sequence number has changed, then the +filesystem is active on another machine and the open fails. If the MMP +code passes all of those checks, a new MMP sequence number is generated +and written to the MMP block, and the mount proceeds. + +While the filesystem is live, the kernel sets up a timer to re-check the +MMP block at the specified MMP check interval. To perform the re-check, +the MMP sequence number is re-read; if it does not match the in-memory +MMP sequence number, then another node (node B) has mounted the +filesystem, and node A remounts the filesystem read-only. If the +sequence numbers match, the sequence number is incremented both in +memory and on disk, and the re-check is complete. + +The hostname and device filename are written into the MMP block whenever +an open operation succeeds. The MMP code does not use these values; they +are provided purely for informational purposes. + +The checksum is calculated against the FS UUID and the MMP structure. +The MMP structure (``struct mmp_struct``) is as follows: + +.. list-table:: + :widths: 8 12 20 40 + :header-rows: 1 + + * - Offset + - Type + - Name + - Description + * - 0x0 + - \_\_le32 + - mmp\_magic + - Magic number for MMP, 0x004D4D50 (“MMP”). + * - 0x4 + - \_\_le32 + - mmp\_seq + - Sequence number, updated periodically. + * - 0x8 + - \_\_le64 + - mmp\_time + - Time that the MMP block was last updated. + * - 0x10 + - char[64] + - mmp\_nodename + - Hostname of the node that opened the filesystem. 
+ * - 0x50 + - char[32] + - mmp\_bdevname + - Block device name of the filesystem. + * - 0x70 + - \_\_le16 + - mmp\_check\_interval + - The MMP re-check interval, in seconds. + * - 0x72 + - \_\_le16 + - mmp\_pad1 + - Zero. + * - 0x74 + - \_\_le32[226] + - mmp\_pad2 + - Zero. + * - 0x3FC + - \_\_le32 + - mmp\_checksum + - Checksum of the MMP block. diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/ondisk/about.rst deleted file mode 100644 index 0aadba052264..000000000000 --- a/Documentation/filesystems/ext4/ondisk/about.rst +++ /dev/null @@ -1,44 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -About this Book -=============== - -This document attempts to describe the on-disk format for ext4 -filesystems. The same general ideas should apply to ext2/3 filesystems -as well, though they do not support all the features that ext4 supports, -and the fields will be shorter. - -**NOTE**: This is a work in progress, based on notes that the author -(djwong) made while picking apart a filesystem by hand. The data -structure definitions should be current as of Linux 4.18 and -e2fsprogs-1.44. All comments and corrections are welcome, since there is -undoubtedly plenty of lore that might not be reflected in freshly -created demonstration filesystems. - -License -------- -This book is licensed under the terms of the GNU Public License, v2. - -Terminology ------------ - -ext4 divides a storage device into an array of logical blocks both to -reduce bookkeeping overhead and to increase throughput by forcing larger -transfer sizes. Generally, the block size will be 4KiB (the same size as -pages on x86 and the block layer's default block size), though the -actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes. -Throughout this document, disk locations are given in terms of these -logical blocks, not raw LBAs, and not 1024-byte blocks. 
For the sake of -convenience, the logical block size will be referred to as -``$block_size`` throughout the rest of the document. - -When referenced in ``preformatted text`` blocks, ``sb`` refers to fields -in the super block, and ``inode`` refers to fields in an inode table -entry. - -Other References ----------------- - -Also see http://www.nongnu.org/ext2-doc/ for quite a collection of -information about ext2/3. Here's another old reference: -http://wiki.osdev.org/Ext2 diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/ondisk/allocators.rst deleted file mode 100644 index 7aa85152ace3..000000000000 --- a/Documentation/filesystems/ext4/ondisk/allocators.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Block and Inode Allocation Policy ---------------------------------- - -ext4 recognizes (better than ext3, anyway) that data locality is -generally a desirably quality of a filesystem. On a spinning disk, -keeping related blocks near each other reduces the amount of movement -that the head actuator and disk must perform to access a data block, -thus speeding up disk IO. On an SSD there of course are no moving parts, -but locality can increase the size of each transfer request while -reducing the total number of requests. This locality may also have the -effect of concentrating writes on a single erase block, which can speed -up file rewrites significantly. Therefore, it is useful to reduce -fragmentation whenever possible. - -The first tool that ext4 uses to combat fragmentation is the multi-block -allocator. When a file is first created, the block allocator -speculatively allocates 8KiB of disk space to the file on the assumption -that the space will get written soon. 
When the file is closed, the -unused speculative allocations are of course freed, but if the -speculation is correct (typically the case for full writes of small -files) then the file data gets written out in a single multi-block -extent. A second related trick that ext4 uses is delayed allocation. -Under this scheme, when a file needs more blocks to absorb file writes, -the filesystem defers deciding the exact placement on the disk until all -the dirty buffers are being written out to disk. By not committing to a -particular placement until it's absolutely necessary (the commit timeout -is hit, or sync() is called, or the kernel runs out of memory), the hope -is that the filesystem can make better location decisions. - -The third trick that ext4 (and ext3) uses is that it tries to keep a -file's data blocks in the same block group as its inode. This cuts down -on the seek penalty when the filesystem first has to read a file's inode -to learn where the file's data blocks live and then seek over to the -file's data blocks to begin I/O operations. - -The fourth trick is that all the inodes in a directory are placed in the -same block group as the directory, when feasible. The working assumption -here is that all the files in a directory might be related, therefore it -is useful to try to keep them all together. - -The fifth trick is that the disk volume is cut up into 128MB block -groups; these mini-containers are used as outlined above to try to -maintain data locality. However, there is a deliberate quirk -- when a -directory is created in the root directory, the inode allocator scans -the block groups and puts that directory into the least heavily loaded -block group that it can find. This encourages directories to spread out -over a disk; as the top-level directory/file blobs fill up one block -group, the allocators simply move on to the next block group. 
Allegedly -this scheme evens out the loading on the block groups, though the author -suspects that the directories which are so unlucky as to land towards -the end of a spinning drive get a raw deal performance-wise. - -Of course if all of these mechanisms fail, one can always use e4defrag -to defragment files. diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst deleted file mode 100644 index 54386a010a8d..000000000000 --- a/Documentation/filesystems/ext4/ondisk/attributes.rst +++ /dev/null @@ -1,191 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Extended Attributes -------------------- - -Extended attributes (xattrs) are typically stored in a separate data -block on the disk and referenced from inodes via ``inode.i_file_acl*``. -The first use of extended attributes seems to have been for storing file -ACLs and other security data (selinux). With the ``user_xattr`` mount -option it is possible for users to store extended attributes so long as -all attribute names begin with “user”; this restriction seems to have -disappeared as of Linux 3.0. - -There are two places where extended attributes can be found. The first -place is between the end of each inode entry and the beginning of the -next inode entry. For example, if inode.i\_extra\_isize = 28 and -sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes -available for in-inode extended attribute storage. The second place -where extended attributes can be found is in the block pointed to by -``inode.i_file_acl``. As of Linux 3.11, it is not possible for this -block to contain a pointer to a second extended attribute block (or even -the remaining blocks of a cluster). In theory it is possible for each -attribute's value to be stored in a separate data block, though as of -Linux 3.11 the code does not permit this. - -Keys are generally assumed to be ASCIIZ strings, whereas values can be -strings or binary data. 
- -Extended attributes, when stored after the inode, have a header -``ext4_xattr_ibody_header`` that is 4 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_le32 - - h\_magic - - Magic number for identification, 0xEA020000. This value is set by the - Linux driver, though e2fsprogs doesn't seem to check it(?) - -The beginning of an extended attribute block is in -``struct ext4_xattr_header``, which is 32 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_le32 - - h\_magic - - Magic number for identification, 0xEA020000. - * - 0x4 - - \_\_le32 - - h\_refcount - - Reference count. - * - 0x8 - - \_\_le32 - - h\_blocks - - Number of disk blocks used. - * - 0xC - - \_\_le32 - - h\_hash - - Hash value of all attributes. - * - 0x10 - - \_\_le32 - - h\_checksum - - Checksum of the extended attribute block. - * - 0x14 - - \_\_u32 - - h\_reserved[2] - - Zero. - -The checksum is calculated against the FS UUID, the 64-bit block number -of the extended attribute block, and the entire block (header + -entries). - -Following the ``struct ext4_xattr_header`` or -``struct ext4_xattr_ibody_header`` is an array of -``struct ext4_xattr_entry``; each of these entries is at least 16 bytes -long. When stored in an external block, the ``struct ext4_xattr_entry`` -entries must be stored in sorted order. The sort order is -``e_name_index``, then ``e_name_len``, and finally ``e_name``. -Attributes stored inside an inode do not need be stored in sorted order. - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_u8 - - e\_name\_len - - Length of name. - * - 0x1 - - \_\_u8 - - e\_name\_index - - Attribute name index. There is a discussion of this below. 
- * - 0x2 - - \_\_le16 - - e\_value\_offs - - Location of this attribute's value on the disk block where it is stored. - Multiple attributes can share the same value. For an inode attribute - this value is relative to the start of the first entry; for a block this - value is relative to the start of the block (i.e. the header). - * - 0x4 - - \_\_le32 - - e\_value\_inum - - The inode where the value is stored. Zero indicates the value is in the - same block as this entry. This field is only used if the - INCOMPAT\_EA\_INODE feature is enabled. - * - 0x8 - - \_\_le32 - - e\_value\_size - - Length of attribute value. - * - 0xC - - \_\_le32 - - e\_hash - - Hash value of attribute name and attribute value. The kernel doesn't - update the hash for in-inode attributes, so for that case this value - must be zero, because e2fsck validates any non-zero hash regardless of - where the xattr lives. - * - 0x10 - - char - - e\_name[e\_name\_len] - - Attribute name. Does not include trailing NULL. - -Attribute values can follow the end of the entry table. There appears to -be a requirement that they be aligned to 4-byte boundaries. The values -are stored starting at the end of the block and grow towards the -xattr\_header/xattr\_entry table. When the two collide, the overflow is -put into a separate disk block. If the disk block fills up, the -filesystem returns -ENOSPC. - -The first four fields of the ``ext4_xattr_entry`` are set to zero to -mark the end of the key list. - -Attribute Name Indices -~~~~~~~~~~~~~~~~~~~~~~ - -Logically speaking, extended attributes are a series of key=value pairs. -The keys are assumed to be NULL-terminated strings. To reduce the amount -of on-disk space that the keys consume, the beginning of the key string -is matched against the attribute name index. If a match is found, the -attribute name index field is set, and matching string is removed from -the key name. Here is a map of name index values to key prefixes: - -.. 
list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Name Index - - Key Prefix - * - 0 - - (no prefix) - * - 1 - - “user.” - * - 2 - - “system.posix\_acl\_access” - * - 3 - - “system.posix\_acl\_default” - * - 4 - - “trusted.” - * - 6 - - “security.” - * - 7 - - “system.” (inline\_data only?) - * - 8 - - “system.richacl” (SuSE kernels only?) - -For example, if the attribute key is “user.fubar”, the attribute name -index is set to 1 and the “fubar” name is recorded on disk. - -POSIX ACLs -~~~~~~~~~~ - -POSIX ACLs are stored in a reduced version of the Linux kernel (and -libacl's) internal ACL format. The key difference is that the version -number is different (1) and the ``e_id`` field is only stored for named -user and group ACLs. diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/ondisk/bigalloc.rst deleted file mode 100644 index c6d88557553c..000000000000 --- a/Documentation/filesystems/ext4/ondisk/bigalloc.rst +++ /dev/null @@ -1,22 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Bigalloc --------- - -At the moment, the default size of a block is 4KiB, which is a commonly -supported page size on most MMU-capable hardware. This is fortunate, as -ext4 code is not prepared to handle the case where the block size -exceeds the page size. However, for a filesystem of mostly huge files, -it is desirable to be able to allocate disk blocks in units of multiple -blocks to reduce both fragmentation and metadata overhead. The -`bigalloc `__ feature provides exactly this ability. The -administrator can set a block cluster size at mkfs time (which is stored -in the s\_log\_cluster\_size field in the superblock); from then on, the -block bitmaps track clusters, not individual blocks. This means that -block groups can be several gigabytes in size (instead of just 128MiB); -however, the minimum allocation unit becomes a cluster, not a block, -even for directories. 
TaoBao had a patchset to extend the “use units of -clusters instead of blocks” to the extent tree, though it is not clear -where those patches went-- they eventually morphed into “extent tree v2” -but that code has not landed as of May 2015. - diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/ondisk/bitmaps.rst deleted file mode 100644 index c7546dbc197a..000000000000 --- a/Documentation/filesystems/ext4/ondisk/bitmaps.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Block and inode Bitmaps ------------------------ - -The data block bitmap tracks the usage of data blocks within the block -group. - -The inode bitmap records which entries in the inode table are in use. - -As with most bitmaps, one bit represents the usage status of one data -block or inode table entry. This implies a block group size of 8 \* -number\_of\_bytes\_in\_a\_logical\_block. - -NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts -of the kernel and e2fsprogs code pretends that the block bitmap contains -zeros (i.e. all blocks in the group are free). However, it is not -necessarily the case that no blocks are in use -- if ``meta_bg`` is set, -the bitmaps and group descriptor live inside the group. Unfortunately, -ext2fs\_test\_block\_bitmap2() will return '0' for those locations, -which produces confusing debugfs output. - -Inode Table ------------ -Inode tables are statically allocated at mkfs time. Each block group -descriptor points to the start of the table, and the superblock records -the number of inodes per group. See the section on inodes for more -information. diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/ondisk/blockgroup.rst deleted file mode 100644 index baf888e4c06a..000000000000 --- a/Documentation/filesystems/ext4/ondisk/blockgroup.rst +++ /dev/null @@ -1,135 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -Layout ------- - -The layout of a standard block group is approximately as follows (each -of these fields is discussed in a separate section below): - -.. list-table:: - :widths: 1 1 1 1 1 1 1 1 - :header-rows: 1 - - * - Group 0 Padding - - ext4 Super Block - - Group Descriptors - - Reserved GDT Blocks - - Data Block Bitmap - - inode Bitmap - - inode Table - - Data Blocks - * - 1024 bytes - - 1 block - - many blocks - - many blocks - - 1 block - - 1 block - - many blocks - - many more blocks - -For the special case of block group 0, the first 1024 bytes are unused, -to allow for the installation of x86 boot sectors and other oddities. -The superblock will start at offset 1024 bytes, whichever block that -happens to be (usually 0). However, if for some reason the block size = -1024, then block 0 is marked in use and the superblock goes in block 1. -For all other block groups, there is no padding. - -The ext4 driver primarily works with the superblock and the group -descriptors that are found in block group 0. Redundant copies of the -superblock and group descriptors are written to some of the block groups -across the disk in case the beginning of the disk gets trashed, though -not all block groups necessarily host a redundant copy (see following -paragraph for more details). If the group does not have a redundant -copy, the block group begins with the data block bitmap. Note also that -when the filesystem is freshly formatted, mkfs will allocate “reserve -GDT block” space after the block group descriptors and before the start -of the block bitmaps to allow for future expansion of the filesystem. By -default, a filesystem is allowed to increase in size by a factor of -1024x over the original filesystem size. - -The location of the inode table is given by ``grp.bg_inode_table_*``. It -is a continuous range of blocks large enough to contain -``sb.s_inodes_per_group * sb.s_inode_size`` bytes.
- -As for the ordering of items in a block group, it is generally -established that the super block and the group descriptor table, if -present, will be at the beginning of the block group. The bitmaps and -the inode table can be anywhere, and it is quite possible for the -bitmaps to come after the inode table, or for both to be in different -groups (flex\_bg). Leftover space is used for file data blocks, indirect -block maps, extent tree blocks, and extended attributes. - -Flexible Block Groups ---------------------- - -Starting in ext4, there is a new feature called flexible block groups -(flex\_bg). In a flex\_bg, several block groups are tied together as one -logical block group; the bitmap spaces and the inode table space in the -first block group of the flex\_bg are expanded to include the bitmaps -and inode tables of all other block groups in the flex\_bg. For example, -if the flex\_bg size is 4, then group 0 will contain (in order) the -superblock, group descriptors, data block bitmaps for groups 0-3, inode -bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining -space in group 0 is for file data. The effect of this is to group the -block metadata close together for faster loading, and to enable large -files to be continuous on disk. Backup copies of the superblock and -group descriptors are always at the beginning of block groups, even if -flex\_bg is enabled. The number of block groups that make up a flex\_bg -is given by 2 ^ ``sb.s_log_groups_per_flex``. - -Meta Block Groups ------------------ - -Without the option META\_BG, for safety concerns, all block group -descriptor copies are kept in the first block group. Given the default -128MiB (2^27 bytes) block group size and 64-byte group descriptors, ext4 -can have at most 2^27/64 = 2^21 block groups. This limits the entire -filesystem size to 2^21 ∗ 2^27 = 2^48 bytes or 256TiB.
- -The solution to this problem is to use the metablock group feature -(META\_BG), which is already in ext3 for all 2.6 releases. With the -META\_BG feature, ext4 filesystems are partitioned into many metablock -groups. Each metablock group is a cluster of block groups whose group -descriptor structures can be stored in a single disk block. For ext4 -filesystems with 4 KB block size, a single metablock group partition -includes 64 block groups, or 8 GiB of disk space. The metablock group -feature moves the location of the group descriptors from the congested -first block group of the whole filesystem into the first group of each -metablock group itself. The backups are in the second and last group of -each metablock group. This increases the 2^21 maximum block groups limit -to the hard limit 2^32, allowing support for a 512PiB filesystem. - -The change in the filesystem format replaces the current scheme where -the superblock is followed by a variable-length set of block group -descriptors. Instead, the superblock and a single block group descriptor -block are placed at the beginning of the first, second, and last block -groups in a meta-block group. A meta-block group is a collection of -block groups which can be described by a single block group descriptor -block. Since the size of the block group descriptor structure is 32 -bytes, a meta-block group contains 32 block groups for filesystems with -a 1KB block size, and 128 block groups for filesystems with a 4KB -blocksize. Filesystems can either be created using this new block group -descriptor layout, or existing filesystems can be resized on-line, and -the field s\_first\_meta\_bg in the superblock will indicate the first -block group using this new layout. - -Please see an important note about ``BLOCK_UNINIT`` in the section about -block and inode bitmaps.
- -Lazy Block Group Initialization -------------------------------- - -A new feature for ext4 are three block group descriptor flags that -enable mkfs to skip initializing other parts of the block group -metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean -that the inode and block bitmaps for that group can be calculated and -therefore the on-disk bitmap blocks are not initialized. This is -generally the case for an empty block group or a block group containing -only fixed-location block group metadata. The INODE\_ZEROED flag means -that the inode table has been initialized; mkfs will unset this flag and -rely on the kernel to initialize the inode tables in the background. - -By not writing zeroes to the bitmaps and inode table, mkfs time is -reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, -but the dumpe2fs output prints this as “uninit\_bg”. They are the same -thing. diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/ondisk/blockmap.rst deleted file mode 100644 index 30e25750d88a..000000000000 --- a/Documentation/filesystems/ext4/ondisk/blockmap.rst +++ /dev/null @@ -1,49 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| i.i\_block Offset | Where It Points | -+=====================+==============================================================================================================================================================================================================================+ -| 0 to 11 | Direct map to file blocks 0 to 11. 
| -+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) | -| | | -| | +------------------------------+--------------------------------------------------------------------+ | -| | | Indirect Block Offset | Where It Points | | -| | +==============================+====================================================================+ | -| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | -| | +------------------------------+--------------------------------------------------------------------+ | -+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) | -| | | -| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | -| | | Double Indirect Block Offset | Where It Points | | -| | +================================+=========================================================================================================+ | -| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | -| | | | | | -| | | | +------------------------------+--------------------------------------------------------------------+ | | -| | | | | Indirect Block Offset | Where It Points | | | -| | | | 
+==============================+====================================================================+ | | -| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | -| | | | +------------------------------+--------------------------------------------------------------------+ | | -| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | -+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) | -| | | -| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | -| | | Triple Indirect Block Offset | Where It Points | | -| | +================================+================================================================================================================================================+ | -| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | | -| | | | | | -| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | -| | | | | Double Indirect Block Offset | Where It Points | | | -| | | | +================================+=========================================================================================================+ | | -| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB 
blocks) | | | -| | | | | | | | | -| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | -| | | | | | | Indirect Block Offset | Where It Points | | | | -| | | | | | +==============================+====================================================================+ | | | -| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | -| | | | | | +------------------------------+--------------------------------------------------------------------+ | | | -| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | -| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | -+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/ondisk/blocks.rst deleted file mode 100644 index 73d4dc0f7bda..000000000000 --- a/Documentation/filesystems/ext4/ondisk/blocks.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Blocks ------- - -ext4 allocates storage space in units of “blocks”. A block is a group of -sectors between 1KiB and 64KiB, and the number of sectors must be an -integral power of 2. Blocks are in turn grouped into larger units called -block groups. Block size is specified at mkfs time and typically is -4KiB. You may experience mounting problems if block size is greater than -page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory -pages). 
By default a filesystem can contain 2^32 blocks; if the '64bit' -feature is enabled, then a filesystem can have 2^64 blocks. - -For 32-bit filesystems, limits are as follows: - -.. list-table:: - :widths: 1 1 1 1 1 - :header-rows: 1 - - * - Item - - 1KiB - - 2KiB - - 4KiB - - 64KiB - * - Blocks - - 2^32 - - 2^32 - - 2^32 - - 2^32 - * - Inodes - - 2^32 - - 2^32 - - 2^32 - - 2^32 - * - File System Size - - 4TiB - - 8TiB - - 16TiB - - 256PiB - * - Blocks Per Block Group - - 8,192 - - 16,384 - - 32,768 - - 524,288 - * - Inodes Per Block Group - - 8,192 - - 16,384 - - 32,768 - - 524,288 - * - Block Group Size - - 8MiB - - 32MiB - - 128MiB - - 32GiB - * - Blocks Per File, Extents - - 2^32 - - 2^32 - - 2^32 - - 2^32 - * - Blocks Per File, Block Maps - - 16,843,020 - - 134,480,396 - - 1,074,791,436 - - 4,398,314,962,956 (really 2^32 due to field size limitations) - * - File Size, Extents - - 4TiB - - 8TiB - - 16TiB - - 256TiB - * - File Size, Block Maps - - 16GiB - - 256GiB - - 4TiB - - 256TiB - -For 64-bit filesystems, limits are as follows: - -.. list-table:: - :widths: 1 1 1 1 1 - :header-rows: 1 - - * - Item - - 1KiB - - 2KiB - - 4KiB - - 64KiB - * - Blocks - - 2^64 - - 2^64 - - 2^64 - - 2^64 - * - Inodes - - 2^32 - - 2^32 - - 2^32 - - 2^32 - * - File System Size - - 16ZiB - - 32ZiB - - 64ZiB - - 1YiB - * - Blocks Per Block Group - - 8,192 - - 16,384 - - 32,768 - - 524,288 - * - Inodes Per Block Group - - 8,192 - - 16,384 - - 32,768 - - 524,288 - * - Block Group Size - - 8MiB - - 32MiB - - 128MiB - - 32GiB - * - Blocks Per File, Extents - - 2^32 - - 2^32 - - 2^32 - - 2^32 - * - Blocks Per File, Block Maps - - 16,843,020 - - 134,480,396 - - 1,074,791,436 - - 4,398,314,962,956 (really 2^32 due to field size limitations) - * - File Size, Extents - - 4TiB - - 8TiB - - 16TiB - - 256TiB - * - File Size, Block Maps - - 16GiB - - 256GiB - - 4TiB - - 256TiB - -Note: Files not using extents (i.e. 
files using block maps) must be -placed within the first 2^32 blocks of a filesystem. Files with extents -must be placed within the first 2^48 blocks of a filesystem. It's not -clear what happens with larger filesystems. diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst deleted file mode 100644 index 5519e253810d..000000000000 --- a/Documentation/filesystems/ext4/ondisk/checksums.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Checksums ---------- - -Starting in early 2012, metadata checksums were added to all major ext4 -and jbd2 data structures. The associated feature flag is metadata\_csum. -The desired checksum algorithm is indicated in the superblock, though as -of October 2012 the only supported algorithm is crc32c. Some data -structures did not have space to fit a full 32-bit checksum, so only the -lower 16 bits are stored. Enabling the 64bit feature increases the data -structure size so that full 32-bit checksums can be stored for many data -structures. However, existing 32-bit filesystems cannot be extended to -enable 64bit mode, at least not without the experimental resize2fs -patches to do so. - -Existing filesystems can have checksumming added by running -``tune2fs -O metadata_csum`` against the underlying device. If tune2fs -encounters directory blocks that lack sufficient empty space to add a -checksum, it will request that you run ``e2fsck -D`` to have the -directories rebuilt with checksums. This has the added benefit of -removing slack space from the directory files and rebalancing the htree -indexes. If you \_ignore\_ this step, your directories will not be -protected by a checksum! - -The following table describes the data elements that go into each type -of checksum. The checksum function is whatever the superblock describes -(crc32c as of October 2013) unless noted otherwise. - -.. 
list-table:: - :widths: 20 8 50 - :header-rows: 1 - - * - Metadata - - Length - - Ingredients - * - Superblock - - \_\_le32 - - The entire superblock up to the checksum field. The UUID lives inside - the superblock. - * - MMP - - \_\_le32 - - UUID + the entire MMP block up to the checksum field. - * - Extended Attributes - - \_\_le32 - - UUID + the entire extended attribute block. The checksum field is set to - zero. - * - Directory Entries - - \_\_le32 - - UUID + inode number + inode generation + the directory block up to the - fake entry enclosing the checksum field. - * - HTREE Nodes - - \_\_le32 - - UUID + inode number + inode generation + all valid extents + HTREE tail. - The checksum field is set to zero. - * - Extents - - \_\_le32 - - UUID + inode number + inode generation + the entire extent block up to - the checksum field. - * - Bitmaps - - \_\_le32 or \_\_le16 - - UUID + the entire bitmap. Checksums are stored in the group descriptor, - and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) - * - Inodes - - \_\_le32 - - UUID + inode number + inode generation + the entire inode. The checksum - field is set to zero. Each inode has its own checksum. - * - Group Descriptors - - \_\_le16 - - If metadata\_csum, then UUID + group number + the entire descriptor; - else if gdt\_csum, then crc16(UUID + group number + the entire - descriptor). In all cases, only the lower 16 bits are stored. - diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst deleted file mode 100644 index 614034e24669..000000000000 --- a/Documentation/filesystems/ext4/ondisk/directory.rst +++ /dev/null @@ -1,426 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Directory Entries ------------------ - -In an ext4 filesystem, a directory is more or less a flat file that maps -an arbitrary byte string (usually ASCII) to an inode number on the -filesystem. 
There can be many directory entries across the filesystem -that reference the same inode number--these are known as hard links, and -that is why hard links cannot reference files on other filesystems. As -such, directory entries are found by reading the data block(s) -associated with a directory file for the particular directory entry that -is desired. - -Linear (Classic) Directories -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By default, each directory lists its entries in an “almost-linear” -array. I write “almost” because it's not a linear array in the memory -sense because directory entries are not split across filesystem blocks. -Therefore, it is more accurate to say that a directory is a series of -data blocks and that each block contains a linear array of directory -entries. The end of each per-block array is signified by reaching the -end of the block; the last entry in the block has a record length that -takes it all the way to the end of the block. The end of the entire -directory is of course signified by reaching the end of the file. Unused -directory entries are signified by inode = 0. By default the filesystem -uses ``struct ext4_dir_entry_2`` for directory entries unless the -“filetype” feature flag is not set, in which case it uses -``struct ext4_dir_entry``. - -The original directory entry format is ``struct ext4_dir_entry``, which -is at most 263 bytes long, though on disk you'll need to reference -``dirent.rec_len`` to know for sure. - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - inode - - Number of the inode that this directory entry points to. - * - 0x4 - - \_\_le16 - - rec\_len - - Length of this directory entry. Must be a multiple of 4. - * - 0x6 - - \_\_le16 - - name\_len - - Length of the file name. - * - 0x8 - - char - - name[EXT4\_NAME\_LEN] - - File name. 
- -Since file names cannot be longer than 255 bytes, the new directory -entry format shortens the rec\_len field and uses the space for a file -type flag, probably to avoid having to load every inode during directory -tree traversal. This format is ``ext4_dir_entry_2``, which is at most -263 bytes long, though on disk you'll need to reference -``dirent.rec_len`` to know for sure. - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - inode - - Number of the inode that this directory entry points to. - * - 0x4 - - \_\_le16 - - rec\_len - - Length of this directory entry. - * - 0x6 - - \_\_u8 - - name\_len - - Length of the file name. - * - 0x7 - - \_\_u8 - - file\_type - - File type code, see ftype_ table below. - * - 0x8 - - char - - name[EXT4\_NAME\_LEN] - - File name. - -.. _ftype: - -The directory file type is one of the following values: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x0 - - Unknown. - * - 0x1 - - Regular file. - * - 0x2 - - Directory. - * - 0x3 - - Character device file. - * - 0x4 - - Block device file. - * - 0x5 - - FIFO. - * - 0x6 - - Socket. - * - 0x7 - - Symbolic link. - -In order to add checksums to these classic directory blocks, a phony -``struct ext4_dir_entry`` is placed at the end of each leaf block to -hold the checksum. The directory entry is 12 bytes long. The inode -number and name\_len fields are set to zero to fool old software into -ignoring an apparently empty directory entry, and the checksum is stored -in the place where the name normally goes. The structure is -``struct ext4_dir_entry_tail``: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - det\_reserved\_zero1 - - Inode number, which must be zero. - * - 0x4 - - \_\_le16 - - det\_rec\_len - - Length of this directory entry, which must be 12. 
- * - 0x6 - - \_\_u8 - - det\_reserved\_zero2 - - Length of the file name, which must be zero. - * - 0x7 - - \_\_u8 - - det\_reserved\_ft - - File type, which must be 0xDE. - * - 0x8 - - \_\_le32 - - det\_checksum - - Directory leaf block checksum. - -The leaf directory block checksum is calculated against the FS UUID, the -directory's inode number, the directory's inode generation number, and -the entire directory entry block up to (but not including) the fake -directory entry. - -Hash Tree Directories -~~~~~~~~~~~~~~~~~~~~~ - -A linear array of directory entries isn't great for performance, so a -new feature was added to ext3 to provide a faster (but peculiar) -balanced tree keyed off a hash of the directory entry name. If the -EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a -hashed btree (htree) to organize and find directory entries. For -backwards read-only compatibility with ext2, this tree is actually -hidden inside the directory file, masquerading as “empty” directory data -blocks! It was stated previously that the end of the linear directory -entry table was signified with an entry pointing to inode 0; this is -(ab)used to fool the old linear-scan algorithm into thinking that the -rest of the directory block is empty so that it moves on. - -The root of the tree always lives in the first data block of the -directory. By ext2 custom, the '.' and '..' entries must appear at the -beginning of this first block, so they are put here as two -``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of -the root node contains metadata about the tree and finally a hash->block -map to find nodes that are lower in the htree. If -``dx_root.info.indirect_levels`` is non-zero then the htree has two -levels; the data block pointed to by the root node's map is an interior -node, which is indexed by a minor hash. 
Interior nodes in this tree -contain a zeroed out ``struct ext4_dir_entry_2`` followed by a -minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear -array of all ``struct ext4_dir_entry_2``; all of these entries -(presumably) hash to the same value. If there is an overflow, the -entries simply overflow into the next leaf node, and the -least-significant bit of the hash (in the interior node map) that gets -us to this next leaf node is set. - -To traverse the directory as a htree, the code calculates the hash of -the desired file name and uses it to find the corresponding block -number. If the tree is flat, the block is a linear array of directory -entries that can be searched; otherwise, the minor hash of the file name -is computed and used against this second block to find the corresponding -third block number. That third block number will be a linear array of -directory entries. - -To traverse the directory as a linear array (such as the old code does), -the code simply reads every data block in the directory. The blocks used -for the htree will appear to have no entries (aside from '.' and '..') -and so only the leaf nodes will appear to have any interesting content. - -The root of the htree is in ``struct dx_root``, which is the full length -of a data block: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_le32 - - dot.inode - - inode number of this directory. - * - 0x4 - - \_\_le16 - - dot.rec\_len - - Length of this record, 12. - * - 0x6 - - u8 - - dot.name\_len - - Length of the name, 1. - * - 0x7 - - u8 - - dot.file\_type - - File type of this entry, 0x2 (directory) (if the feature flag is set). - * - 0x8 - - char - - dot.name[4] - - “.\\0\\0\\0” - * - 0xC - - \_\_le32 - - dotdot.inode - - inode number of parent directory. - * - 0x10 - - \_\_le16 - - dotdot.rec\_len - - block\_size - 12. The record length is long enough to cover all htree - data.
- * - 0x12 - - u8 - - dotdot.name\_len - - Length of the name, 2. - * - 0x13 - - u8 - - dotdot.file\_type - - File type of this entry, 0x2 (directory) (if the feature flag is set). - * - 0x14 - - char - - dotdot\_name[4] - - “..\\0\\0” - * - 0x18 - - \_\_le32 - - struct dx\_root\_info.reserved\_zero - - Zero. - * - 0x1C - - u8 - - struct dx\_root\_info.hash\_version - - Hash type, see dirhash_ table below. - * - 0x1D - - u8 - - struct dx\_root\_info.info\_length - - Length of the tree information, 0x8. - * - 0x1E - - u8 - - struct dx\_root\_info.indirect\_levels - - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR - feature is set; cannot be larger than 2 otherwise. - * - 0x1F - - u8 - - struct dx\_root\_info.unused\_flags - - - * - 0x20 - - \_\_le16 - - limit - - Maximum number of dx\_entries that can follow this header, plus 1 for - the header itself. - * - 0x22 - - \_\_le16 - - count - - Actual number of dx\_entries that follow this header, plus 1 for the - header itself. - * - 0x24 - - \_\_le32 - - block - - The block number (within the directory file) that goes with hash=0. - * - 0x28 - - struct dx\_entry - - entries[0] - - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. - -.. _dirhash: - -The directory hash is one of the following values: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x0 - - Legacy. - * - 0x1 - - Half MD4. - * - 0x2 - - Tea. - * - 0x3 - - Legacy, unsigned. - * - 0x4 - - Half MD4, unsigned. - * - 0x5 - - Tea, unsigned. - -Interior nodes of an htree are recorded as ``struct dx_node``, which is -also the full length of a data block: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_le32 - - fake.inode - - Zero, to make it look like this entry is not in use. - * - 0x4 - - \_\_le16 - - fake.rec\_len - - The size of the block, in order to hide all of the dx\_node data. 
- * - 0x6 - - u8 - - name\_len - - Zero. There is no name for this “unused” directory entry. - * - 0x7 - - u8 - - file\_type - - Zero. There is no file type for this “unused” directory entry. - * - 0x8 - - \_\_le16 - - limit - - Maximum number of dx\_entries that can follow this header, plus 1 for - the header itself. - * - 0xA - - \_\_le16 - - count - - Actual number of dx\_entries that follow this header, plus 1 for the - header itself. - * - 0xC - - \_\_le32 - - block - - The block number (within the directory file) that goes with the lowest - hash value of this block. This value is stored in the parent block. - * - 0x10 - - struct dx\_entry - - entries[0] - - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. - -The hash maps that exist in both ``struct dx_root`` and -``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes -long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_le32 - - hash - - Hash code. - * - 0x4 - - \_\_le32 - - block - - Block number (within the directory file, not filesystem blocks) of the - next node in the htree. - -(If you think this is all quite clever and peculiar, so does the -author.) - -If metadata checksums are enabled, the last 8 bytes of the directory -block (precisely the length of one dx\_entry) are used to store a -``struct dx_tail``, which contains the checksum. The ``limit`` and -``count`` entries in the dx\_root/dx\_node structures are adjusted as -necessary to fit the dx\_tail into the block. If there is no space for -the dx\_tail, the user is notified to run e2fsck -D to rebuild the -directory index (which will ensure that there's space for the checksum). -The dx\_tail structure is 8 bytes long and looks like this: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - u32 - - dt\_reserved - - Zero.
- * - 0x4 - - \_\_le32 - - dt\_checksum - - Checksum of the htree directory block. - -The checksum is calculated against the FS UUID, the htree index header -(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in -use, and the tail block (dx\_tail). diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/ondisk/dynamic.rst deleted file mode 100644 index bb0c84333341..000000000000 --- a/Documentation/filesystems/ext4/ondisk/dynamic.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Dynamic Structures -================== - -Dynamic metadata are created on the fly when files and blocks are -allocated to files. - -.. include:: inodes.rst -.. include:: ifork.rst -.. include:: directory.rst -.. include:: attributes.rst diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/ondisk/eainode.rst deleted file mode 100644 index ecc0d01a0a72..000000000000 --- a/Documentation/filesystems/ext4/ondisk/eainode.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Large Extended Attribute Values -------------------------------- - -To enable ext4 to store extended attribute values that do not fit in the -inode or in the single extended attribute block attached to an inode, -the EA\_INODE feature allows us to store the value in the data blocks of -a regular file inode. This “EA inode” is linked only from the extended -attribute name index and must not appear in a directory entry. The -inode's i\_atime field is used to store a checksum of the xattr value; -and i\_ctime/i\_version store a 64-bit reference count, which enables -sharing of large xattr values between multiple owning inodes. 
For -backward compatibility with older versions of this feature, the -i\_mtime/i\_generation *may* store a back-reference to the inode number -and i\_generation of the **one** owning inode (in cases where the EA -inode is not referenced by multiple inodes) to verify that the EA inode -is the correct one being accessed. diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/ondisk/globals.rst deleted file mode 100644 index 368bf7662b96..000000000000 --- a/Documentation/filesystems/ext4/ondisk/globals.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Global Structures -================= - -The filesystem is sharded into a number of block groups, each of which -have static metadata at fixed locations. - -.. include:: super.rst -.. include:: group_descr.rst -.. include:: bitmaps.rst -.. include:: mmp.rst -.. include:: journal.rst diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst deleted file mode 100644 index 0f783ed88592..000000000000 --- a/Documentation/filesystems/ext4/ondisk/group_descr.rst +++ /dev/null @@ -1,170 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Block Group Descriptors ------------------------ - -Each block group on the filesystem has one of these descriptors -associated with it. As noted in the Layout section above, the group -descriptors (if present) are the second item in the block group. The -standard configuration is for each block group to contain a full copy of -the block group descriptor table unless the sparse\_super feature flag -is set. - -Notice how the group descriptor records the location of both bitmaps and -the inode table (i.e. they can float). This means that within a block -group, the only data structures with fixed locations are the superblock -and the group descriptor table. 
The flex\_bg mechanism uses this -property to group several block groups into a flex group and lay out all -of the groups' bitmaps and inode tables into one long run in the first -group of the flex group. - -If the meta\_bg feature flag is set, then several block groups are -grouped together into a meta group. Note that in the meta\_bg case, -however, the first and last two block groups within the larger meta -group contain only group descriptors for the groups inside the meta -group. - -flex\_bg and meta\_bg do not appear to be mutually exclusive features. - -In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the -block group descriptor was only 32 bytes long and therefore ends at -bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the -block group descriptor expands to at least the 64 bytes described below; -the size is stored in the superblock. - -If gdt\_csum is set and metadata\_csum is not set, the block group -checksum is the crc16 of the FS UUID, the group number, and the group -descriptor structure. If metadata\_csum is set, then the block group -checksum is the lower 16 bits of the checksum of the FS UUID, the group -number, and the group descriptor structure. Both block and inode bitmap -checksums are calculated against the FS UUID, the group number, and the -entire bitmap. - -The block group descriptor is laid out in ``struct ext4_group_desc``. - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - bg\_block\_bitmap\_lo - - Lower 32-bits of location of block bitmap. - * - 0x4 - - \_\_le32 - - bg\_inode\_bitmap\_lo - - Lower 32-bits of location of inode bitmap. - * - 0x8 - - \_\_le32 - - bg\_inode\_table\_lo - - Lower 32-bits of location of inode table. - * - 0xC - - \_\_le16 - - bg\_free\_blocks\_count\_lo - - Lower 16-bits of free block count. - * - 0xE - - \_\_le16 - - bg\_free\_inodes\_count\_lo - - Lower 16-bits of free inode count. 
- * - 0x10 - - \_\_le16 - - bg\_used\_dirs\_count\_lo - - Lower 16-bits of directory count. - * - 0x12 - - \_\_le16 - - bg\_flags - - Block group flags. See the bgflags_ table below. - * - 0x14 - - \_\_le32 - - bg\_exclude\_bitmap\_lo - - Lower 32-bits of location of snapshot exclusion bitmap. - * - 0x18 - - \_\_le16 - - bg\_block\_bitmap\_csum\_lo - - Lower 16-bits of the block bitmap checksum. - * - 0x1A - - \_\_le16 - - bg\_inode\_bitmap\_csum\_lo - - Lower 16-bits of the inode bitmap checksum. - * - 0x1C - - \_\_le16 - - bg\_itable\_unused\_lo - - Lower 16-bits of unused inode count. If set, we needn't scan past the - ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the - inode table for this group. - * - 0x1E - - \_\_le16 - - bg\_checksum - - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the - RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & - 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. - * - - - - - - - These fields only exist if the 64bit feature is enabled and s_desc_size - > 32. - * - 0x20 - - \_\_le32 - - bg\_block\_bitmap\_hi - - Upper 32-bits of location of block bitmap. - * - 0x24 - - \_\_le32 - - bg\_inode\_bitmap\_hi - - Upper 32-bits of location of inodes bitmap. - * - 0x28 - - \_\_le32 - - bg\_inode\_table\_hi - - Upper 32-bits of location of inodes table. - * - 0x2C - - \_\_le16 - - bg\_free\_blocks\_count\_hi - - Upper 16-bits of free block count. - * - 0x2E - - \_\_le16 - - bg\_free\_inodes\_count\_hi - - Upper 16-bits of free inode count. - * - 0x30 - - \_\_le16 - - bg\_used\_dirs\_count\_hi - - Upper 16-bits of directory count. - * - 0x32 - - \_\_le16 - - bg\_itable\_unused\_hi - - Upper 16-bits of unused inode count. - * - 0x34 - - \_\_le32 - - bg\_exclude\_bitmap\_hi - - Upper 32-bits of location of snapshot exclusion bitmap. - * - 0x38 - - \_\_le16 - - bg\_block\_bitmap\_csum\_hi - - Upper 16-bits of the block bitmap checksum. 
- * - 0x3A - - \_\_le16 - - bg\_inode\_bitmap\_csum\_hi - - Upper 16-bits of the inode bitmap checksum. - * - 0x3C - - \_\_u32 - - bg\_reserved - - Padding to 64 bytes. - -.. _bgflags: - -Block group flags can be any combination of the following: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). - * - 0x2 - - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT). - * - 0x4 - - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED). diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst deleted file mode 100644 index b9816d5a896b..000000000000 --- a/Documentation/filesystems/ext4/ondisk/ifork.rst +++ /dev/null @@ -1,194 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -The Contents of inode.i\_block ------------------------------- - -Depending on the type of file an inode describes, the 60 bytes of -storage in ``inode.i_block`` can be used in different ways. In general, -regular files and directories will use it for file block indexing -information, and special files will use it for special purposes. - -Symbolic Links -~~~~~~~~~~~~~~ - -The target of a symbolic link will be stored in this field if the target -string is less than 60 bytes long. Otherwise, either extents or block -maps will be used to allocate data blocks to store the link target. - -Direct/Indirect Block Addressing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In ext2/3, file block numbers were mapped to logical block numbers by -means of an (up to) three level 1-1 block map. To find the logical block -that stores a particular file block, the code would navigate through -this increasingly complicated structure. Notice that there is neither a -magic number nor a checksum to provide any level of confidence that the -block isn't full of garbage. - -.. ifconfig:: builder != 'latex' - - .. include:: blockmap.rst - -.. 
ifconfig:: builder == 'latex' - - [Table omitted because LaTeX doesn't support nested tables.] - -Note that with this block mapping scheme, it is necessary to fill out a -lot of mapping data even for a large contiguous file! This inefficiency -led to the creation of the extent mapping scheme, discussed below. - -Notice also that a file using this mapping scheme cannot be placed -higher than 2^32 blocks. - -Extent Tree -~~~~~~~~~~~ - -In ext4, the file to logical block map has been replaced with an extent -tree. Under the old scheme, allocating a contiguous run of 1,000 blocks -requires an indirect block to map all 1,000 entries; with extents, the -mapping is reduced to a single ``struct ext4_extent`` with -``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate -very large files with a single extent, at a considerable reduction in -metadata block use, and some improvement in disk efficiency. The inode -must have the extents flag (0x80000) set for this feature to be in -use. - -Extents are arranged as a tree. Each node of the tree begins with a -``struct ext4_extent_header``. If the node is an interior node -(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries`` -instances of ``struct ext4_extent_idx``; each of these index entries -points to a block containing more nodes in the extent tree. If the node -is a leaf node (``eh.eh_depth == 0``), then the header is followed by -``eh.eh_entries`` instances of ``struct ext4_extent``; these instances -point to the file's data blocks. The root node of the extent tree is -stored in ``inode.i_block``, which allows for the first four extents to -be recorded without the use of extra metadata blocks. - -The extent tree header is recorded in ``struct ext4_extent_header``, -which is 12 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le16 - - eh\_magic - - Magic number, 0xF30A.
- * - 0x2 - - \_\_le16 - - eh\_entries - - Number of valid entries following the header. - * - 0x4 - - \_\_le16 - - eh\_max - - Maximum number of entries that could follow the header. - * - 0x6 - - \_\_le16 - - eh\_depth - - Depth of this extent node in the extent tree. 0 = this extent node - points to data blocks; otherwise, this extent node points to other - extent nodes. The extent tree can be at most 5 levels deep: a logical - block number can be at most ``2^32``, and the smallest ``n`` that - satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. - * - 0x8 - - \_\_le32 - - eh\_generation - - Generation of the tree. (Used by Lustre, but not standard ext4). - -Internal nodes of the extent tree, also known as index nodes, are -recorded as ``struct ext4_extent_idx``, and are 12 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - ei\_block - - This index node covers file blocks from 'block' onward. - * - 0x4 - - \_\_le32 - - ei\_leaf\_lo - - Lower 32-bits of the block number of the extent node that is the next - level lower in the tree. The tree node pointed to can be either another - internal node or a leaf node, described below. - * - 0x8 - - \_\_le16 - - ei\_leaf\_hi - - Upper 16-bits of the previous field. - * - 0xA - - \_\_u16 - - ei\_unused - - - -Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, -and are also 12 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - ee\_block - - First file block number that this extent covers. - * - 0x4 - - \_\_le16 - - ee\_len - - Number of blocks covered by extent. If the value of this field is <= - 32768, the extent is initialized. If the value of the field is > 32768, - the extent is uninitialized and the actual extent length is ``ee_len`` - - 32768. 
Therefore, the maximum length of an initialized extent is 32768 - blocks, and the maximum length of an uninitialized extent is 32767. - * - 0x6 - - \_\_le16 - - ee\_start\_hi - - Upper 16-bits of the block number to which this extent points. - * - 0x8 - - \_\_le32 - - ee\_start\_lo - - Lower 32-bits of the block number to which this extent points. - -Prior to the introduction of metadata checksums, the extent header + -extent entries always left at least 4 bytes of unallocated space at the -end of each extent tree data block (because (2^x % 12) >= 4). Therefore, -the 32-bit checksum is inserted into this space. The 4 extents in the -inode do not need checksumming, since the inode is already checksummed. -The checksum is calculated against the FS UUID, the inode number, the -inode generation, and the entire extent block leading up to (but not -including) the checksum itself. - -``struct ext4_extent_tail`` is 4 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - eb\_checksum - - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) - -Inline Data -~~~~~~~~~~~ - -If the inline data feature is enabled for the filesystem and the flag is -set for the inode, it is possible that the first 60 bytes of the file -data are stored here. diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst deleted file mode 100644 index f7d082c3a435..000000000000 --- a/Documentation/filesystems/ext4/ondisk/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============================== -Data Structures and Algorithms -============================== -.. include:: about.rst -.. include:: overview.rst -.. include:: globals.rst -..
include:: dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/ondisk/inlinedata.rst deleted file mode 100644 index d1075178ce0b..000000000000 --- a/Documentation/filesystems/ext4/ondisk/inlinedata.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Inline Data ------------ - -The inline data feature was designed to handle the case that a file's -data is so tiny that it readily fits inside the inode, which -(theoretically) reduces disk block consumption and reduces seeks. If the -file is smaller than 60 bytes, then the data are stored inline in -``inode.i_block``. If the rest of the file would fit inside the extended -attribute space, then it might be found as an extended attribute -“system.data” within the inode body (“ibody EA”). This of course -constrains the amount of extended attributes one can attach to an inode. -If the data size increases beyond i\_block + ibody EA, a regular block -is allocated and the contents moved to that block. - -Pending a change to compact the extended attribute key used to store -inline data, one ought to be able to store 160 bytes of data in a -256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to -that, the limit was 156 bytes due to inefficient use of inode space. - -The inline data feature requires the presence of an extended attribute -for “system.data”, even if the attribute value is zero length. - -Inline Directories -~~~~~~~~~~~~~~~~~~ - -The first four bytes of i\_block are the inode number of the parent -directory. Following that is a 56-byte space for an array of directory -entries; see ``struct ext4_dir_entry``. If there is a “system.data” -attribute in the inode body, the EA value is an array of -``struct ext4_dir_entry`` as well. Note that for inline directories, the -i\_block and EA space are treated as separate dirent blocks; directory -entries cannot span the two. 
- -Inline directory entries are not checksummed, as the inode checksum -should protect all inline data contents. diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst deleted file mode 100644 index 6bd35e506b6f..000000000000 --- a/Documentation/filesystems/ext4/ondisk/inodes.rst +++ /dev/null @@ -1,576 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Index Nodes ------------ - -In a regular UNIX filesystem, the inode stores all the metadata -pertaining to the file (time stamps, block maps, extended attributes, -etc), not the directory entry. To find the information associated with a -file, one must traverse the directory files to find the directory entry -associated with a file, then load the inode to find the metadata for -that file. ext4 appears to cheat (for performance reasons) a little bit -by storing a copy of the file type (normally stored in the inode) in the -directory entry. (Compare all this to FAT, which stores all the file -information directly in the directory entry, but does not support hard -links and is in general more seek-happy than ext4 due to its simpler -block allocator and extensive use of linked lists.) - -The inode table is a linear array of ``struct ext4_inode``. The table is -sized to have enough blocks to store at least -``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the -block group containing an inode can be calculated as -``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the -group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There -is no inode 0. - -The inode checksum is calculated against the FS UUID, the inode number, -and the inode structure itself. - -The inode table entry is laid out in ``struct ext4_inode``. - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - :class: longtable - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le16 - - i\_mode - - File mode. See the table i_mode_ below. 
- * - 0x2 - - \_\_le16 - - i\_uid - - Lower 16-bits of Owner UID. - * - 0x4 - - \_\_le32 - - i\_size\_lo - - Lower 32-bits of size in bytes. - * - 0x8 - - \_\_le32 - - i\_atime - - Last access time, in seconds since the epoch. However, if the EA\_INODE - inode flag is set, this inode stores an extended attribute value and - this field contains the checksum of the value. - * - 0xC - - \_\_le32 - - i\_ctime - - Last inode change time, in seconds since the epoch. However, if the - EA\_INODE inode flag is set, this inode stores an extended attribute - value and this field contains the lower 32 bits of the attribute value's - reference count. - * - 0x10 - - \_\_le32 - - i\_mtime - - Last data modification time, in seconds since the epoch. However, if the - EA\_INODE inode flag is set, this inode stores an extended attribute - value and this field contains the number of the inode that owns the - extended attribute. - * - 0x14 - - \_\_le32 - - i\_dtime - - Deletion Time, in seconds since the epoch. - * - 0x18 - - \_\_le16 - - i\_gid - - Lower 16-bits of GID. - * - 0x1A - - \_\_le16 - - i\_links\_count - - Hard link count. Normally, ext4 does not permit an inode to have more - than 65,000 hard links. This applies to files as well as directories, - which means that there cannot be more than 64,998 subdirectories in a - directory (each subdirectory's '..' entry counts as a hard link, as does - the '.' entry in the directory itself). With the DIR\_NLINK feature - enabled, ext4 supports more than 64,998 subdirectories by setting this - field to 1 to indicate that the number of hard links is not known. - * - 0x1C - - \_\_le32 - - i\_blocks\_lo - - Lower 32-bits of “block” count. If the huge\_file feature flag is not - set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks - on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in - ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi - << 32)`` 512-byte blocks on disk. 
If huge\_file is set and - EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file - consumes (``i_blocks_lo + i_blocks_hi`` << 32) filesystem blocks on - disk. - * - 0x20 - - \_\_le32 - - i\_flags - - Inode flags. See the table i_flags_ below. - * - 0x24 - - 4 bytes - - i\_osd1 - - See the table i_osd1_ for more details. - * - 0x28 - - 60 bytes - - i\_block[EXT4\_N\_BLOCKS=15] - - Block map or extent tree. See the section “The Contents of inode.i\_block”. - * - 0x64 - - \_\_le32 - - i\_generation - - File version (for NFS). - * - 0x68 - - \_\_le32 - - i\_file\_acl\_lo - - Lower 32-bits of extended attribute block. ACLs are of course one of - many possible extended attributes; I think the name of this field is a - result of the first use of extended attributes being for ACLs. - * - 0x6C - - \_\_le32 - - i\_size\_high / i\_dir\_acl - - Upper 32-bits of file/directory size. In ext2/3 this field was named - i\_dir\_acl, though it was usually set to zero and never used. - * - 0x70 - - \_\_le32 - - i\_obso\_faddr - - (Obsolete) fragment address. - * - 0x74 - - 12 bytes - - i\_osd2 - - See the table i_osd2_ for more details. - * - 0x80 - - \_\_le16 - - i\_extra\_isize - - Size of this inode - 128. Alternately, the size of the extended inode - fields beyond the original ext2 inode, including this field. - * - 0x82 - - \_\_le16 - - i\_checksum\_hi - - Upper 16-bits of the inode checksum. - * - 0x84 - - \_\_le32 - - i\_ctime\_extra - - Extra change time bits. This provides sub-second precision. See Inode - Timestamps section. - * - 0x88 - - \_\_le32 - - i\_mtime\_extra - - Extra modification time bits. This provides sub-second precision. - * - 0x8C - - \_\_le32 - - i\_atime\_extra - - Extra access time bits. This provides sub-second precision. - * - 0x90 - - \_\_le32 - - i\_crtime - - File creation time, in seconds since the epoch. - * - 0x94 - - \_\_le32 - - i\_crtime\_extra - - Extra file creation time bits. This provides sub-second precision. 
- * - 0x98 - - \_\_le32 - - i\_version\_hi - - Upper 32-bits for version number. - * - 0x9C - - \_\_le32 - - i\_projid - - Project ID. - -.. _i_mode: - -The ``i_mode`` value is a combination of the following flags: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - S\_IXOTH (Others may execute) - * - 0x2 - - S\_IWOTH (Others may write) - * - 0x4 - - S\_IROTH (Others may read) - * - 0x8 - - S\_IXGRP (Group members may execute) - * - 0x10 - - S\_IWGRP (Group members may write) - * - 0x20 - - S\_IRGRP (Group members may read) - * - 0x40 - - S\_IXUSR (Owner may execute) - * - 0x80 - - S\_IWUSR (Owner may write) - * - 0x100 - - S\_IRUSR (Owner may read) - * - 0x200 - - S\_ISVTX (Sticky bit) - * - 0x400 - - S\_ISGID (Set GID) - * - 0x800 - - S\_ISUID (Set UID) - * - - - These are mutually-exclusive file types: - * - 0x1000 - - S\_IFIFO (FIFO) - * - 0x2000 - - S\_IFCHR (Character device) - * - 0x4000 - - S\_IFDIR (Directory) - * - 0x6000 - - S\_IFBLK (Block device) - * - 0x8000 - - S\_IFREG (Regular file) - * - 0xA000 - - S\_IFLNK (Symbolic link) - * - 0xC000 - - S\_IFSOCK (Socket) - -.. _i_flags: - -The ``i_flags`` field is a combination of these values: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented) - * - 0x2 - - This file should be preserved, should undeletion be desired - (EXT4\_UNRM\_FL). (not implemented) - * - 0x4 - - File is compressed (EXT4\_COMPR\_FL). (not really implemented) - * - 0x8 - - All writes to the file must be synchronous (EXT4\_SYNC\_FL). - * - 0x10 - - File is immutable (EXT4\_IMMUTABLE\_FL). - * - 0x20 - - File can only be appended (EXT4\_APPEND\_FL). - * - 0x40 - - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). - * - 0x80 - - Do not update access time (EXT4\_NOATIME\_FL). - * - 0x100 - - Dirty compressed file (EXT4\_DIRTY\_FL). 
(not used) - * - 0x200 - - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) - * - 0x400 - - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) - * - 0x800 - - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was - EXT4\_ECOMPR\_FL (compression error), which was never used. - * - 0x1000 - - Directory has hashed indexes (EXT4\_INDEX\_FL). - * - 0x2000 - - AFS magic directory (EXT4\_IMAGIC\_FL). - * - 0x4000 - - File data must always be written through the journal - (EXT4\_JOURNAL\_DATA\_FL). - * - 0x8000 - - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) - * - 0x10000 - - All directory entry data should be written synchronously (see - ``dirsync``) (EXT4\_DIRSYNC\_FL). - * - 0x20000 - - Top of directory hierarchy (EXT4\_TOPDIR\_FL). - * - 0x40000 - - This is a huge file (EXT4\_HUGE\_FILE\_FL). - * - 0x80000 - - Inode uses extents (EXT4\_EXTENTS\_FL). - * - 0x200000 - - Inode stores a large extended attribute value in its data blocks - (EXT4\_EA\_INODE\_FL). - * - 0x400000 - - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). - (deprecated) - * - 0x01000000 - - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline) - * - 0x04000000 - - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in - mainline) - * - 0x08000000 - - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in - mainline) - * - 0x10000000 - - Inode has inline data (EXT4\_INLINE\_DATA\_FL). - * - 0x20000000 - - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). - * - 0x80000000 - - Reserved for ext4 library (EXT4\_RESERVED\_FL). - * - - - Aggregate flags: - * - 0x4BDFFF - - User-visible flags. - * - 0x4B80FF - - User-modifiable flags. 
Note that while EXT4\_JOURNAL\_DATA\_FL and - EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's - EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of - these flags in a special manner and they are masked out of the set of - flags that are saved directly to i\_flags. - -.. _i_osd1: - -The ``osd1`` field has multiple meanings depending on the creator: - -Linux: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - l\_i\_version - - Inode version. However, if the EA\_INODE inode flag is set, this inode - stores an extended attribute value and this field contains the upper 32 - bits of the attribute value's reference count. - -Hurd: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - h\_i\_translator - - ?? - -Masix: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - m\_i\_reserved - - ?? - -.. _i_osd2: - -The ``osd2`` field has multiple meanings depending on the filesystem creator: - -Linux: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le16 - - l\_i\_blocks\_high - - Upper 16-bits of the block count. Please see the note attached to - i\_blocks\_lo. - * - 0x2 - - \_\_le16 - - l\_i\_file\_acl\_high - - Upper 16-bits of the extended attribute block (historically, the file - ACL location). See the Extended Attributes section below. - * - 0x4 - - \_\_le16 - - l\_i\_uid\_high - - Upper 16-bits of the Owner UID. - * - 0x6 - - \_\_le16 - - l\_i\_gid\_high - - Upper 16-bits of the GID. - * - 0x8 - - \_\_le16 - - l\_i\_checksum\_lo - - Lower 16-bits of the inode checksum. - * - 0xA - - \_\_le16 - - l\_i\_reserved - - Unused. - -Hurd: - -.. 
list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le16 - - h\_i\_reserved1 - - ?? - * - 0x2 - - \_\_u16 - - h\_i\_mode\_high - - Upper 16-bits of the file mode. - * - 0x4 - - \_\_le16 - - h\_i\_uid\_high - - Upper 16-bits of the Owner UID. - * - 0x6 - - \_\_le16 - - h\_i\_gid\_high - - Upper 16-bits of the GID. - * - 0x8 - - \_\_u32 - - h\_i\_author - - Author code? - -Masix: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le16 - - h\_i\_reserved1 - - ?? - * - 0x2 - - \_\_u16 - - m\_i\_file\_acl\_high - - Upper 16-bits of the extended attribute block (historically, the file - ACL location). - * - 0x4 - - \_\_u32 - - m\_i\_reserved2[2] - - ?? - -Inode Size -~~~~~~~~~~ - -In ext2 and ext3, the inode structure size was fixed at 128 bytes -(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of -128 bytes. Starting with ext4, it is possible to allocate a larger -on-disk inode at format time for all inodes in the filesystem to provide -space beyond the end of the original ext2 inode. The on-disk inode -record size is recorded in the superblock as ``s_inode_size``. The -number of bytes actually used by struct ext4\_inode beyond the original -128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each -inode, which allows struct ext4\_inode to grow for a new kernel without -having to upgrade all of the on-disk inodes. Access to fields beyond -EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within -``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as -of October 2013) the inode structure is 156 bytes -(``i_extra_isize = 28``). The extra space between the end of the inode -structure and the end of the inode record can be used to store extended -attributes. Each inode record can be as large as the filesystem block -size, though this is not terribly efficient. 
- -Finding an Inode -~~~~~~~~~~~~~~~~ - -Each block group contains ``sb->s_inodes_per_group`` inodes. Because -inode 0 is defined not to exist, this formula can be used to find the -block group that an inode lives in: -``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode -can be found within the block group's inode table at -``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte -address within the inode table, use -``offset = index * sb->s_inode_size``. - -Inode Timestamps -~~~~~~~~~~~~~~~~ - -Four timestamps are recorded in the lower 128 bytes of the inode -structure -- inode change time (ctime), access time (atime), data -modification time (mtime), and deletion time (dtime). The four fields -are 32-bit signed integers that represent seconds since the Unix epoch -(1970-01-01 00:00:00 GMT), which means that the fields will overflow in -January 2038. For inodes that are not linked from any directory but are -still open (orphan inodes), the dtime field is overloaded for use with -the orphan list. The superblock field ``s_last_orphan`` points to the -first inode in the orphan list; dtime is then the number of the next -orphaned inode, or zero if there are no more orphans. - -If the inode structure size ``sb->s_inode_size`` is larger than 128 -bytes and the ``i_extra_isize`` field is large enough to encompass the -respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime -inode fields are widened to 64 bits. Within this “extra” 32-bit field, -the lower two bits are used to extend the 32-bit seconds field to be 34 -bits wide; the upper 30 bits are used to provide nanosecond timestamp -accuracy. Therefore, timestamps should not overflow until May 2446. -dtime was not widened. There is also a fifth timestamp to record inode -creation time (crtime); this field is 64-bits wide and decoded in the -same manner as 64-bit [cma]time.
Neither crtime nor dtime are accessible -through the regular stat() interface, though debugfs will report them. - -We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). -In other words: - -.. list-table:: - :widths: 20 20 20 20 20 - :header-rows: 1 - - * - Extra epoch bits - - MSB of 32-bit time - - Adjustment for signed 32-bit to 64-bit tv\_sec - - Decoded 64-bit tv\_sec - - valid time range - * - 0 0 - - 1 - - 0 - - ``-0x80000000 - -0x00000001`` - - 1901-12-13 to 1969-12-31 - * - 0 0 - - 0 - - 0 - - ``0x000000000 - 0x07fffffff`` - - 1970-01-01 to 2038-01-19 - * - 0 1 - - 1 - - 0x100000000 - - ``0x080000000 - 0x0ffffffff`` - - 2038-01-19 to 2106-02-07 - * - 0 1 - - 0 - - 0x100000000 - - ``0x100000000 - 0x17fffffff`` - - 2106-02-07 to 2174-02-25 - * - 1 0 - - 1 - - 0x200000000 - - ``0x180000000 - 0x1ffffffff`` - - 2174-02-25 to 2242-03-16 - * - 1 0 - - 0 - - 0x200000000 - - ``0x200000000 - 0x27fffffff`` - - 2242-03-16 to 2310-04-04 - * - 1 1 - - 1 - - 0x300000000 - - ``0x280000000 - 0x2ffffffff`` - - 2310-04-04 to 2378-04-22 - * - 1 1 - - 0 - - 0x300000000 - - ``0x300000000 - 0x37fffffff`` - - 2378-04-22 to 2446-05-10 - -This is a somewhat odd encoding since there are effectively seven times -as many positive values as negative values. There have also been -long-standing bugs decoding and encoding dates beyond 2038, which don't -seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels -incorrectly use the extra epoch bits 1,1 for dates between 1901 and -1970. At some point the kernel will be fixed and e2fsck will fix this -situation, assuming that it is run before 2310. diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst deleted file mode 100644 index ea613ee701f5..000000000000 --- a/Documentation/filesystems/ext4/ondisk/journal.rst +++ /dev/null @@ -1,611 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -Journal (jbd2) --------------- - -Introduced in ext3, the ext4 filesystem employs a journal to protect the -filesystem against corruption in the case of a system crash. A small -continuous region of disk (default 128MiB) is reserved inside the -filesystem as a place to land “important” data writes on-disk as quickly -as possible. Once the important data transaction is fully written to the -disk and flushed from the disk write cache, a record of the data being -committed is also written to the journal. At some later point in time, -the journal code writes the transactions to their final locations on -disk (this could involve a lot of seeking or a lot of small -read-write-erases) before erasing the commit record. Should the system -crash during the second slow write, the journal can be replayed all the -way to the latest commit record, guaranteeing the atomicity of whatever -gets written through the journal to the disk. The effect of this is to -guarantee that the filesystem does not become stuck midway through a -metadata update. - -For performance reasons, ext4 by default only writes filesystem metadata -through the journal. This means that file data blocks are /not/ -guaranteed to be in any consistent state after a crash. If this default -guarantee level (``data=ordered``) is not satisfactory, there is a mount -option to control journal behavior. If ``data=journal``, all data and -metadata are written to disk through the journal. This is slower but -safest. If ``data=writeback``, dirty data blocks are not flushed to the -disk before the metadata are written to disk through the journal. - -The journal inode is typically inode 8. The first 68 bytes of the -journal inode are replicated in the ext4 superblock. The journal itself -is a normal (but hidden) file within the filesystem. The file usually -consumes an entire block group, though mke2fs tries to put it in the -middle of the disk.
- -All fields in jbd2 are written to disk in big-endian order. This is the -opposite of ext4. - -NOTE: Both ext4 and ocfs2 use jbd2. - -The maximum size of a journal embedded in an ext4 filesystem is 2^32 -blocks. jbd2 itself does not seem to care. - -Layout -~~~~~~ - -Generally speaking, the journal has this format: - -.. list-table:: - :widths: 16 48 16 - :header-rows: 1 - - * - Superblock - - descriptor\_block (data\_blocks or revocation\_block) [more data or - revocations] commit\_block - - [more transactions...] - * - - - One transaction - - - -Notice that a transaction begins with either a descriptor and some data, -or a block revocation list. A finished transaction always ends with a -commit. If there is no commit record (or the checksums don't match), the -transaction will be discarded during replay. - -External Journal -~~~~~~~~~~~~~~~~ - -Optionally, an ext4 filesystem can be created with an external journal -device (as opposed to an internal journal, which uses a reserved inode). -In this case, on the filesystem device, ``s_journal_inum`` should be -zero and ``s_journal_uuid`` should be set. On the journal device there -will be an ext4 super block in the usual place, with a matching UUID. -The journal superblock will be in the next full block after the -superblock. - -.. list-table:: - :widths: 12 12 12 32 12 - :header-rows: 1 - - * - 1024 bytes of padding - - ext4 Superblock - - Journal Superblock - - descriptor\_block (data\_blocks or revocation\_block) [more data or - revocations] commit\_block - - [more transactions...] - * - - - - - - - One transaction - - - -Block Header -~~~~~~~~~~~~ - -Every block in the journal starts with a common 12-byte header -``struct journal_header_s``: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_be32 - - h\_magic - - jbd2 magic number, 0xC03B3998. - * - 0x4 - - \_\_be32 - - h\_blocktype - - Description of what this block contains.
See the jbd2_blocktype_ table - below. - * - 0x8 - - \_\_be32 - - h\_sequence - - The transaction ID that goes with this block. - -.. _jbd2_blocktype: - -The journal block type can be any one of: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 1 - - Descriptor. This block precedes a series of data blocks that were - written through the journal during a transaction. - * - 2 - - Block commit record. This block signifies the completion of a - transaction. - * - 3 - - Journal superblock, v1. - * - 4 - - Journal superblock, v2. - * - 5 - - Block revocation records. This speeds up recovery by enabling the - journal to skip writing blocks that were subsequently rewritten. - -Super Block -~~~~~~~~~~~ - -The super block for the journal is much simpler as compared to ext4's. -The key data kept within are size of the journal, and where to find the -start of the log of transactions. - -The journal superblock is recorded as ``struct journal_superblock_s``, -which is 1024 bytes long: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - - - - - - - Static information describing the journal. - * - 0x0 - - journal\_header\_t (12 bytes) - - s\_header - - Common header identifying this as a superblock. - * - 0xC - - \_\_be32 - - s\_blocksize - - Journal device block size. - * - 0x10 - - \_\_be32 - - s\_maxlen - - Total number of blocks in this journal. - * - 0x14 - - \_\_be32 - - s\_first - - First block of log information. - * - - - - - - - Dynamic information describing the current state of the log. - * - 0x18 - - \_\_be32 - - s\_sequence - - First commit ID expected in log. - * - 0x1C - - \_\_be32 - - s\_start - - Block number of the start of log. Contrary to the comments, this field - being zero does not imply that the journal is clean! - * - 0x20 - - \_\_be32 - - s\_errno - - Error value, as set by jbd2\_journal\_abort(). 
- * - - - - - - - The remaining fields are only valid in a v2 superblock. - * - 0x24 - - \_\_be32 - - s\_feature\_compat; - - Compatible feature set. See the table jbd2_compat_ below. - * - 0x28 - - \_\_be32 - - s\_feature\_incompat - - Incompatible feature set. See the table jbd2_incompat_ below. - * - 0x2C - - \_\_be32 - - s\_feature\_ro\_compat - - Read-only compatible feature set. There aren't any of these currently. - * - 0x30 - - \_\_u8 - - s\_uuid[16] - - 128-bit uuid for journal. This is compared against the copy in the ext4 - super block at mount time. - * - 0x40 - - \_\_be32 - - s\_nr\_users - - Number of file systems sharing this journal. - * - 0x44 - - \_\_be32 - - s\_dynsuper - - Location of dynamic super block copy. (Not used?) - * - 0x48 - - \_\_be32 - - s\_max\_transaction - - Limit of journal blocks per transaction. (Not used?) - * - 0x4C - - \_\_be32 - - s\_max\_trans\_data - - Limit of data blocks per transaction. (Not used?) - * - 0x50 - - \_\_u8 - - s\_checksum\_type - - Checksum algorithm used for the journal. See jbd2_checksum_type_ for - more info. - * - 0x51 - - \_\_u8[3] - - s\_padding2 - - - * - 0x54 - - \_\_u32 - - s\_padding[42] - - - * - 0xFC - - \_\_be32 - - s\_checksum - - Checksum of the entire superblock, with this field set to zero. - * - 0x100 - - \_\_u8 - - s\_users[16\*48] - - ids of all file systems sharing the log. e2fsprogs/Linux don't allow - shared external journals, but I imagine Lustre (or ocfs2?), which use - the jbd2 code, might. - -.. _jbd2_compat: - -The journal compat features are any combination of the following: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - Journal maintains checksums on the data blocks. - (JBD2\_FEATURE\_COMPAT\_CHECKSUM) - -.. _jbd2_incompat: - -The journal incompat features are any combination of the following: - -.. 
list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) - * - 0x2 - - Journal can deal with 64-bit block numbers. - (JBD2\_FEATURE\_INCOMPAT\_64BIT) - * - 0x4 - - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) - * - 0x8 - - This journal uses v2 of the checksum on-disk format. Each journal - metadata block gets its own checksum, and the block tags in the - descriptor table contain checksums for each of the data blocks in the - journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) - * - 0x10 - - This journal uses v3 of the checksum on-disk format. This is the same as - v2, but the journal block tag size is fixed regardless of the size of - block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) - -.. _jbd2_checksum_type: - -Journal checksum type codes are one of the following. crc32 or crc32c are the -most likely choices. - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 1 - - CRC32 - * - 2 - - MD5 - * - 3 - - SHA1 - * - 4 - - CRC32C - -Descriptor Block -~~~~~~~~~~~~~~~~ - -The descriptor block contains an array of journal block tags that -describe the final locations of the data blocks that follow in the -journal. Descriptor blocks are open-coded instead of being completely -described by a data structure, but here is the block structure anyway. -Descriptor blocks consume at least 36 bytes, but use a full block: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Descriptor - * - 0x0 - - journal\_header\_t - - (open coded) - - Common block header. - * - 0xC - - struct journal\_block\_tag\_s - - open coded array[] - - Enough tags either to fill up the block or to describe all the data - blocks that follow this descriptor block. - -Journal block tags have any of the following formats, depending on which -journal feature and block tag flags are set. 
- -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is -defined as ``struct journal_block_tag3_s``, which looks like the -following. The size is 16 or 32 bytes. - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Descriptor - * - 0x0 - - \_\_be32 - - t\_blocknr - - Lower 32-bits of the location of where the corresponding data block - should end up on disk. - * - 0x4 - - \_\_be32 - - t\_flags - - Flags that go with the descriptor. See the table jbd2_tag_flags_ for - more info. - * - 0x8 - - \_\_be32 - - t\_blocknr\_high - - Upper 32-bits of the location of where the corresponding data block - should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is - not enabled. - * - 0xC - - \_\_be32 - - t\_checksum - - Checksum of the journal UUID, the sequence number, and the data block. - * - - - - - - - This field appears to be open coded. It always comes at the end of the - tag, after t_checksum. This field is not present if the "same UUID" flag - is set. - * - 0x8 or 0xC - - char - - uuid[16] - - A UUID to go with this tag. This field appears to be copied from the - ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that - field. - -.. _jbd2_tag_flags: - -The journal tag flags are any combination of the following: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - On-disk block is escaped. The first four bytes of the data block just - happened to match the jbd2 magic number. - * - 0x2 - - This block has the same UUID as previous, therefore the UUID field is - omitted. - * - 0x4 - - The data block was deleted by the transaction. (Not used?) - * - 0x8 - - This is the last tag in this descriptor block. - -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag -is defined as ``struct journal_block_tag_s``, which looks like the -following. The size is 8, 12, 24, or 28 bytes: - -.. 
list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Descriptor - * - 0x0 - - \_\_be32 - - t\_blocknr - - Lower 32-bits of the location of where the corresponding data block - should end up on disk. - * - 0x4 - - \_\_be16 - - t\_checksum - - Checksum of the journal UUID, the sequence number, and the data block. - Note that only the lower 16 bits are stored. - * - 0x6 - - \_\_be16 - - t\_flags - - Flags that go with the descriptor. See the table jbd2_tag_flags_ for - more info. - * - - - - - - - This next field is only present if the super block indicates support for - 64-bit block numbers. - * - 0x8 - - \_\_be32 - - t\_blocknr\_high - - Upper 32-bits of the location of where the corresponding data block - should end up on disk. - * - - - - - - - This field appears to be open coded. It always comes at the end of the - tag, after t_flags or t_blocknr_high. This field is not present if the - "same UUID" flag is set. - * - 0x8 or 0xC - - char - - uuid[16] - - A UUID to go with this tag. This field appears to be copied from the - ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that - field. - -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or -JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a -``struct jbd2_journal_block_tail``, which looks like this: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Descriptor - * - 0x0 - - \_\_be32 - - t\_checksum - - Checksum of the journal UUID + the descriptor block, with this field set - to zero. - -Data Block -~~~~~~~~~~ - -In general, the data blocks being written to disk through the journal -are written verbatim into the journal file after the descriptor block. -However, if the first four bytes of the block match the jbd2 magic -number then those four bytes are replaced with zeroes and the “escaped” -flag is set in the descriptor block tag. 
- -Revocation Block -~~~~~~~~~~~~~~~~ - -A revocation block is used to prevent replay of a block in an earlier -transaction. This is used to mark blocks that were journalled at one -time but are no longer journalled. Typically this happens if a metadata -block is freed and re-allocated as a file data block; in this case, a -journal replay after the file block was written to disk will cause -corruption. - -**NOTE**: This mechanism is NOT used to express “this journal block is -superseded by this other journal block”, as the author (djwong) -mistakenly thought. Any block being added to a transaction will cause -the removal of all existing revocation records for that block. - -Revocation blocks are described in -``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in -length, but use a full block: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - journal\_header\_t - - r\_header - - Common block header. - * - 0xC - - \_\_be32 - - r\_count - - Number of bytes used in this block. - * - 0x10 - - \_\_be32 or \_\_be64 - - blocks[0] - - Blocks to revoke. - -After r\_count is a linear array of block numbers that are effectively -revoked by this transaction. The size of each block number is 8 bytes if -the superblock advertises 64-bit block number support, or 4 bytes -otherwise. - -If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or -JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation -block is a ``struct jbd2_journal_revoke_tail``, which has this format: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_be32 - - r\_checksum - - Checksum of the journal UUID + revocation block - -Commit Block -~~~~~~~~~~~~ - -The commit block is a sentry that indicates that a transaction has been -completely written to the journal. 
Once this commit block reaches the -journal, the data stored with this transaction can be written to their -final locations on disk. - -The commit block is described by ``struct commit_header``, which is 32 -bytes long (but uses a full block): - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Descriptor - * - 0x0 - - journal\_header\_s - - (open coded) - - Common block header. - * - 0xC - - unsigned char - - h\_chksum\_type - - The type of checksum to use to verify the integrity of the data blocks - in the transaction. See jbd2_checksum_type_ for more info. - * - 0xD - - unsigned char - - h\_chksum\_size - - The number of bytes used by the checksum. Most likely 4. - * - 0xE - - unsigned char - - h\_padding[2] - - - * - 0x10 - - \_\_be32 - - h\_chksum[JBD2\_CHECKSUM\_BYTES] - - 32 bytes of space to store checksums. If - JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 - are set, the first ``__be32`` is the checksum of the journal UUID and - the entire commit block, with this field zeroed. If - JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the - crc32 of all the blocks already written to the transaction. - * - 0x30 - - \_\_be64 - - h\_commit\_sec - - The time that the transaction was committed, in seconds since the epoch. - * - 0x38 - - \_\_be32 - - h\_commit\_nsec - - Nanoseconds component of the above timestamp. - diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst deleted file mode 100644 index 25660981d93c..000000000000 --- a/Documentation/filesystems/ext4/ondisk/mmp.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Multiple Mount Protection -------------------------- - -Multiple mount protection (MMP) is a feature that protects the -filesystem against multiple hosts trying to use the filesystem -simultaneously. 
When a filesystem is opened (for mounting, or fsck, -etc.), the MMP code running on the node (call it node A) checks a -sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the -open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then -fsck is (hopefully) running, and open fails immediately. Otherwise, the -open code will wait for twice the specified MMP check interval and check -the sequence number again. If the sequence number has changed, then the -filesystem is active on another machine and the open fails. If the MMP -code passes all of those checks, a new MMP sequence number is generated -and written to the MMP block, and the mount proceeds. - -While the filesystem is live, the kernel sets up a timer to re-check the -MMP block at the specified MMP check interval. To perform the re-check, -the MMP sequence number is re-read; if it does not match the in-memory -MMP sequence number, then another node (node B) has mounted the -filesystem, and node A remounts the filesystem read-only. If the -sequence numbers match, the sequence number is incremented both in -memory and on disk, and the re-check is complete. - -The hostname and device filename are written into the MMP block whenever -an open operation succeeds. The MMP code does not use these values; they -are provided purely for informational purposes. - -The checksum is calculated against the FS UUID and the MMP structure. -The MMP structure (``struct mmp_struct``) is as follows: - -.. list-table:: - :widths: 8 12 20 40 - :header-rows: 1 - - * - Offset - - Type - - Name - - Description - * - 0x0 - - \_\_le32 - - mmp\_magic - - Magic number for MMP, 0x004D4D50 (“MMP”). - * - 0x4 - - \_\_le32 - - mmp\_seq - - Sequence number, updated periodically. - * - 0x8 - - \_\_le64 - - mmp\_time - - Time that the MMP block was last updated. - * - 0x10 - - char[64] - - mmp\_nodename - - Hostname of the node that opened the filesystem. 
- * - 0x50 - - char[32] - - mmp\_bdevname - - Block device name of the filesystem. - * - 0x70 - - \_\_le16 - - mmp\_check\_interval - - The MMP re-check interval, in seconds. - * - 0x72 - - \_\_le16 - - mmp\_pad1 - - Zero. - * - 0x74 - - \_\_le32[226] - - mmp\_pad2 - - Zero. - * - 0x3FC - - \_\_le32 - - mmp\_checksum - - Checksum of the MMP block. diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/ondisk/overview.rst deleted file mode 100644 index cbab18baba12..000000000000 --- a/Documentation/filesystems/ext4/ondisk/overview.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -High Level Design -================= - -An ext4 file system is split into a series of block groups. To reduce -performance difficulties due to fragmentation, the block allocator tries -very hard to keep each file's blocks within the same group, thereby -reducing seek times. The size of a block group is specified in -``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 \* -``block_size_in_bytes``. With the default block size of 4KiB, each group -will contain 32,768 blocks, for a length of 128MiB. The number of block -groups is the size of the device divided by the size of a block group. - -All fields in ext4 are written to disk in little-endian order. HOWEVER, -all fields in jbd2 (the journal) are written to disk in big-endian -order. - -.. include:: blocks.rst -.. include:: blockgroup.rst -.. include:: special_inodes.rst -.. include:: allocators.rst -.. include:: checksums.rst -.. include:: bigalloc.rst -.. include:: inlinedata.rst -.. include:: eainode.rst diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst deleted file mode 100644 index 9061aabba827..000000000000 --- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst +++ /dev/null @@ -1,38 +0,0 @@ -..
SPDX-License-Identifier: GPL-2.0 - -Special inodes --------------- - -ext4 reserves some inodes for special features, as follows: - -.. list-table:: - :widths: 6 70 - :header-rows: 1 - - * - inode Number - - Purpose - * - 0 - - Doesn't exist; there is no inode 0. - * - 1 - - List of defective blocks. - * - 2 - - Root directory. - * - 3 - - User quota. - * - 4 - - Group quota. - * - 5 - - Boot loader. - * - 6 - - Undelete directory. - * - 7 - - Reserved group descriptors inode. (“resize inode”) - * - 8 - - Journal inode. - * - 9 - - The “exclude” inode, for snapshots(?) - * - 10 - - Replica inode, used for some non-upstream feature? - * - 11 - - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. - diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst deleted file mode 100644 index 04ff079a2acf..000000000000 --- a/Documentation/filesystems/ext4/ondisk/super.rst +++ /dev/null @@ -1,801 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Super Block ------------ - -The superblock records various information about the enclosing -filesystem, such as block counts, inode counts, supported features, -maintenance information, and more. - -If the sparse\_super feature flag is set, redundant copies of the -superblock and group descriptors are kept only in the groups whose group -number is either 0 or a power of 3, 5, or 7. If the flag is not set, -redundant copies are kept in all groups. - -The superblock checksum is calculated against the superblock structure, -which includes the FS UUID. - -The ext4 superblock is laid out as follows in -``struct ext4_super_block``: - -.. list-table:: - :widths: 8 8 24 40 - :header-rows: 1 - - * - Offset - - Size - - Name - - Description - * - 0x0 - - \_\_le32 - - s\_inodes\_count - - Total inode count. - * - 0x4 - - \_\_le32 - - s\_blocks\_count\_lo - - Total block count.
- * - 0x8 - - \_\_le32 - - s\_r\_blocks\_count\_lo - - This number of blocks can only be allocated by the super-user. - * - 0xC - - \_\_le32 - - s\_free\_blocks\_count\_lo - - Free block count. - * - 0x10 - - \_\_le32 - - s\_free\_inodes\_count - - Free inode count. - * - 0x14 - - \_\_le32 - - s\_first\_data\_block - - First data block. This must be at least 1 for 1k-block filesystems and - is typically 0 for all other block sizes. - * - 0x18 - - \_\_le32 - - s\_log\_block\_size - - Block size is 2 ^ (10 + s\_log\_block\_size). - * - 0x1C - - \_\_le32 - - s\_log\_cluster\_size - - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is - enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. - * - 0x20 - - \_\_le32 - - s\_blocks\_per\_group - - Blocks per group. - * - 0x24 - - \_\_le32 - - s\_clusters\_per\_group - - Clusters per group, if bigalloc is enabled. Otherwise - s\_clusters\_per\_group must equal s\_blocks\_per\_group. - * - 0x28 - - \_\_le32 - - s\_inodes\_per\_group - - Inodes per group. - * - 0x2C - - \_\_le32 - - s\_mtime - - Mount time, in seconds since the epoch. - * - 0x30 - - \_\_le32 - - s\_wtime - - Write time, in seconds since the epoch. - * - 0x34 - - \_\_le16 - - s\_mnt\_count - - Number of mounts since the last fsck. - * - 0x36 - - \_\_le16 - - s\_max\_mnt\_count - - Number of mounts beyond which a fsck is needed. - * - 0x38 - - \_\_le16 - - s\_magic - - Magic signature, 0xEF53 - * - 0x3A - - \_\_le16 - - s\_state - - File system state. See super_state_ for more info. - * - 0x3C - - \_\_le16 - - s\_errors - - Behaviour when detecting errors. See super_errors_ for more info. - * - 0x3E - - \_\_le16 - - s\_minor\_rev\_level - - Minor revision level. - * - 0x40 - - \_\_le32 - - s\_lastcheck - - Time of last check, in seconds since the epoch. - * - 0x44 - - \_\_le32 - - s\_checkinterval - - Maximum time between checks, in seconds. - * - 0x48 - - \_\_le32 - - s\_creator\_os - - Creator OS. 
See the table super_creator_ for more info. - * - 0x4C - - \_\_le32 - - s\_rev\_level - - Revision level. See the table super_revision_ for more info. - * - 0x50 - - \_\_le16 - - s\_def\_resuid - - Default uid for reserved blocks. - * - 0x52 - - \_\_le16 - - s\_def\_resgid - - Default gid for reserved blocks. - * - - - - - - - These fields are for EXT4_DYNAMIC_REV superblocks only. - - Note: the difference between the compatible feature set and the - incompatible feature set is that if there is a bit set in the - incompatible feature set that the kernel doesn't know about, it should - refuse to mount the filesystem. - - e2fsck's requirements are more strict; if it doesn't know - about a feature in either the compatible or incompatible feature set, it - must abort and not try to meddle with things it doesn't understand... - * - 0x54 - - \_\_le32 - - s\_first\_ino - - First non-reserved inode. - * - 0x58 - - \_\_le16 - - s\_inode\_size - - Size of inode structure, in bytes. - * - 0x5A - - \_\_le16 - - s\_block\_group\_nr - - Block group # of this superblock. - * - 0x5C - - \_\_le32 - - s\_feature\_compat - - Compatible feature set flags. Kernel can still read/write this fs even - if it doesn't understand a flag; fsck should not do that. See the - super_compat_ table for more info. - * - 0x60 - - \_\_le32 - - s\_feature\_incompat - - Incompatible feature set. If the kernel or fsck doesn't understand one - of these bits, it should stop. See the super_incompat_ table for more - info. - * - 0x64 - - \_\_le32 - - s\_feature\_ro\_compat - - Readonly-compatible feature set. If the kernel doesn't understand one of - these bits, it can still mount read-only. See the super_rocompat_ table - for more info. - * - 0x68 - - \_\_u8 - - s\_uuid[16] - - 128-bit UUID for volume. - * - 0x78 - - char - - s\_volume\_name[16] - - Volume label. - * - 0x88 - - char - - s\_last\_mounted[64] - - Directory where filesystem was last mounted. 
- * - 0xC8 - - \_\_le32 - - s\_algorithm\_usage\_bitmap - - For compression (Not used in e2fsprogs/Linux) - * - - - - - - - Performance hints. Directory preallocation should only happen if the - EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. - * - 0xCC - - \_\_u8 - - s\_prealloc\_blocks - - #. of blocks to try to preallocate for ... files? (Not used in - e2fsprogs/Linux) - * - 0xCD - - \_\_u8 - - s\_prealloc\_dir\_blocks - - #. of blocks to preallocate for directories. (Not used in - e2fsprogs/Linux) - * - 0xCE - - \_\_le16 - - s\_reserved\_gdt\_blocks - - Number of reserved GDT entries for future filesystem expansion. - * - - - - - - - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is - set. - * - 0xD0 - - \_\_u8 - - s\_journal\_uuid[16] - - UUID of journal superblock - * - 0xE0 - - \_\_le32 - - s\_journal\_inum - - inode number of journal file. - * - 0xE4 - - \_\_le32 - - s\_journal\_dev - - Device number of journal file, if the external journal feature flag is - set. - * - 0xE8 - - \_\_le32 - - s\_last\_orphan - - Start of list of orphaned inodes to delete. - * - 0xEC - - \_\_le32 - - s\_hash\_seed[4] - - HTREE hash seed. - * - 0xFC - - \_\_u8 - - s\_def\_hash\_version - - Default hash algorithm to use for directory hashes. See super_def_hash_ - for more info. - * - 0xFD - - \_\_u8 - - s\_jnl\_backup\_type - - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the - ``s_jnl_blocks`` field contains a duplicate copy of the inode's - ``i_block[]`` array and ``i_size``. - * - 0xFE - - \_\_le16 - - s\_desc\_size - - Size of group descriptors, in bytes, if the 64bit incompat feature flag - is set. - * - 0x100 - - \_\_le32 - - s\_default\_mount\_opts - - Default mount options. See the super_mountopts_ table for more info. - * - 0x104 - - \_\_le32 - - s\_first\_meta\_bg - - First metablock block group, if the meta\_bg feature is enabled. - * - 0x108 - - \_\_le32 - - s\_mkfs\_time - - When the filesystem was created, in seconds since the epoch. 
- * - 0x10C - - \_\_le32 - - s\_jnl\_blocks[17] - - Backup copy of the journal inode's ``i_block[]`` array in the first 15 - elements and i\_size\_high and i\_size in the 16th and 17th elements, - respectively. - * - - - - - - - 64bit support is valid only if EXT4_FEATURE_COMPAT_64BIT is set. - * - 0x150 - - \_\_le32 - - s\_blocks\_count\_hi - - High 32-bits of the block count. - * - 0x154 - - \_\_le32 - - s\_r\_blocks\_count\_hi - - High 32-bits of the reserved block count. - * - 0x158 - - \_\_le32 - - s\_free\_blocks\_count\_hi - - High 32-bits of the free block count. - * - 0x15C - - \_\_le16 - - s\_min\_extra\_isize - - All inodes have at least # bytes. - * - 0x15E - - \_\_le16 - - s\_want\_extra\_isize - - New inodes should reserve # bytes. - * - 0x160 - - \_\_le32 - - s\_flags - - Miscellaneous flags. See the super_flags_ table for more info. - * - 0x164 - - \_\_le16 - - s\_raid\_stride - - RAID stride. This is the number of logical blocks read from or written - to the disk before moving to the next disk. This affects the placement - of filesystem metadata, which will hopefully make RAID storage faster. - * - 0x166 - - \_\_le16 - - s\_mmp\_interval - - #. seconds to wait in multi-mount prevention (MMP) checking. In theory, - MMP is a mechanism to record in the superblock which host and device - have mounted the filesystem, in order to prevent multiple mounts. This - feature does not seem to be implemented... - * - 0x168 - - \_\_le64 - - s\_mmp\_block - - Block # for multi-mount protection data. - * - 0x170 - - \_\_le32 - - s\_raid\_stripe\_width - - RAID stripe width. This is the number of logical blocks read from or - written to the disk before coming back to the current disk. This is used - by the block allocator to try to reduce the number of read-modify-write - operations in a RAID5/6. - * - 0x174 - - \_\_u8 - - s\_log\_groups\_per\_flex - - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. 
- * - 0x175 - - \_\_u8 - - s\_checksum\_type - - Metadata checksum algorithm type. The only valid value is 1 (crc32c). - * - 0x176 - - \_\_le16 - - s\_reserved\_pad - - - * - 0x178 - - \_\_le64 - - s\_kbytes\_written - - Number of KiB written to this filesystem over its lifetime. - * - 0x180 - - \_\_le32 - - s\_snapshot\_inum - - inode number of active snapshot. (Not used in e2fsprogs/Linux.) - * - 0x184 - - \_\_le32 - - s\_snapshot\_id - - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) - * - 0x188 - - \_\_le64 - - s\_snapshot\_r\_blocks\_count - - Number of blocks reserved for active snapshot's future use. (Not used in - e2fsprogs/Linux.) - * - 0x190 - - \_\_le32 - - s\_snapshot\_list - - inode number of the head of the on-disk snapshot list. (Not used in - e2fsprogs/Linux.) - * - 0x194 - - \_\_le32 - - s\_error\_count - - Number of errors seen. - * - 0x198 - - \_\_le32 - - s\_first\_error\_time - - First time an error happened, in seconds since the epoch. - * - 0x19C - - \_\_le32 - - s\_first\_error\_ino - - inode involved in first error. - * - 0x1A0 - - \_\_le64 - - s\_first\_error\_block - - Number of block involved of first error. - * - 0x1A8 - - \_\_u8 - - s\_first\_error\_func[32] - - Name of function where the error happened. - * - 0x1C8 - - \_\_le32 - - s\_first\_error\_line - - Line number where error happened. - * - 0x1CC - - \_\_le32 - - s\_last\_error\_time - - Time of most recent error, in seconds since the epoch. - * - 0x1D0 - - \_\_le32 - - s\_last\_error\_ino - - inode involved in most recent error. - * - 0x1D4 - - \_\_le32 - - s\_last\_error\_line - - Line number where most recent error happened. - * - 0x1D8 - - \_\_le64 - - s\_last\_error\_block - - Number of block involved in most recent error. - * - 0x1E0 - - \_\_u8 - - s\_last\_error\_func[32] - - Name of function where the most recent error happened. - * - 0x200 - - \_\_u8 - - s\_mount\_opts[64] - - ASCIIZ string of mount options. 
- * - 0x240 - - \_\_le32 - - s\_usr\_quota\_inum - - Inode number of user `quota `__ file. - * - 0x244 - - \_\_le32 - - s\_grp\_quota\_inum - - Inode number of group `quota `__ file. - * - 0x248 - - \_\_le32 - - s\_overhead\_blocks - - Overhead blocks/clusters in fs. (Huh? This field is always zero, which - means that the kernel calculates it dynamically.) - * - 0x24C - - \_\_le32 - - s\_backup\_bgs[2] - - Block groups containing superblock backups (if sparse\_super2) - * - 0x254 - - \_\_u8 - - s\_encrypt\_algos[4] - - Encryption algorithms in use. There can be up to four algorithms in use - at any time; valid algorithm codes are given in the super_encrypt_ table - below. - * - 0x258 - - \_\_u8 - - s\_encrypt\_pw\_salt[16] - - Salt for the string2key algorithm for encryption. - * - 0x268 - - \_\_le32 - - s\_lpf\_ino - - Inode number of lost+found - * - 0x26C - - \_\_le32 - - s\_prj\_quota\_inum - - Inode that tracks project quotas. - * - 0x270 - - \_\_le32 - - s\_checksum\_seed - - Checksum seed used for metadata\_csum calculations. This value is - crc32c(~0, $orig\_fs\_uuid). - * - 0x274 - - \_\_u8 - - s\_wtime_hi - - Upper 8 bits of the s_wtime field. - * - 0x275 - - \_\_u8 - - s\_wtime_hi - - Upper 8 bits of the s_mtime field. - * - 0x276 - - \_\_u8 - - s\_mkfs_time_hi - - Upper 8 bits of the s_mkfs_time field. - * - 0x277 - - \_\_u8 - - s\_lastcheck_hi - - Upper 8 bits of the s_lastcheck_hi field. - * - 0x278 - - \_\_u8 - - s\_first_error_time_hi - - Upper 8 bits of the s_first_error_time_hi field. - * - 0x279 - - \_\_u8 - - s\_last_error_time_hi - - Upper 8 bits of the s_last_error_time_hi field. - * - 0x27A - - \_\_u8[2] - - s\_pad - - Zero padding. - * - 0x27C - - \_\_le32 - - s\_reserved[96] - - Padding to the end of the block. - * - 0x3FC - - \_\_le32 - - s\_checksum - - Superblock checksum. - -.. _super_state: - -The superblock state is some combination of the following: - -.. 
list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0x0001 - - Cleanly umounted - * - 0x0002 - - Errors detected - * - 0x0004 - - Orphans being recovered - -.. _super_errors: - -The superblock error policy is one of the following: - -.. list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 1 - - Continue - * - 2 - - Remount read-only - * - 3 - - Panic - -.. _super_creator: - -The filesystem creator is one of the following: - -.. list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0 - - Linux - * - 1 - - Hurd - * - 2 - - Masix - * - 3 - - FreeBSD - * - 4 - - Lites - -.. _super_revision: - -The superblock revision is one of the following: - -.. list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0 - - Original format - * - 1 - - v2 format w/ dynamic inode sizes - -Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem. - -.. _super_compat: - -The superblock compatible features field is a combination of any of the -following: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - Directory preallocation (COMPAT\_DIR\_PREALLOC). - * - 0x2 - - “imagic inodes”. Not clear from the code what this does - (COMPAT\_IMAGIC\_INODES). - * - 0x4 - - Has a journal (COMPAT\_HAS\_JOURNAL). - * - 0x8 - - Supports extended attributes (COMPAT\_EXT\_ATTR). - * - 0x10 - - Has reserved GDT blocks for filesystem expansion - (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. - * - 0x20 - - Has directory indices (COMPAT\_DIR\_INDEX). - * - 0x40 - - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized - block groups? (COMPAT\_LAZY\_BG) - * - 0x80 - - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). - * - 0x100 - - “Exclude bitmap”. Seems to be used to indicate the presence of - snapshot-related exclude bitmaps? Not defined in kernel or used in - e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). 
- * - 0x200 - - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs - points to the two block groups that contain backup superblocks - (COMPAT\_SPARSE\_SUPER2). - -.. _super_incompat: - -The superblock incompatible features field is a combination of any of the -following: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - Compression (INCOMPAT\_COMPRESSION). - * - 0x2 - - Directory entries record the file type. See ext4\_dir\_entry\_2 below - (INCOMPAT\_FILETYPE). - * - 0x4 - - Filesystem needs recovery (INCOMPAT\_RECOVER). - * - 0x8 - - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). - * - 0x10 - - Meta block groups. See the earlier discussion of this feature - (INCOMPAT\_META\_BG). - * - 0x40 - - Files in this filesystem use extents (INCOMPAT\_EXTENTS). - * - 0x80 - - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). - * - 0x100 - - Multiple mount protection. Not implemented (INCOMPAT\_MMP). - * - 0x200 - - Flexible block groups. See the earlier discussion of this feature - (INCOMPAT\_FLEX\_BG). - * - 0x400 - - Inodes can be used to store large extended attribute values - (INCOMPAT\_EA\_INODE). - * - 0x1000 - - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) - * - 0x2000 - - Metadata checksum seed is stored in the superblock. This feature enables - the administrator to change the UUID of a metadata\_csum filesystem - while the filesystem is mounted; without it, the checksum definition - requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). - * - 0x4000 - - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to - this feature, directories could not be larger than 4GiB and could not - have an htree more than 2 levels deep. If this feature is enabled, - directories can be larger than 4GiB and have a maximum htree depth of 3. - * - 0x8000 - - Data in inode (INCOMPAT\_INLINE\_DATA). 
- * - 0x10000 - - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). - -.. _super_rocompat: - -The superblock read-only compatible features field is a combination of any of -the following: - -.. list-table:: - :widths: 16 64 - :header-rows: 1 - - * - Value - - Description - * - 0x1 - - Sparse superblocks. See the earlier discussion of this feature - (RO\_COMPAT\_SPARSE\_SUPER). - * - 0x2 - - This filesystem has been used to store a file greater than 2GiB - (RO\_COMPAT\_LARGE\_FILE). - * - 0x4 - - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). - * - 0x8 - - This filesystem has files whose sizes are represented in units of - logical blocks, not 512-byte sectors. This implies a very large file - indeed! (RO\_COMPAT\_HUGE\_FILE) - * - 0x10 - - Group descriptors have checksums. In addition to detecting corruption, - this is useful for lazy formatting with uninitialized groups - (RO\_COMPAT\_GDT\_CSUM). - * - 0x20 - - Indicates that the old ext3 32,000 subdirectory limit no longer applies - (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 - if it is incremented past 64,999. - * - 0x40 - - Indicates that large inodes exist on this filesystem - (RO\_COMPAT\_EXTRA\_ISIZE). - * - 0x80 - - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). - * - 0x100 - - `Quota `__ (RO\_COMPAT\_QUOTA). - * - 0x200 - - This filesystem supports “bigalloc”, which means that file extents are - tracked in units of clusters (of blocks) instead of blocks - (RO\_COMPAT\_BIGALLOC). - * - 0x400 - - This filesystem supports metadata checksumming. - (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though - GDT\_CSUM must not be set) - * - 0x800 - - Filesystem supports replicas. This feature is neither in the kernel nor - e2fsprogs. (RO\_COMPAT\_REPLICA) - * - 0x1000 - - Read-only filesystem image; the kernel will not mount this image - read-write and most tools will refuse to write to the image. 
- (RO\_COMPAT\_READONLY) - * - 0x2000 - - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) - -.. _super_def_hash: - -The ``s_def_hash_version`` field is one of the following: - -.. list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0x0 - - Legacy. - * - 0x1 - - Half MD4. - * - 0x2 - - Tea. - * - 0x3 - - Legacy, unsigned. - * - 0x4 - - Half MD4, unsigned. - * - 0x5 - - Tea, unsigned. - -.. _super_mountopts: - -The ``s_default_mount_opts`` field is any combination of the following: - -.. list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0x0001 - - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) - * - 0x0002 - - New files take the gid of the containing directory (instead of the fsgid - of the current process). (EXT4\_DEFM\_BSDGROUPS) - * - 0x0004 - - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) - * - 0x0008 - - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) - * - 0x0010 - - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) - * - 0x0020 - - All data and metadata are commited to the journal. - (EXT4\_DEFM\_JMODE\_DATA) - * - 0x0040 - - All data are flushed to the disk before metadata are committed to the - journal. (EXT4\_DEFM\_JMODE\_ORDERED) - * - 0x0060 - - Data ordering is not preserved; data may be written after the metadata - has been written. (EXT4\_DEFM\_JMODE\_WBACK) - * - 0x0100 - - Disable write flushes. (EXT4\_DEFM\_NOBARRIER) - * - 0x0200 - - Track which blocks in a filesystem are metadata and therefore should not - be used as data blocks. This option will be enabled by default on 3.18, - hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) - * - 0x0400 - - Enable DISCARD support, where the storage device is told about blocks - becoming unused. (EXT4\_DEFM\_DISCARD) - * - 0x0800 - - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) - -.. _super_flags: - -The ``s_flags`` field is any combination of the following: - -.. 
list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0x0001 - - Signed directory hash in use. - * - 0x0002 - - Unsigned directory hash in use. - * - 0x0004 - - To test development code. - -.. _super_encrypt: - -The ``s_encrypt_algos`` list can contain any of the following: - -.. list-table:: - :widths: 8 72 - :header-rows: 1 - - * - Value - - Description - * - 0 - - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). - * - 1 - - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). - * - 2 - - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). - * - 3 - - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). - -Total size of the superblock is 1024 bytes. diff --git a/Documentation/filesystems/ext4/overview.rst b/Documentation/filesystems/ext4/overview.rst new file mode 100644 index 000000000000..cbab18baba12 --- /dev/null +++ b/Documentation/filesystems/ext4/overview.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +High Level Design +================= + +An ext4 file system is split into a series of block groups. To reduce +performance difficulties due to fragmentation, the block allocator tries +very hard to keep each file's blocks within the same group, thereby +reducing seek times. The size of a block group is specified in +``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 \* +``block_size_in_bytes``. With the default block size of 4KiB, each group +will contain 32,768 blocks, for a length of 128MiB. The number of block +groups is the size of the device divided by the size of a block group. + +All fields in ext4 are written to disk in little-endian order. HOWEVER, +all fields in jbd2 (the journal) are written to disk in big-endian +order. + +.. include:: blocks.rst +.. include:: blockgroup.rst +.. include:: special_inodes.rst +.. include:: allocators.rst +.. include:: checksums.rst +.. include:: bigalloc.rst +.. include:: inlinedata.rst +..
include:: eainode.rst diff --git a/Documentation/filesystems/ext4/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst new file mode 100644 index 000000000000..9061aabba827 --- /dev/null +++ b/Documentation/filesystems/ext4/special_inodes.rst @@ -0,0 +1,38 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Special inodes +-------------- + +ext4 reserves some inodes for special features, as follows: + +.. list-table:: + :widths: 6 70 + :header-rows: 1 + + * - inode Number + - Purpose + * - 0 + - Doesn't exist; there is no inode 0. + * - 1 + - List of defective blocks. + * - 2 + - Root directory. + * - 3 + - User quota. + * - 4 + - Group quota. + * - 5 + - Boot loader. + * - 6 + - Undelete directory. + * - 7 + - Reserved group descriptors inode. (“resize inode”) + * - 8 + - Journal inode. + * - 9 + - The “exclude” inode, for snapshots(?) + * - 10 + - Replica inode, used for some non-upstream feature? + * - 11 + - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. + diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst new file mode 100644 index 000000000000..04ff079a2acf --- /dev/null +++ b/Documentation/filesystems/ext4/super.rst @@ -0,0 +1,801 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Super Block +----------- + +The superblock records various information about the enclosing +filesystem, such as block counts, inode counts, supported features, +maintenance information, and more. + +If the sparse\_super feature flag is set, redundant copies of the +superblock and group descriptors are kept only in the groups whose group +number is either 0 or a power of 3, 5, or 7. If the flag is not set, +redundant copies are kept in all groups. + +The superblock checksum is calculated against the superblock structure, +which includes the FS UUID. + +The ext4 superblock is laid out as follows in +``struct ext4_super_block``: + +..
list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - s\_inodes\_count + - Total inode count. + * - 0x4 + - \_\_le32 + - s\_blocks\_count\_lo + - Total block count. + * - 0x8 + - \_\_le32 + - s\_r\_blocks\_count\_lo + - This number of blocks can only be allocated by the super-user. + * - 0xC + - \_\_le32 + - s\_free\_blocks\_count\_lo + - Free block count. + * - 0x10 + - \_\_le32 + - s\_free\_inodes\_count + - Free inode count. + * - 0x14 + - \_\_le32 + - s\_first\_data\_block + - First data block. This must be at least 1 for 1k-block filesystems and + is typically 0 for all other block sizes. + * - 0x18 + - \_\_le32 + - s\_log\_block\_size + - Block size is 2 ^ (10 + s\_log\_block\_size). + * - 0x1C + - \_\_le32 + - s\_log\_cluster\_size + - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is + enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. + * - 0x20 + - \_\_le32 + - s\_blocks\_per\_group + - Blocks per group. + * - 0x24 + - \_\_le32 + - s\_clusters\_per\_group + - Clusters per group, if bigalloc is enabled. Otherwise + s\_clusters\_per\_group must equal s\_blocks\_per\_group. + * - 0x28 + - \_\_le32 + - s\_inodes\_per\_group + - Inodes per group. + * - 0x2C + - \_\_le32 + - s\_mtime + - Mount time, in seconds since the epoch. + * - 0x30 + - \_\_le32 + - s\_wtime + - Write time, in seconds since the epoch. + * - 0x34 + - \_\_le16 + - s\_mnt\_count + - Number of mounts since the last fsck. + * - 0x36 + - \_\_le16 + - s\_max\_mnt\_count + - Number of mounts beyond which a fsck is needed. + * - 0x38 + - \_\_le16 + - s\_magic + - Magic signature, 0xEF53 + * - 0x3A + - \_\_le16 + - s\_state + - File system state. See super_state_ for more info. + * - 0x3C + - \_\_le16 + - s\_errors + - Behaviour when detecting errors. See super_errors_ for more info. + * - 0x3E + - \_\_le16 + - s\_minor\_rev\_level + - Minor revision level. 
+ * - 0x40 + - \_\_le32 + - s\_lastcheck + - Time of last check, in seconds since the epoch. + * - 0x44 + - \_\_le32 + - s\_checkinterval + - Maximum time between checks, in seconds. + * - 0x48 + - \_\_le32 + - s\_creator\_os + - Creator OS. See the table super_creator_ for more info. + * - 0x4C + - \_\_le32 + - s\_rev\_level + - Revision level. See the table super_revision_ for more info. + * - 0x50 + - \_\_le16 + - s\_def\_resuid + - Default uid for reserved blocks. + * - 0x52 + - \_\_le16 + - s\_def\_resgid + - Default gid for reserved blocks. + * - + - + - + - These fields are for EXT4_DYNAMIC_REV superblocks only. + + Note: the difference between the compatible feature set and the + incompatible feature set is that if there is a bit set in the + incompatible feature set that the kernel doesn't know about, it should + refuse to mount the filesystem. + + e2fsck's requirements are more strict; if it doesn't know + about a feature in either the compatible or incompatible feature set, it + must abort and not try to meddle with things it doesn't understand... + * - 0x54 + - \_\_le32 + - s\_first\_ino + - First non-reserved inode. + * - 0x58 + - \_\_le16 + - s\_inode\_size + - Size of inode structure, in bytes. + * - 0x5A + - \_\_le16 + - s\_block\_group\_nr + - Block group # of this superblock. + * - 0x5C + - \_\_le32 + - s\_feature\_compat + - Compatible feature set flags. Kernel can still read/write this fs even + if it doesn't understand a flag; fsck should not do that. See the + super_compat_ table for more info. + * - 0x60 + - \_\_le32 + - s\_feature\_incompat + - Incompatible feature set. If the kernel or fsck doesn't understand one + of these bits, it should stop. See the super_incompat_ table for more + info. + * - 0x64 + - \_\_le32 + - s\_feature\_ro\_compat + - Readonly-compatible feature set. If the kernel doesn't understand one of + these bits, it can still mount read-only. See the super_rocompat_ table + for more info. 
+ * - 0x68 + - \_\_u8 + - s\_uuid[16] + - 128-bit UUID for volume. + * - 0x78 + - char + - s\_volume\_name[16] + - Volume label. + * - 0x88 + - char + - s\_last\_mounted[64] + - Directory where filesystem was last mounted. + * - 0xC8 + - \_\_le32 + - s\_algorithm\_usage\_bitmap + - For compression (Not used in e2fsprogs/Linux) + * - + - + - + - Performance hints. Directory preallocation should only happen if the + EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + * - 0xCC + - \_\_u8 + - s\_prealloc\_blocks + - #. of blocks to try to preallocate for ... files? (Not used in + e2fsprogs/Linux) + * - 0xCD + - \_\_u8 + - s\_prealloc\_dir\_blocks + - #. of blocks to preallocate for directories. (Not used in + e2fsprogs/Linux) + * - 0xCE + - \_\_le16 + - s\_reserved\_gdt\_blocks + - Number of reserved GDT entries for future filesystem expansion. + * - + - + - + - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is + set. + * - 0xD0 + - \_\_u8 + - s\_journal\_uuid[16] + - UUID of journal superblock + * - 0xE0 + - \_\_le32 + - s\_journal\_inum + - inode number of journal file. + * - 0xE4 + - \_\_le32 + - s\_journal\_dev + - Device number of journal file, if the external journal feature flag is + set. + * - 0xE8 + - \_\_le32 + - s\_last\_orphan + - Start of list of orphaned inodes to delete. + * - 0xEC + - \_\_le32 + - s\_hash\_seed[4] + - HTREE hash seed. + * - 0xFC + - \_\_u8 + - s\_def\_hash\_version + - Default hash algorithm to use for directory hashes. See super_def_hash_ + for more info. + * - 0xFD + - \_\_u8 + - s\_jnl\_backup\_type + - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the + ``s_jnl_blocks`` field contains a duplicate copy of the inode's + ``i_block[]`` array and ``i_size``. + * - 0xFE + - \_\_le16 + - s\_desc\_size + - Size of group descriptors, in bytes, if the 64bit incompat feature flag + is set. + * - 0x100 + - \_\_le32 + - s\_default\_mount\_opts + - Default mount options. See the super_mountopts_ table for more info. 
+ * - 0x104 + - \_\_le32 + - s\_first\_meta\_bg + - First metablock block group, if the meta\_bg feature is enabled. + * - 0x108 + - \_\_le32 + - s\_mkfs\_time + - When the filesystem was created, in seconds since the epoch. + * - 0x10C + - \_\_le32 + - s\_jnl\_blocks[17] + - Backup copy of the journal inode's ``i_block[]`` array in the first 15 + elements and i\_size\_high and i\_size in the 16th and 17th elements, + respectively. + * - + - + - + - 64bit support is valid only if EXT4_FEATURE_INCOMPAT_64BIT is set. + * - 0x150 + - \_\_le32 + - s\_blocks\_count\_hi + - High 32-bits of the block count. + * - 0x154 + - \_\_le32 + - s\_r\_blocks\_count\_hi + - High 32-bits of the reserved block count. + * - 0x158 + - \_\_le32 + - s\_free\_blocks\_count\_hi + - High 32-bits of the free block count. + * - 0x15C + - \_\_le16 + - s\_min\_extra\_isize + - All inodes have at least # bytes. + * - 0x15E + - \_\_le16 + - s\_want\_extra\_isize + - New inodes should reserve # bytes. + * - 0x160 + - \_\_le32 + - s\_flags + - Miscellaneous flags. See the super_flags_ table for more info. + * - 0x164 + - \_\_le16 + - s\_raid\_stride + - RAID stride. This is the number of logical blocks read from or written + to the disk before moving to the next disk. This affects the placement + of filesystem metadata, which will hopefully make RAID storage faster. + * - 0x166 + - \_\_le16 + - s\_mmp\_interval + - #. seconds to wait in multi-mount prevention (MMP) checking. In theory, + MMP is a mechanism to record in the superblock which host and device + have mounted the filesystem, in order to prevent multiple mounts. This + feature does not seem to be implemented... + * - 0x168 + - \_\_le64 + - s\_mmp\_block + - Block # for multi-mount protection data. + * - 0x170 + - \_\_le32 + - s\_raid\_stripe\_width + - RAID stripe width. This is the number of logical blocks read from or + written to the disk before coming back to the current disk.
This is used + by the block allocator to try to reduce the number of read-modify-write + operations in a RAID5/6. + * - 0x174 + - \_\_u8 + - s\_log\_groups\_per\_flex + - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. + * - 0x175 + - \_\_u8 + - s\_checksum\_type + - Metadata checksum algorithm type. The only valid value is 1 (crc32c). + * - 0x176 + - \_\_le16 + - s\_reserved\_pad + - + * - 0x178 + - \_\_le64 + - s\_kbytes\_written + - Number of KiB written to this filesystem over its lifetime. + * - 0x180 + - \_\_le32 + - s\_snapshot\_inum + - inode number of active snapshot. (Not used in e2fsprogs/Linux.) + * - 0x184 + - \_\_le32 + - s\_snapshot\_id + - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) + * - 0x188 + - \_\_le64 + - s\_snapshot\_r\_blocks\_count + - Number of blocks reserved for active snapshot's future use. (Not used in + e2fsprogs/Linux.) + * - 0x190 + - \_\_le32 + - s\_snapshot\_list + - inode number of the head of the on-disk snapshot list. (Not used in + e2fsprogs/Linux.) + * - 0x194 + - \_\_le32 + - s\_error\_count + - Number of errors seen. + * - 0x198 + - \_\_le32 + - s\_first\_error\_time + - First time an error happened, in seconds since the epoch. + * - 0x19C + - \_\_le32 + - s\_first\_error\_ino + - inode involved in first error. + * - 0x1A0 + - \_\_le64 + - s\_first\_error\_block + - Number of block involved in first error. + * - 0x1A8 + - \_\_u8 + - s\_first\_error\_func[32] + - Name of function where the error happened. + * - 0x1C8 + - \_\_le32 + - s\_first\_error\_line + - Line number where error happened. + * - 0x1CC + - \_\_le32 + - s\_last\_error\_time + - Time of most recent error, in seconds since the epoch. + * - 0x1D0 + - \_\_le32 + - s\_last\_error\_ino + - inode involved in most recent error. + * - 0x1D4 + - \_\_le32 + - s\_last\_error\_line + - Line number where most recent error happened. + * - 0x1D8 + - \_\_le64 + - s\_last\_error\_block + - Number of block involved in most recent error.
+ * - 0x1E0 + - \_\_u8 + - s\_last\_error\_func[32] + - Name of function where the most recent error happened. + * - 0x200 + - \_\_u8 + - s\_mount\_opts[64] + - ASCIIZ string of mount options. + * - 0x240 + - \_\_le32 + - s\_usr\_quota\_inum + - Inode number of user `quota `__ file. + * - 0x244 + - \_\_le32 + - s\_grp\_quota\_inum + - Inode number of group `quota `__ file. + * - 0x248 + - \_\_le32 + - s\_overhead\_blocks + - Overhead blocks/clusters in fs. (Huh? This field is always zero, which + means that the kernel calculates it dynamically.) + * - 0x24C + - \_\_le32 + - s\_backup\_bgs[2] + - Block groups containing superblock backups (if sparse\_super2) + * - 0x254 + - \_\_u8 + - s\_encrypt\_algos[4] + - Encryption algorithms in use. There can be up to four algorithms in use + at any time; valid algorithm codes are given in the super_encrypt_ table + below. + * - 0x258 + - \_\_u8 + - s\_encrypt\_pw\_salt[16] + - Salt for the string2key algorithm for encryption. + * - 0x268 + - \_\_le32 + - s\_lpf\_ino + - Inode number of lost+found + * - 0x26C + - \_\_le32 + - s\_prj\_quota\_inum + - Inode that tracks project quotas. + * - 0x270 + - \_\_le32 + - s\_checksum\_seed + - Checksum seed used for metadata\_csum calculations. This value is + crc32c(~0, $orig\_fs\_uuid). + * - 0x274 + - \_\_u8 + - s\_wtime_hi + - Upper 8 bits of the s_wtime field. + * - 0x275 + - \_\_u8 + - s\_mtime_hi + - Upper 8 bits of the s_mtime field. + * - 0x276 + - \_\_u8 + - s\_mkfs_time_hi + - Upper 8 bits of the s_mkfs_time field. + * - 0x277 + - \_\_u8 + - s\_lastcheck_hi + - Upper 8 bits of the s_lastcheck field. + * - 0x278 + - \_\_u8 + - s\_first_error_time_hi + - Upper 8 bits of the s_first_error_time field. + * - 0x279 + - \_\_u8 + - s\_last_error_time_hi + - Upper 8 bits of the s_last_error_time field. + * - 0x27A + - \_\_u8[2] + - s\_pad + - Zero padding. + * - 0x27C + - \_\_le32 + - s\_reserved[96] + - Padding to the end of the block.
+ * - 0x3FC + - \_\_le32 + - s\_checksum + - Superblock checksum. + +.. _super_state: + +The superblock state is some combination of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Cleanly umounted + * - 0x0002 + - Errors detected + * - 0x0004 + - Orphans being recovered + +.. _super_errors: + +The superblock error policy is one of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 1 + - Continue + * - 2 + - Remount read-only + * - 3 + - Panic + +.. _super_creator: + +The filesystem creator is one of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Linux + * - 1 + - Hurd + * - 2 + - Masix + * - 3 + - FreeBSD + * - 4 + - Lites + +.. _super_revision: + +The superblock revision is one of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Original format + * - 1 + - v2 format w/ dynamic inode sizes + +Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem. + +.. _super_compat: + +The superblock compatible features field is a combination of any of the +following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Directory preallocation (COMPAT\_DIR\_PREALLOC). + * - 0x2 + - “imagic inodes”. Not clear from the code what this does + (COMPAT\_IMAGIC\_INODES). + * - 0x4 + - Has a journal (COMPAT\_HAS\_JOURNAL). + * - 0x8 + - Supports extended attributes (COMPAT\_EXT\_ATTR). + * - 0x10 + - Has reserved GDT blocks for filesystem expansion + (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. + * - 0x20 + - Has directory indices (COMPAT\_DIR\_INDEX). + * - 0x40 + - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized + block groups? (COMPAT\_LAZY\_BG) + * - 0x80 + - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). + * - 0x100 + - “Exclude bitmap”. 
Seems to be used to indicate the presence of + snapshot-related exclude bitmaps? Not defined in kernel or used in + e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). + * - 0x200 + - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs + points to the two block groups that contain backup superblocks + (COMPAT\_SPARSE\_SUPER2). + +.. _super_incompat: + +The superblock incompatible features field is a combination of any of the +following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Compression (INCOMPAT\_COMPRESSION). + * - 0x2 + - Directory entries record the file type. See ext4\_dir\_entry\_2 below + (INCOMPAT\_FILETYPE). + * - 0x4 + - Filesystem needs recovery (INCOMPAT\_RECOVER). + * - 0x8 + - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). + * - 0x10 + - Meta block groups. See the earlier discussion of this feature + (INCOMPAT\_META\_BG). + * - 0x40 + - Files in this filesystem use extents (INCOMPAT\_EXTENTS). + * - 0x80 + - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). + * - 0x100 + - Multiple mount protection. Not implemented (INCOMPAT\_MMP). + * - 0x200 + - Flexible block groups. See the earlier discussion of this feature + (INCOMPAT\_FLEX\_BG). + * - 0x400 + - Inodes can be used to store large extended attribute values + (INCOMPAT\_EA\_INODE). + * - 0x1000 + - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) + * - 0x2000 + - Metadata checksum seed is stored in the superblock. This feature enables + the administrator to change the UUID of a metadata\_csum filesystem + while the filesystem is mounted; without it, the checksum definition + requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). + * - 0x4000 + - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to + this feature, directories could not be larger than 4GiB and could not + have an htree more than 2 levels deep. 
If this feature is enabled, + directories can be larger than 4GiB and have a maximum htree depth of 3. + * - 0x8000 + - Data in inode (INCOMPAT\_INLINE\_DATA). + * - 0x10000 + - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). + +.. _super_rocompat: + +The superblock read-only compatible features field is a combination of any of +the following: + +.. list-table:: + :widths: 16 64 + :header-rows: 1 + + * - Value + - Description + * - 0x1 + - Sparse superblocks. See the earlier discussion of this feature + (RO\_COMPAT\_SPARSE\_SUPER). + * - 0x2 + - This filesystem has been used to store a file greater than 2GiB + (RO\_COMPAT\_LARGE\_FILE). + * - 0x4 + - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). + * - 0x8 + - This filesystem has files whose sizes are represented in units of + logical blocks, not 512-byte sectors. This implies a very large file + indeed! (RO\_COMPAT\_HUGE\_FILE) + * - 0x10 + - Group descriptors have checksums. In addition to detecting corruption, + this is useful for lazy formatting with uninitialized groups + (RO\_COMPAT\_GDT\_CSUM). + * - 0x20 + - Indicates that the old ext3 32,000 subdirectory limit no longer applies + (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 + if it is incremented past 64,999. + * - 0x40 + - Indicates that large inodes exist on this filesystem + (RO\_COMPAT\_EXTRA\_ISIZE). + * - 0x80 + - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). + * - 0x100 + - Quota (RO\_COMPAT\_QUOTA). + * - 0x200 + - This filesystem supports “bigalloc”, which means that file extents are + tracked in units of clusters (of blocks) instead of blocks + (RO\_COMPAT\_BIGALLOC). + * - 0x400 + - This filesystem supports metadata checksumming. + (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though + GDT\_CSUM must not be set) + * - 0x800 + - Filesystem supports replicas. This feature is neither in the kernel nor + e2fsprogs.
(RO\_COMPAT\_REPLICA) + * - 0x1000 + - Read-only filesystem image; the kernel will not mount this image + read-write and most tools will refuse to write to the image. + (RO\_COMPAT\_READONLY) + * - 0x2000 + - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) + +.. _super_def_hash: + +The ``s_def_hash_version`` field is one of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0x0 + - Legacy. + * - 0x1 + - Half MD4. + * - 0x2 + - Tea. + * - 0x3 + - Legacy, unsigned. + * - 0x4 + - Half MD4, unsigned. + * - 0x5 + - Tea, unsigned. + +.. _super_mountopts: + +The ``s_default_mount_opts`` field is any combination of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) + * - 0x0002 + - New files take the gid of the containing directory (instead of the fsgid + of the current process). (EXT4\_DEFM\_BSDGROUPS) + * - 0x0004 + - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) + * - 0x0008 + - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) + * - 0x0010 + - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) + * - 0x0020 + - All data and metadata are committed to the journal. + (EXT4\_DEFM\_JMODE\_DATA) + * - 0x0040 + - All data are flushed to the disk before metadata are committed to the + journal. (EXT4\_DEFM\_JMODE\_ORDERED) + * - 0x0060 + - Data ordering is not preserved; data may be written after the metadata + has been written. (EXT4\_DEFM\_JMODE\_WBACK) + * - 0x0100 + - Disable write flushes. (EXT4\_DEFM\_NOBARRIER) + * - 0x0200 + - Track which blocks in a filesystem are metadata and therefore should not + be used as data blocks. This option will be enabled by default on 3.18, + hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) + * - 0x0400 + - Enable DISCARD support, where the storage device is told about blocks + becoming unused.
(EXT4\_DEFM\_DISCARD) + * - 0x0800 + - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) + +.. _super_flags: + +The ``s_flags`` field is any combination of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0x0001 + - Signed directory hash in use. + * - 0x0002 + - Unsigned directory hash in use. + * - 0x0004 + - To test development code. + +.. _super_encrypt: + +The ``s_encrypt_algos`` list can contain any of the following: + +.. list-table:: + :widths: 8 72 + :header-rows: 1 + + * - Value + - Description + * - 0 + - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). + * - 1 + - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). + * - 2 + - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). + * - 3 + - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). + +Total size of the superblock is 1024 bytes. -- cgit v1.2.3 From 6fd941784b8ac3e74313f7112f0586076dc36544 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 6 Oct 2018 22:40:34 -0400 Subject: ext4: cache NULL when both default_acl and acl are NULL default_acl and acl of newly created inode will be initiated as ACL_NOT_CACHED in vfs function inode_init_always() and later will be updated by calling xxx_init_acl() in specific filesystems. However, when default_acl and acl are NULL then they keep the value of ACL_NOT_CACHED. This patch changes the code to cache NULL for acl / default_acl in this case to save unnecessary ACL lookup attempt. 
Signed-off-by: Chengguang Xu Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/acl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fb50f9aa6ead..c1d570ee1d9f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } -- cgit v1.2.3 From 33458eaba4dfe778a426df6a19b7aad2ff9f7eec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 12 Oct 2018 09:28:09 -0400 Subject: ext4: fix use-after-free race in ext4_remount()'s error path It's possible for ext4_show_quota_options() to try reading s_qf_names[i] while it is being modified by ext4_remount() --- most notably, in ext4_remount's error path when the original values of the quota file name gets restored. 
Reported-by: syzbot+a2872d6feea6918008a9@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.2+ --- fs/ext4/ext4.h | 3 ++- fs/ext4/super.c | 73 +++++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 86e1bacac757..12f90d48ba61 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1405,7 +1405,8 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index faf293ed8060..a221f1cdf704 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb) for (type = 0; type < EXT4_MAXQUOTAS; type++) ext4_quota_off(sb, type); } + +/* + * This is a helper function which is used in the mount/remount + * codepaths (which holds s_umount) to fetch the quota file name. 
+ */ +static inline char *get_qf_name(struct super_block *sb, + struct ext4_sb_info *sbi, + int type) +{ + return rcu_dereference_protected(sbi->s_qf_names[type], + lockdep_is_held(&sb->s_umount)); +} #else static inline void ext4_quota_off_umount(struct super_block *sb) { @@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb) percpu_free_rwsem(&sbi->s_journal_flag_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list @@ -1531,11 +1543,10 @@ static const char deprecated_msg[] = static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname; + char *qname, *old_qname = get_qf_name(sb, sbi, qtype); int ret = -1; - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -1552,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return -1; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (old_qname) { + if (strcmp(old_qname, qname) == 0) ret = 1; else ext4_msg(sb, KERN_ERR, @@ -1566,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + rcu_assign_pointer(sbi->s_qf_names[qtype], qname); set_opt(sb, QUOTA); return 1; errout: @@ -1578,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype) { struct ext4_sb_info *sbi = EXT4_SB(sb); + char *old_qname = get_qf_name(sb, sbi, qtype); - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled quota 
options" " when quota turned on"); return -1; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); + synchronize_rcu(); + kfree(old_qname); return 1; } #endif @@ -1961,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; + char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -1992,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb, "Cannot enable project quota enforcement."); return 0; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); + grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); + if (usr_qf_name || grp_qf_name) { + if (test_opt(sb, USRQUOTA) && usr_qf_name) clear_opt(sb, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && grp_qf_name) clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { @@ -2030,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); + char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; @@ -2048,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + rcu_read_lock(); + usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); + grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); + if (usr_qf_name) + seq_show_option(seq, "usrjquota", usr_qf_name); + if (grp_qf_name) + seq_show_option(seq, "grpjquota", grp_qf_name); + 
rcu_read_unlock(); #endif } @@ -5104,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int err = 0; #ifdef CONFIG_QUOTA int i, j; + char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); @@ -5123,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { - old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); + char *qf_name = get_qf_name(sb, sbi, i); + + old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); @@ -5353,9 +5373,12 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + to_free[i] = get_qf_name(sb, sbi, i); + rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } + synchronize_rcu(); + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(to_free[i]); #endif kfree(orig_data); return err; @@ -5546,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), EXT4_SB(sb)->s_jquota_fmt, type); } -- cgit v1.2.3