/* * fs/logfs/logfs_abi.h * * As should be obvious for Linux kernel code, license is GPLv2 * * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> * * Public header for logfs. */ #ifndef FS_LOGFS_LOGFS_ABI_H #define FS_LOGFS_LOGFS_ABI_H /* For out-of-kernel compiles */ #ifndef BUILD_BUG_ON #define BUILD_BUG_ON(condition) /**/ #endif #define SIZE_CHECK(type, size) \ static inline void check_##type(void) \ { \ BUILD_BUG_ON(sizeof(struct type) != (size)); \ } /* * Throughout the logfs code, we're constantly dealing with blocks at * various positions or offsets. To remove confusion, we stricly * distinguish between a "position" - the logical position within a * file and an "offset" - the physical location within the device. * * Any usage of the term offset for a logical location or position for * a physical one is a bug and should get fixed. */ /* * Block are allocated in one of several segments depending on their * level. The following levels are used: * 0 - regular data block * 1 - i1 indirect blocks * 2 - i2 indirect blocks * 3 - i3 indirect blocks * 4 - i4 indirect blocks * 5 - i5 indirect blocks * 6 - ifile data blocks * 7 - ifile i1 indirect blocks * 8 - ifile i2 indirect blocks * 9 - ifile i3 indirect blocks * 10 - ifile i4 indirect blocks * 11 - ifile i5 indirect blocks * Potential levels to be used in the future: * 12 - gc recycled blocks, long-lived data * 13 - replacement blocks, short-lived data * * Levels 1-11 are necessary for robust gc operations and help separate * short-lived metadata from longer-lived file data. In the future, * file data should get separated into several segments based on simple * heuristics. Old data recycled during gc operation is expected to be * long-lived. New data is of uncertain life expectancy. New data * used to replace older blocks in existing files is expected to be * short-lived. */ /* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ #define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull #define LOGFS_MAGIC_U32 0xc97e8168u /* * Various blocksize related macros. Blocksize is currently fixed at 4KiB. * Sooner or later that should become configurable and the macros replaced * by something superblock-dependent. Pointers in indirect blocks are and * will remain 64bit. * * LOGFS_BLOCKSIZE - self-explaining * LOGFS_BLOCK_FACTOR - number of pointers per indirect block * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts */ #define LOGFS_BLOCKSIZE (4096ull) #define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) #define LOGFS_BLOCK_BITS (9) /* * Number of blocks at various levels of indirection. There are 16 direct * block pointers plus a single indirect pointer. */ #define I0_BLOCKS (16) #define I1_BLOCKS LOGFS_BLOCK_FACTOR #define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) #define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) #define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) #define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) #define INDIRECT_INDEX I0_BLOCKS #define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) /* * Sizes at which files require another level of indirection. Files smaller * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, * similar like ext2 fast symlinks. * * Data at a position smaller than LOGFS_I0_SIZE is accessed through the * direct pointers, else through the 1x indirect pointer and so forth. */ #define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) #define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) /* * Each indirect block pointer must have this flag set, if all block pointers * behind it are set, i.e. there is no hole hidden in the shadow of this * indirect block pointer. */ #define LOGFS_FULLY_POPULATED (1ULL << 63) #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) /* * LogFS needs to separate data into levels. Each level is defined as the * maximal possible distance from the master inode (inode of the inode file). * Data blocks reside on level 0, 1x indirect block on level 1, etc. * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. * This effort is necessary to guarantee garbage collection to always make * progress. * * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is * the maximal number of levels for one file. * LOGFS_NO_AREAS is twice that, as the inode file and regular files are * effectively stacked on top of each other. */ #define LOGFS_MAX_INDIRECT (5) #define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) #define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) /* Maximum size of filenames */ #define LOGFS_MAX_NAMELEN (255) /* Number of segments in the primary journal. */ #define LOGFS_JOURNAL_SEGS (16) /* Maximum number of free/erased/etc. segments in journal entries */ #define MAX_CACHED_SEGS (64) /* * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including * its header, * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for * its segment header and the padded space at the end when no further objects * fit. */ #define LOGFS_OBJECT_HEADERSIZE (0x1c) #define LOGFS_SEGMENT_HEADERSIZE (0x18) #define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) #define LOGFS_SEGMENT_RESERVE \ (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) /* * Segment types: * SEG_SUPER - Data or indirect block * SEG_JOURNAL - Inode * SEG_OSTORE - Dentry */ enum { SEG_SUPER = 0x01, SEG_JOURNAL = 0x02, SEG_OSTORE = 0x03, }; /** * struct logfs_segment_header - per-segment header in the ostore * * @crc: crc32 of header (there is no data) * @pad: unused, must be 0 * @type: segment type, see above * @level: GC level for all objects in this segment * @segno: segment number * @ec: erase count for this segment * @gec: global erase count at time of writing */ struct logfs_segment_header { __be32 crc; __be16 pad; __u8 type; __u8 level; __be32 segno; __be32 ec; __be64 gec; }; SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); #define LOGFS_FEATURES_INCOMPAT (0ull) #define LOGFS_FEATURES_RO_COMPAT (0ull) #define LOGFS_FEATURES_COMPAT (0ull) /** * struct logfs_disk_super - on-medium superblock * * @ds_magic: magic number, must equal LOGFS_MAGIC * @ds_crc: crc32 of structure starting with the next field * @ds_ifile_levels: maximum number of levels for ifile * @ds_iblock_levels: maximum number of levels for regular files * @ds_data_levels: number of separate levels for data * @pad0: reserved, must be 0 * @ds_feature_incompat: incompatible filesystem features * @ds_feature_ro_compat: read-only compatible filesystem features * @ds_feature_compat: compatible filesystem features * @ds_flags: flags * @ds_segment_shift: log2 of segment size * @ds_block_shift: log2 of block size * @ds_write_shift: log2 of write size * @pad1: reserved, must be 0 * @ds_journal_seg: segments used by primary journal * @ds_root_reserve: bytes reserved for the superuser * @ds_speed_reserve: bytes reserved to speed up GC * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks * @pad2: reserved, must be 0 * @pad3: reserved, must be 0 * * Contains only read-only fields. Read-write fields like the amount of used * space is tracked in the dynamic superblock, which is stored in the journal. */ struct logfs_disk_super { struct logfs_segment_header ds_sh; __be64 ds_magic; __be32 ds_crc; __u8 ds_ifile_levels; __u8 ds_iblock_levels; __u8 ds_data_levels; __u8 ds_segment_shift; __u8 ds_block_shift; __u8 ds_write_shift; __u8 pad0[6]; __be64 ds_filesystem_size; __be32 ds_segment_size; __be32 ds_bad_seg_reserve; __be64 ds_feature_incompat; __be64 ds_feature_ro_compat; __be64 ds_feature_compat; __be64 ds_feature_flags; __be64 ds_root_reserve; __be64 ds_speed_reserve; __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; __be64 ds_super_ofs[2]; __be64 pad3[8]; }; SIZE_CHECK(logfs_disk_super, 256); /* * Object types: * OBJ_BLOCK - Data or indirect block * OBJ_INODE - Inode * OBJ_DENTRY - Dentry */ enum { OBJ_BLOCK = 0x04, OBJ_INODE = 0x05, OBJ_DENTRY = 0x06, }; /** * struct logfs_object_header - per-object header in the ostore * * @crc: crc32 of header, excluding data_crc * @len: length of data * @type: object type, see above * @compr: compression type * @ino: inode number * @bix: block index * @data_crc: crc32 of payload */ struct logfs_object_header { __be32 crc; __be16 len; __u8 type; __u8 compr; __be64 ino; __be64 bix; __be32 data_crc; } __attribute__((packed)); SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); /* * Reserved inode numbers: * LOGFS_INO_MASTER - master inode (for inode file) * LOGFS_INO_ROOT - root directory * LOGFS_INO_SEGFILE - per-segment used bytes and erase count */ enum { LOGFS_INO_MAPPING = 0x00, LOGFS_INO_MASTER = 0x01, LOGFS_INO_ROOT = 0x02, LOGFS_INO_SEGFILE = 0x03, LOGFS_RESERVED_INOS = 0x10, }; /* * Inode flags. High bits should never be written to the medium. They are * reserved for in-memory usage. * Low bits should either remain in sync with the corresponding FS_*_FL or * reuse slots that obviously don't make sense for logfs. * * LOGFS_IF_DIRTY Inode must be written back * LOGFS_IF_ZOMBIE Inode has been deleted * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode */ #define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ #define LOGFS_IF_DIRTY 0x20000000 #define LOGFS_IF_ZOMBIE 0x40000000 #define LOGFS_IF_STILLBORN 0x80000000 /* Flags available to chattr */ #define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) #define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) /* Flags inherited from parent directory on file/directory creation */ #define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) /** * struct logfs_disk_inode - on-medium inode * * @di_mode: file mode * @di_pad: reserved, must be 0 * @di_flags: inode flags, see above * @di_uid: user id * @di_gid: group id * @di_ctime: change time * @di_mtime: modify time * @di_refcount: reference count (aka nlink or link count) * @di_generation: inode generation, for nfs * @di_used_bytes: number of bytes used * @di_size: file size * @di_data: data pointers */ struct logfs_disk_inode { __be16 di_mode; __u8 di_height; __u8 di_pad; __be32 di_flags; __be32 di_uid; __be32 di_gid; __be64 di_ctime; __be64 di_mtime; __be64 di_atime; __be32 di_refcount; __be32 di_generation; __be64 di_used_bytes; __be64 di_size; __be64 di_data[LOGFS_EMBEDDED_FIELDS]; }; SIZE_CHECK(logfs_disk_inode, 200); #define INODE_POINTER_OFS \ (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) #define INODE_USED_OFS \ (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) #define INODE_SIZE_OFS \ (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) #define INODE_HEIGHT_OFS (0) /** * struct logfs_disk_dentry - on-medium dentry structure * * @ino: inode number * @namelen: length of file name * @type: file type, identical to bits 12..15 of mode * @name: file name */ /* FIXME: add 6 bytes of padding to remove the __packed */ struct logfs_disk_dentry { __be64 ino; __be16 namelen; __u8 type; __u8 name[LOGFS_MAX_NAMELEN]; } __attribute__((packed)); SIZE_CHECK(logfs_disk_dentry, 266); #define RESERVED 0xffffffff #define BADSEG 0xffffffff /** * struct logfs_segment_entry - segment file entry * * @ec_level: erase count and level * @valid: number of valid bytes * * Segment file contains one entry for every segment. ec_level contains the * erasecount in the upper 28 bits and the level in the lower 4 bits. An * ec_level of BADSEG (-1) identifies bad segments. valid contains the number * of valid bytes or RESERVED (-1 again) if the segment is used for either the * superblock or the journal, or when the segment is bad. */ struct logfs_segment_entry { __be32 ec_level; __be32 valid; }; SIZE_CHECK(logfs_segment_entry, 8); /** * struct logfs_journal_header - header for journal entries (JEs) * * @h_crc: crc32 of journal entry * @h_len: length of compressed journal entry, * not including header * @h_datalen: length of uncompressed data * @h_type: JE type * @h_compr: compression type * @h_pad: reserved */ struct logfs_journal_header { __be32 h_crc; __be16 h_len; __be16 h_datalen; __be16 h_type; __u8 h_compr; __u8 h_pad[5]; }; SIZE_CHECK(logfs_journal_header, 16); /* * Life expectency of data. * VIM_DEFAULT - default vim * VIM_SEGFILE - for segment file only - very short-living * VIM_GC - GC'd data - likely long-living */ enum logfs_vim { VIM_DEFAULT = 0, VIM_SEGFILE = 1, }; /** * struct logfs_je_area - wbuf header * * @segno: segment number of area * @used_bytes: number of bytes already used * @gc_level: GC level * @vim: life expectancy of data * * "Areas" are segments currently being used for writing. There is at least * one area per GC level. Several may be used to separate long-living from * short-living data. If an area with unknown vim is encountered, it can * simply be closed. * The write buffer immediately follow this header. */ struct logfs_je_area { __be32 segno; __be32 used_bytes; __u8 gc_level; __u8 vim; } __attribute__((packed)); SIZE_CHECK(logfs_je_area, 10); #define MAX_JOURNAL_HEADER \ (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area)) /** * struct logfs_je_dynsb - dynamic superblock * * @ds_gec: global erase count * @ds_sweeper: current position of GC "sweeper" * @ds_rename_dir: source directory ino (see dir.c documentation) * @ds_rename_pos: position of source dd (see dir.c documentation) * @ds_victim_ino: victims of incomplete dir operation (see dir.c) * @ds_victim_ino: parent inode of victim (see dir.c) * @ds_used_bytes: number of used bytes */ struct logfs_je_dynsb { __be64 ds_gec; __be64 ds_sweeper; __be64 ds_rename_dir; __be64 ds_rename_pos; __be64 ds_victim_ino; __be64 ds_victim_parent; /* XXX */ __be64 ds_used_bytes; __be32 ds_generation; __be32 pad; }; SIZE_CHECK(logfs_je_dynsb, 64); /** * struct logfs_je_anchor - anchor of filesystem tree, aka master inode * * @da_size: size of inode file * @da_last_ino: last created inode * @da_used_bytes: number of bytes used * @da_data: data pointers */ struct logfs_je_anchor { __be64 da_size; __be64 da_last_ino; __be64 da_used_bytes; u8 da_height; u8 pad[7]; __be64 da_data[LOGFS_EMBEDDED_FIELDS]; }; SIZE_CHECK(logfs_je_anchor, 168); /** * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal) * * @so_segment: segments used for 2nd journal * * Length of the array is given by h_len field in the header. */ struct logfs_je_spillout { __be64 so_segment[0]; }; SIZE_CHECK(logfs_je_spillout, 0); /** * struct logfs_je_journal_ec - erase counts for all journal segments * * @ec: erase count * * Length of the array is given by h_len field in the header. */ struct logfs_je_journal_ec { __be32 ec[0]; }; SIZE_CHECK(logfs_je_journal_ec, 0); /** * struct logfs_je_free_segments - list of free segmetns with erase count */ struct logfs_je_free_segments { __be32 segno; __be32 ec; }; SIZE_CHECK(logfs_je_free_segments, 8); /** * struct logfs_seg_alias - list of segment aliases */ struct logfs_seg_alias { __be32 old_segno; __be32 new_segno; }; SIZE_CHECK(logfs_seg_alias, 8); /** * struct logfs_obj_alias - list of object aliases */ struct logfs_obj_alias { __be64 ino; __be64 bix; __be64 val; u8 level; u8 pad[5]; __be16 child_no; }; SIZE_CHECK(logfs_obj_alias, 32); /** * Compression types. * * COMPR_NONE - uncompressed * COMPR_ZLIB - compressed with zlib */ enum { COMPR_NONE = 0, COMPR_ZLIB = 1, }; /* * Journal entries come in groups of 16. First group contains unique * entries, next groups contain one entry per level * * JE_FIRST - smallest possible journal entry number * * JEG_BASE - base group, containing unique entries * JE_COMMIT - commit entry, validates all previous entries * JE_DYNSB - dynamic superblock, anything that ought to be in the * superblock but cannot because it is read-write data * JE_ANCHOR - anchor aka master inode aka inode file's inode * JE_ERASECOUNT erasecounts for all journal segments * JE_SPILLOUT - unused * JE_SEG_ALIAS - aliases segments * JE_AREA - area description * * JE_LAST - largest possible journal entry number */ enum { JE_FIRST = 0x01, JEG_BASE = 0x00, JE_COMMIT = 0x02, JE_DYNSB = 0x03, JE_ANCHOR = 0x04, JE_ERASECOUNT = 0x05, JE_SPILLOUT = 0x06, JE_OBJ_ALIAS = 0x0d, JE_AREA = 0x0e, JE_LAST = 0x0e, }; #endif