diff options
Diffstat (limited to 'fs')
56 files changed, 19432 insertions, 28 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 86b203fc3c56..9f7270f36b2a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -175,9 +175,34 @@ source "fs/qnx4/Kconfig" source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" - source "fs/exofs/Kconfig" +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. + endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 70b2aed87133..af6d04700d9c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_9P_FS) += 9p/ obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 49f189423063..7ad36506c256 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen) struct net_device *dev; int ret = -ENODEV; - if (maclen != ETH_ALEN) - BUG(); + BUG_ON(maclen != ETH_ALEN); rtnl_lock(); dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); diff --git a/fs/befs/super.c b/fs/befs/super.c index 41f2b4d0093e..ca40f828f64d 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <asm/page.h> /* for PAGE_SIZE */ #include "befs.h" #include "super.h" diff --git a/fs/buffer.c b/fs/buffer.c index 6e35762b6169..13edf7ad3ff1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1596,6 +1596,16 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this + * causes the writes to be flagged as synchronous writes, but the + * block device queue will NOT be unplugged, since usually many pages + * will be pushed to the out before the higher-level caller actually + * waits for the writes to be completed. The various wait functions, + * such as wait_on_writeback_range() will ultimately call sync_page() + * which will ultimately call blk_run_backing_dev(), which will end up + * unplugging the device queue. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc) @@ -1606,7 +1616,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 466a332e0bd1..fcfa24361856 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page, if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */ - return block_write_full_page(page, NULL, wbc); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } } - page_bufs = page_buffers(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3523b895eb4b..5a97bcfe03e5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -516,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; ret = nfs_updatepage(filp, page, 0, pagelen); - if (ret == 0) - ret = pagelen; out_unlock: unlock_page(page); if (ret) diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile new file mode 100644 index 000000000000..df3e62c1ddc5 --- /dev/null +++ b/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o gcdat.o diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c new file mode 100644 index 000000000000..d69e6ae59251 --- /dev/null +++ b/fs/nilfs2/alloc.c @@ -0,0 +1,504 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/bitops.h> +#include "mdt.h" +#include "alloc.h" + + +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, bhp); +} + +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp); +} + +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp); +} + +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +static unsigned char * +nilfs_palloc_block_get_bitmap(const struct inode *inode, + const struct buffer_head *bh, void *kaddr) +{ + return (unsigned char *)(kaddr + bh_offset(bh)); +} + +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) /* size in bits */ +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + pos = nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry numer %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64 first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i += n) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++, n++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(desc_bh); + nilfs_mdt_mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h new file mode 100644 index 000000000000..4ace5475c2c7 --- /dev/null +++ b/fs/nilfs2/alloc.h @@ -0,0 +1,72 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +/** + * nilfs_palloc_req - persistent alloctor request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit ext2_find_next_zero_bit + +#endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c new file mode 100644 index 000000000000..24638e059bf3 --- /dev/null +++ b/fs/nilfs2/bmap.c @@ -0,0 +1,783 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "bmap.h" +#include "sb.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + __u64 ptr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) + goto out; + if (bmap->b_pops->bpop_translate != NULL) { + ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); + if (ret < 0) + goto out; + *ptrp = ptr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + + +/** + * nilfs_bmap_lookup - find a record + * @bmap: bmap + * @key: key + * @recp: pointer to record + * + * Description: nilfs_bmap_lookup() finds a record whose key matches @key in + * @bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @recp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_lookup(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long *recp) +{ + __u64 ptr; + int ret; + + /* XXX: use macro for level 1 */ + ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); + if (recp != NULL) + *recp = ptr; + return ret; +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n, + NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exist. + */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n, + NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +{ + __u64 lastkey; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (!ret) + *key = lastkey; + up_read(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. + */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_lookup_dirty_buffers - + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bhp: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * create buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_test_and_clear() is the atomic operation to test and + * clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. + */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 0); +} + +void nilfs_bmap_put_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + brelse(bh); +} + +int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + int ret; + + ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 1); + if (ret < 0) + return ret; + set_buffer_nilfs_volatile(*bhp); + return 0; +} + +void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + nilfs_btnode_delete(bh); +} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; + pbh = pbh->b_this_page, key++); + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + sector_t blocknr) +{ + nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req, + blocknr); +} + +static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0); +} + +static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); +} + +static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, + sector_t blocknr) +{ + return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); +} + +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); +} + +int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + int ret; + + ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq); + if (ret < 0) + return ret; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq); + if (ret < 0) + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + + return ret; +} + +void nilfs_bmap_commit_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq); +} + +void nilfs_bmap_abort_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq); +} + +static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr); + if (ret < 0) + return ret; + if (ptrp != NULL) + *ptrp = blocknr; + return 0; +} + +static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* do nothing */ +} + +static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + bmap->b_last_allocated_ptr--; +} + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_v, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = { + .bpop_prepare_alloc_ptr = NULL, + .bpop_commit_alloc_ptr = NULL, + .bpop_abort_alloc_ptr = NULL, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_p; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + default: + bmap->b_pops = &nilfs_bmap_ptr_ops_v; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap, + NILFS_BMAP_LARGE_LOW, + NILFS_BMAP_LARGE_HIGH) : + nilfs_direct_init(bmap, + NILFS_BMAP_SMALL_LOW, + NILFS_BMAP_SMALL_HIGH); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. + */ +void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + down_write(&bmap->b_sem); + memcpy(raw_inode->i_bmap, bmap->b_u.u_data, + NILFS_INODE_BMAP_SIZE * sizeof(__le64)); + if (bmap->b_inode->i_ino == NILFS_DAT_INO) + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + + up_write(&bmap->b_sem); +} + +void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) +{ + memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + bmap->b_pops = &nilfs_bmap_ptr_ops_gc; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + bmap->b_state = 0; + nilfs_btree_init_gc(bmap); +} + +void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) +{ + memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); + init_rwsem(&gcbmap->b_sem); + gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; +} + +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) +{ + memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; +} diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h new file mode 100644 index 000000000000..4f2708abb1ba --- /dev/null +++ b/fs/nilfs2/bmap.h @@ -0,0 +1,244 @@ +/* + * bmap.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BMAP_H +#define _NILFS_BMAP_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "alloc.h" + +#define NILFS_BMAP_INVALID_PTR 0 + +#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey) +#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key) +#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr) +#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr) + +#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + /* The following functions are internal use only. */ + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +/** + * struct nilfs_bmap_ptr_operations - bmap ptr operation table + */ +struct nilfs_bmap_ptr_operations { + int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + sector_t); + void (*bpop_abort_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + + int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_pops: bmap ptr operation table + * @b_low: low watermark of conversion + * @b_high: high watermark of conversion + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_state: state + */ +struct nilfs_bmap { + union { + __u8 u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + const struct nilfs_bmap_ptr_operations *b_pops; + __u64 b_low; + __u64 b_high; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_state; +}; + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); +int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); +int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); +int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); +void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); + + +/* + * Internal use only + */ + +int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); + + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + +int nilfs_bmap_prepare_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); + + +int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *); +int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h new file mode 100644 index 000000000000..d41509bff47b --- /dev/null +++ b/fs/nilfs2/bmap_union.h @@ -0,0 +1,42 @@ +/* + * bmap_union.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BMAP_UNION_H +#define _NILFS_BMAP_UNION_H + +#include "bmap.h" +#include "direct.h" +#include "btree.h" + +/** + * nilfs_bmap_union - + * @bi_bmap: bmap structure + * @bi_btree: direct map structure + * @bi_direct: B-tree structure + */ +union nilfs_bmap_union { + struct nilfs_bmap bi_bmap; + struct nilfs_direct bi_direct; + struct nilfs_btree bi_btree; +}; + +#endif /* _NILFS_BMAP_UNION_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c new file mode 100644 index 000000000000..4cc07b2c30e0 --- /dev/null +++ b/fs/nilfs2/btnode.c @@ -0,0 +1,316 @@ +/* + * btnode.c - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * This file was originally written by Seiji Kihara <kihara@osrg.net> + * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for + * stabilization and simplification. + * + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include <linux/backing-dev.h> +#include "nilfs.h" +#include "mdt.h" +#include "dat.h" +#include "page.h" +#include "btnode.h" + + +void nilfs_btnode_cache_init_once(struct address_space *btnc) +{ + INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); + spin_lock_init(&btnc->tree_lock); + INIT_LIST_HEAD(&btnc->private_list); + spin_lock_init(&btnc->private_lock); + + spin_lock_init(&btnc->i_mmap_lock); + INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap); + INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); +} + +static struct address_space_operations def_btnode_aops; + +void nilfs_btnode_cache_init(struct address_space *btnc) +{ + btnc->host = NULL; /* can safely set to host inode ? */ + btnc->flags = 0; + mapping_set_gfp_mask(btnc, GFP_NOFS); + btnc->assoc_mapping = NULL; + btnc->backing_dev_info = &default_backing_dev_info; + btnc->a_ops = &def_btnode_aops; +} + +void nilfs_btnode_cache_clear(struct address_space *btnc) +{ + invalidate_mapping_pages(btnc, 0, -1); + truncate_inode_pages(btnc, 0); +} + +int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, struct buffer_head **pbh, + int newblk) +{ + struct buffer_head *bh; + struct inode *inode = NILFS_BTNC_I(btnc); + int err; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return -ENOMEM; + + err = -EEXIST; /* internal code */ + if (newblk) { + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + goto found; + } + + if (buffer_uptodate(bh) || buffer_dirty(bh)) + goto found; + + if (pblocknr == 0) { + pblocknr = blocknr; + if (inode->i_ino != NILFS_DAT_INO) { + struct inode *dat = + nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* blocknr is a virtual block number */ + err = nilfs_dat_translate(dat, blocknr, &pblocknr); + if (unlikely(err)) { + brelse(bh); + goto out_locked; + } + } + } + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + err = -EEXIST; /* internal code */ + goto found; + } + set_buffer_mapped(bh); + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = pblocknr; /* set block address for read */ + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + bh->b_blocknr = blocknr; /* set back to the given block address */ + err = 0; +found: + *pbh = bh; + +out_locked: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, struct buffer_head **pbh, int newblk) +{ + struct buffer_head *bh; + int err; + + err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk); + if (err == -EEXIST) /* internal code (cache hit) */ + return 0; + if (unlikely(err)) + return err; + + bh = *pbh; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + return 0; +} + +/** + * nilfs_btnode_delete - delete B-tree node buffer + * @bh: buffer to be deleted + * + * nilfs_btnode_delete() invalidates the specified buffer and delete the page + * including the buffer if the page gets unbusy. + */ +void nilfs_btnode_delete(struct buffer_head *bh) +{ + struct address_space *mapping; + struct page *page = bh->b_page; + pgoff_t index = page_index(page); + int still_dirty; + + page_cache_get(page); + lock_page(page); + wait_on_page_writeback(page); + + nilfs_forget_buffer(bh); + still_dirty = PageDirty(page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if (!still_dirty && mapping) + invalidate_inode_pages2_range(mapping, index, index); +} + +/** + * nilfs_btnode_prepare_change_key + * prepare to move contents of the block for old key to one of new key. + * the old buffer will not be removed, but might be reused for new buffer. + * it might return -ENOMEM because of memory allocation errors, + * and might return -EIO because of disk read errors. + */ +int nilfs_btnode_prepare_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh, *nbh; + struct inode *inode = NILFS_BTNC_I(btnc); + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + int err; + + if (oldkey == newkey) + return 0; + + obh = ctxt->bh; + ctxt->newbh = NULL; + + if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + lock_page(obh->b_page); + /* + * We cannot call radix_tree_preload for the kernels older + * than 2.6.23, because it is not exported for modules. + */ + err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (err) + goto failed_unlock; + /* BUG_ON(oldkey != obh->b_page->index); */ + if (unlikely(oldkey != obh->b_page->index)) + NILFS_PAGE_BUG(obh->b_page, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + +retry: + spin_lock_irq(&btnc->tree_lock); + err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); + spin_unlock_irq(&btnc->tree_lock); + /* + * Note: page->index will not change to newkey until + * nilfs_btnode_commit_change_key() will be called. + * To protect the page in intermediate state, the page lock + * is held. + */ + radix_tree_preload_end(); + if (!err) + return 0; + else if (err != -EEXIST) + goto failed_unlock; + + err = invalidate_inode_pages2_range(btnc, newkey, newkey); + if (!err) + goto retry; + /* fallback to copy mode */ + unlock_page(obh->b_page); + } + + err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1); + if (likely(!err)) { + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + } + return err; + + failed_unlock: + unlock_page(obh->b_page); + return err; +} + +/** + * nilfs_btnode_commit_change_key + * commit the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_commit_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + struct page *opage; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + opage = obh->b_page; + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) + BUG(); + + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, oldkey); + radix_tree_tag_set(&btnc->page_tree, newkey, + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&btnc->tree_lock); + + opage->index = obh->b_blocknr = newkey; + unlock_page(opage); + } else { + nilfs_copy_buffer(nbh, obh); + nilfs_btnode_mark_dirty(nbh); + + nbh->b_blocknr = newkey; + ctxt->bh = nbh; + nilfs_btnode_delete(obh); /* will decrement bh->b_count */ + } +} + +/** + * nilfs_btnode_abort_change_key + * abort the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h new file mode 100644 index 000000000000..35faa86444a7 --- /dev/null +++ b/fs/nilfs2/btnode.h @@ -0,0 +1,58 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/backing-dev.h> + + +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_init_once(struct address_space *); +void nilfs_btnode_cache_init(struct address_space *); +void nilfs_btnode_cache_clear(struct address_space *); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +int nilfs_btnode_get(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh) + + +#endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c new file mode 100644 index 000000000000..6b37a2767293 --- /dev/null +++ b/fs/nilfs2/btree.c @@ -0,0 +1,2269 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "btnode.h" +#include "btree.h" +#include "alloc.h" + +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; + +/* + * B-tree path operations + */ + +static struct kmem_cache *nilfs_btree_path_cache; + +int __init nilfs_btree_path_cache_init(void) +{ + nilfs_btree_path_cache = + kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * + NILFS_BTREE_LEVEL_MAX, 0, 0, NULL); + return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM; +} + +void nilfs_btree_path_cache_destroy(void) +{ + kmem_cache_destroy(nilfs_btree_path_cache); +} + +static inline struct nilfs_btree_path * +nilfs_btree_alloc_path(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_path *) + kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); +} + +static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + kmem_cache_free(nilfs_btree_path_cache, path); +} + +static void nilfs_btree_init_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_DATA; + level < NILFS_BTREE_LEVEL_MAX; + level++) { + path[level].bp_bh = NULL; + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + +static void nilfs_btree_clear_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_DATA; + level < NILFS_BTREE_LEVEL_MAX; + level++) { + if (path[level].bp_bh != NULL) { + nilfs_bmap_put_block(&btree->bt_bmap, + path[level].bp_bh); + path[level].bp_bh = NULL; + } + /* sib_bh is released or deleted by prepare or commit + * operations. */ + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + + +/* + * B-tree node operations + */ + +static inline int +nilfs_btree_node_get_flags(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static inline void +nilfs_btree_node_set_flags(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags) +{ + node->bn_flags = flags; +} + +static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; +} + +static inline int +nilfs_btree_node_get_level(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static inline void +nilfs_btree_node_set_level(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int level) +{ + node->bn_level = level; +} + +static inline int +nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static inline void +nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static inline int +nilfs_btree_node_size(const struct nilfs_btree *btree) +{ + return 1 << btree->bt_bmap.b_inode->i_blkbits; +} + +static inline int +nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MIN : + NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); +} + +static inline int +nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MAX : + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); +} + +static inline __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(btree, node) ? + 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static inline __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + + nilfs_btree_node_nchildren_max(btree, node)); +} + +static inline __u64 +nilfs_btree_node_get_key(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, int index) +{ + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_key(struct nilfs_btree *btree, + struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(btree, node) + index) = + nilfs_bmap_key_to_dkey(key); +} + +static inline __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + int index) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_ptr(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int index, + __u64 ptr) +{ + *(nilfs_btree_node_dptrs(btree, node) + index) = + nilfs_bmap_ptr_to_dptr(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags, int level, int nchildren, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + nilfs_btree_node_set_flags(btree, node, flags); + nilfs_btree_node_set_level(btree, node, level); + nilfs_btree_node_set_nchildren(btree, node, nchildren); + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nchildren; i++) { + dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); + dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); + } +} + +/* Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Assume that the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_right(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_insert(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 key, __u64 ptr, int index) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = nilfs_bmap_key_to_dkey(key); + dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +/* Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_delete(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 *keyp, __u64 *ptrp, int index) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + key = nilfs_bmap_dkey_to_key(dkeys[index]); + ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search */ + low = 0; + high = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(btree, node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index */ + if (nilfs_btree_node_get_level(btree, node) > + NILFS_BTREE_LEVEL_NODE_MIN) { + if ((s > 0) && (index > 0)) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} + +static inline int nilfs_btree_height(const struct nilfs_btree *btree) +{ + return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) + + 1; +} + +static inline struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (level == nilfs_btree_height(btree) - 1) ? + nilfs_btree_get_root(btree) : + nilfs_btree_get_nonroot_node(btree, path, level); +} + +static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int level, index, found, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(btree, node); + if ((level < minlevel) || + (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + return -ENOENT; + + found = nilfs_btree_node_lookup(btree, node, key, &index); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level >= minlevel; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + if (!found) + found = nilfs_btree_node_lookup(btree, node, key, + &index); + else + index = 0; + if (index < nilfs_btree_node_nchildren_max(btree, node)) + ptr = nilfs_btree_node_get_ptr(btree, node, index); + else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(btree, node); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level > 0; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(btree, node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); + + if (ptrp != NULL) + *ptrp = ptr; + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static void nilfs_btree_promote_key(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + lock_buffer(path[level].bp_bh); + nilfs_btree_node_set_key( + btree, + nilfs_btree_get_nonroot_node( + btree, path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root */ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(btree, + nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} + +static void nilfs_btree_do_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key( + btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + } +} + +static void nilfs_btree_carry_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + move = 0; + + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_carry_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + move = 0; + + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + path[level + 1].bp_index++; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +static void nilfs_btree_split(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + __u64 newkey; + __u64 newptr; + int nchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + move = 0; + + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + newkey = nilfs_btree_node_get_key(btree, right, 0); + newptr = path[level].bp_newreq.bpr_ptr; + + if (move) { + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + nilfs_btree_node_insert(btree, right, *keyp, *ptrp, + path[level].bp_index); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +static void nilfs_btree_grow(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + lock_buffer(path[level].bp_sib_bh); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, root); + + nilfs_btree_node_move_right(btree, root, child, n); + nilfs_btree_node_set_level(btree, root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_sib_bh); + + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} + +static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index - 1); + } + + /* parent */ + level = NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + } + + return NILFS_BMAP_INVALID_PTR; +} + +static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return nilfs_bmap_find_target_in_group(&btree->bt_bmap); +} + +static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, + __u64 ptr) +{ + btree->bt_bmap.b_last_allocated_key = key; + btree->bt_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block */ + if (btree->bt_ops->btop_find_target != NULL) + path[level].bp_newreq.bpr_ptr = + btree->bt_ops->btop_find_target(btree, path, key); + + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_data; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + /* left sibling */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* right sibling */ + if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* split */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + lock_buffer(bh); + nilfs_btree_node_init(btree, + (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, &bh); + if (ret < 0) + goto err_out_curr_node; + + lock_buffer(bh); + nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + + } + + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, ptr); + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) { + btree->bt_bmap.b_pops->bpop_commit_alloc_ptr( + &btree->bt_bmap, &path[level - 1].bp_newreq); + } + path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} + +static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static void nilfs_btree_do_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + } +} + +static void nilfs_btree_borrow_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} + +static void nilfs_btree_borrow_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} + +static void nilfs_btree_concat_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, node); + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); +} + +static void nilfs_btree_concat_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, right); + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} + +static void nilfs_btree_shrink(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(btree, path, level); + + nilfs_btree_node_delete(btree, root, NULL, NULL, 0); + nilfs_btree_node_set_level(btree, root, level); + n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_move_left(btree, root, child, n); + unlock_buffer(path[level].bp_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = NULL; +} + + +static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + ret = 0; + stats->bs_nblocks = 0; + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + + if (nilfs_btree_node_get_nchildren(btree, node) > + nilfs_btree_node_nchildren_min(btree, node)) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + } + + goto out; + + } + } + + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + + /* success */ + out: + *levelp = level; + return ret; + + /* error */ + err_out_curr_node: + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} + +static void nilfs_btree_commit_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_commit_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} + +static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) + +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret < 0) + goto out; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + +out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return (maxkey == key) && (nextmaxkey < bmap->b_low); +} + +static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, i, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nitems; i++) { + keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); + ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); + } + + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return nitems; +} + +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + int ret; + + btree = (struct nilfs_btree *)bmap; + stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (btree->bt_ops->btop_find_target != NULL) + dreq->bpr_ptr + = btree->bt_ops->btop_find_target(btree, NULL, key); + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq); + if (ret < 0) + goto err_out_dreq; + + ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq); + err_out_dreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq); + stats->bs_nblocks = 0; + return ret; + +} + +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 tmpptr; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* ptr must be a pointer to a buffer head. */ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + btree = (struct nilfs_btree *)bmap; + nilfs_btree_init(bmap, low, high); + if (nreq != NULL) { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) { + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq); + } + + /* create child node at level 1 */ + lock_buffer(bh); + node = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + unlock_buffer(bh); + nilfs_bmap_put_block(bmap, bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 2, 1, &keys[0], &tmpptr); + } else { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + } + + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - + * @bmap: + * @key: + * @ptr: + * @keys: + * @ptrs: + * @n: + * @low: + * @high: + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << bmap->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, + low, high, di, ni, bh); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + return 0; +} + +static int nilfs_btree_propagate_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + return 0; +} + +static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + return ret; + } + } + + return 0; +} + +static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + + nilfs_bmap_commit_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + + if (buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1); + nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr); +} + +static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); +} + +static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int *maxlevelp) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level); + return ret; +} + +static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int maxlevel, + struct buffer_head *bh) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree, path, level); +} + +static int nilfs_btree_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + int maxlevel, ret; + struct nilfs_btree_node *parent; + __u64 ptr; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} + +static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = btree->bt_ops->btop_propagate(btree, path, level, bh); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); +} + +static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(btree, cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} + +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice(&lists[level], listp->prev); +} + +static int nilfs_btree_assign_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(btree, parent, + path[level + 1].bp_index, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = level; + + return 0; +} + +static int nilfs_btree_assign_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + req.bpr_ptr = ptr; + ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap, + &req); + if (ret < 0) + return ret; + btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap, + &req, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_btree_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = btree->bt_ops->btop_assign(btree, path, level, bh, + blocknr, binfo); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 key; + int ret; + + btree = (struct nilfs_btree *)bmap; + ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + } else + key = nilfs_bmap_data_get_key(bmap, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + nilfs_bmap_put_block(&btree->bt_bmap, bh); + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; + +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_last_key = NULL, + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; + +static const struct nilfs_btree_operations nilfs_btree_ops_v = { + .btop_find_target = nilfs_btree_find_target_v, + .btop_set_target = nilfs_btree_set_target_v, + .btop_propagate = nilfs_btree_propagate_v, + .btop_assign = nilfs_btree_assign_v, +}; + +static const struct nilfs_btree_operations nilfs_btree_ops_p = { + .btop_find_target = NULL, + .btop_set_target = NULL, + .btop_propagate = nilfs_btree_propagate_p, + .btop_assign = nilfs_btree_assign_p, +}; + +int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_btree *btree; + + btree = (struct nilfs_btree *)bmap; + bmap->b_ops = &nilfs_btree_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + btree->bt_ops = &nilfs_btree_ops_p; + break; + default: + btree->bt_ops = &nilfs_btree_ops_v; + break; + } + + return 0; +} + +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_low = NILFS_BMAP_LARGE_LOW; + bmap->b_high = NILFS_BMAP_LARGE_HIGH; + bmap->b_ops = &nilfs_btree_ops_gc; +} diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h new file mode 100644 index 000000000000..4766deb52fb1 --- /dev/null +++ b/fs/nilfs2/btree.h @@ -0,0 +1,117 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/list.h> +#include <linux/nilfs2_fs.h> +#include "btnode.h" +#include "bmap.h" + +struct nilfs_btree; +struct nilfs_btree_path; + +/** + * struct nilfs_btree_operations - B-tree operation table + */ +struct nilfs_btree_operations { + __u64 (*btop_find_target)(const struct nilfs_btree *, + const struct nilfs_btree_path *, __u64); + void (*btop_set_target)(struct nilfs_btree *, __u64, __u64); + + struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *); + + int (*btop_propagate)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head *); + int (*btop_assign)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head **, + sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_btree_node - B-tree node + * @bn_flags: flags + * @bn_level: level + * @bn_nchildren: number of children + * @bn_pad: padding + */ +struct nilfs_btree_node { + __u8 bn_flags; + __u8 bn_level; + __le16 bn_nchildren; + __le32 bn_pad; +}; + +/* flags */ +#define NILFS_BTREE_NODE_ROOT 0x01 + +/* level */ +#define NILFS_BTREE_LEVEL_DATA 0 +#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) +#define NILFS_BTREE_LEVEL_MAX 14 + +/** + * struct nilfs_btree - B-tree structure + * @bt_bmap: bmap base structure + * @bt_ops: B-tree operation table + */ +struct nilfs_btree { + struct nilfs_bmap bt_bmap; + + /* B-tree-specific members */ + const struct nilfs_btree_operations *bt_ops; +}; + + +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ + NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + + +int nilfs_btree_path_cache_init(void); +void nilfs_btree_path_cache_destroy(void); +int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, + int, __u64, __u64); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +#endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c new file mode 100644 index 000000000000..e90b60dfced9 --- /dev/null +++ b/fs/nilfs2/cpfile.c @@ -0,0 +1,925 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "cpfile.h" + + +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* block number from the beginning of the file */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* offset in block */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} + +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} + +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} + +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) - n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} + +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} + +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} + +static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} + +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * @bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero. + * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + nilfs_mdt_mark_buffer_dirty(cp_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } + + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint numer + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + /* cannot delete the latest checkpoint */ + if (start == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + WARN_ON(nilfs_checkpoint_snapshot(cp)); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno) && + (count = nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps)) == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR "%s: cannot delete block\n", + __func__); + goto out_sem; + } + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr, KM_USER0); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); + ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; cno < cur_cno && n < nci; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + continue; /* skip hole */ + } + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) + nilfs_cpfile_checkpoint_to_cpinfo( + cpfile, cp, &ci[n++]); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + } + + ret = n; + if (n > 0) + *cnop = ci[n - 1].ci_cno + 1; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]); + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + *cnop = curr; + ret = n; + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - + * @cpfile: + * @cno: + * @ci: + * @nci: + */ + +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + struct nilfs_cpinfo *ci, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci); + case NILFS_SNAPSHOT: + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - + * @cpfile: + * @cno: + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + int ret; + + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + + /* cannot delete the latest checkpoint nor snapshots */ + ret = nilfs_cpinfo_snapshot(&ci); + if (ret < 0) + return ret; + else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr, KM_USER0); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(curr_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + + list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + kaddr = kmap_atomic(next_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(next_bh); + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @status: mode of checkpoint + * + * Description: nilfs_change_cpmode() changes the mode of the checkpoint + * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + struct the_nilfs *nilfs; + int ret; + + nilfs = NILFS_MDT(cpfile)->mi_nilfs; + + switch (mode) { + case NILFS_CHECKPOINT: + /* + * Check for protecting existing snapshot mounts: + * bd_mount_sem is used to make this operation atomic and + * exclusive with a new mount job. Though it doesn't cover + * umount, it's enough for the purpose. + */ + down(&nilfs->ns_bdev->bd_mount_sem); + if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { + /* Current implementation does not have to protect + plain read-only mounts since they are exclusive + with a read/write mount and are protected from the + cleaner. */ + ret = -EBUSY; + } else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + up(&nilfs->ns_bdev->bd_mount_sem); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @stat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h new file mode 100644 index 000000000000..1a8a1008c342 --- /dev/null +++ b/fs/nilfs2/cpfile.h @@ -0,0 +1,45 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> + +#define NILFS_CPFILE_GFP NILFS_MDT_GFP + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, + struct nilfs_cpinfo *, size_t); + +#endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c new file mode 100644 index 000000000000..bb8a5818e7f1 --- /dev/null +++ b/fs/nilfs2/dat.c @@ -0,0 +1,430 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) + return ret; + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + nilfs_palloc_abort_free_entry(dat, req); + return ret; + } + return 0; +} + +void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + if (entry->de_blocknr != cpu_to_le64(0) || + entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) { + printk(KERN_CRIT + "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n", + __func__, (unsigned long long)req->pr_entry_nr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end), + (unsigned long long)le64_to_cpu(entry->de_blocknr)); + } + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +/** + * nilfs_dat_mark_dirty - + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned. On error, one of the following + * nagative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return -EINVAL; + } + WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed by @blocknrp. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist. + */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + if (blocknrp != NULL) + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + /* last virtual block number in this block */ + first = vinfo[i].vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo[j].vi_vblocknr >= first && + vinfo[j].vi_vblocknr <= last; + j++, n++) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo[j].vi_vblocknr, entry_bh, kaddr); + vinfo[j].vi_start = le64_to_cpu(entry->de_start); + vinfo[j].vi_end = le64_to_cpu(entry->de_end); + vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + } + + return nvi; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h new file mode 100644 index 000000000000..d9560654a4b7 --- /dev/null +++ b/fs/nilfs2/dat.h @@ -0,0 +1,52 @@ +/* + * dat.h - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +#define NILFS_DAT_GFP NILFS_MDT_GFP + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t); + +#endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c new file mode 100644 index 000000000000..54100acc1102 --- /dev/null +++ b/fs/nilfs2/dir.c @@ -0,0 +1,711 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net> + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk_uninterruptible(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, + AOP_FLAG_UNINTERRUPTIBLE, &page, + NULL, nilfs_get_block); +} + +static int nilfs_prepare_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, 0, &page, + NULL, nilfs_get_block); +} + +static int nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) { + i_size_write(dir, pos + copied); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + unlock_page(page); + return err; +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode)); +fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_cache_page(mapping, n, + (filler_t *)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + unsigned char *types = NULL; + int ret; + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + goto success; + + types = nilfs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + ret = -EIO; + goto done; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + ret = -EIO; + nilfs_put_page(page); + goto done; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < NILFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + le64_to_cpu(de->inode), d_type); + if (over) { + nilfs_put_page(page); + goto success; + } + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + nilfs_put_page(page); + } + +success: + ret = 0; +done: + return ret; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, struct dentry *dentry, + struct page **res_page) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block cout %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, dentry, &page); + if (de) { + res = le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + le16_to_cpu(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); +} + +/* + * Parent is locked. + */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, page->mapping, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. Releases the page. + */ +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + char *kaddr = page_address(page); + unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + struct nilfs_dir_entry *pde = NULL; + struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + int err; + + while ((char *)de < (char *)dir) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out; + } + pde = de; + de = nilfs_next_entry(de); + } + if (pde) + from = (char *)pde - (char *)page_address(page); + lock_page(page); + err = nilfs_prepare_chunk(page, mapping, from, to); + BUG_ON(err); + if (pde) + pde->rec_len = cpu_to_le16(to - from); + dir->inode = 0; + err = nilfs_commit_chunk(page, mapping, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(inode); +out: + nilfs_put_page(page); + return err; +} + +/* + * Set the first fragment of directory. + */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, mapping, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. */ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c new file mode 100644 index 000000000000..c6379e482781 --- /dev/null +++ b/fs/nilfs2/direct.c @@ -0,0 +1,436 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/errno.h> +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_direct *direct; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + (level != 1) || /* XXX: use macro for level 1 */ + ((ptr = nilfs_direct_get_ptr(direct, key)) == + NILFS_BMAP_INVALID_PTR)) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + return 0; +} + +static __u64 +nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(&direct->d_bmap); +} + +static void nilfs_direct_set_target_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + direct->d_bmap.b_last_allocated_key = key; + direct->d_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, + __u64 key, + union nilfs_bmap_ptr_req *req, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_ops->dop_find_target != NULL) + req->bpr_ptr = direct->d_ops->dop_find_target(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap, + req); + if (ret < 0) + return ret; + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_insert(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, __u64 ptr) +{ + struct buffer_head *bh; + + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_alloc_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, req->bpr_ptr); + + if (!nilfs_bmap_dirty(&direct->d_bmap)) + nilfs_bmap_set_dirty(&direct->d_bmap); + + if (direct->d_ops->dop_set_target != NULL) + direct->d_ops->dop_set_target(direct, key, req->bpr_ptr); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_insert(direct, &req, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + req->bpr_ptr = nilfs_direct_get_ptr(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr( + &direct->d_bmap, req); + if (ret < 0) + return ret; + } + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key) +{ + if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_end_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); +} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_delete(direct, &req, key); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_direct *direct; + __u64 key, lastkey; + + direct = (struct nilfs_direct *)bmap; + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + int n; + + direct = (struct nilfs_direct *)bmap; + if (nitems > NILFS_DIRECT_NBLOCKS) + nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + direct = (struct nilfs_direct *)bmap; + dptrs = nilfs_direct_dptrs(direct); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? + nilfs_bmap_ptr_to_dptr(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap, low, high); + + return 0; +} + +static int nilfs_direct_propagate_v(struct nilfs_direct *direct, + struct buffer_head *bh) +{ + union nilfs_bmap_ptr_req oldreq, newreq; + __u64 key; + __u64 ptr; + int ret; + + key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + ptr = nilfs_direct_get_ptr(direct, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.bpr_ptr = ptr; + newreq.bpr_ptr = ptr; + ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq, + &newreq); + if (ret < 0) + return ret; + nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + } else + ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + + return ret; +} + +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + return (direct->d_ops->dop_propagate != NULL) ? + direct->d_ops->dop_propagate(direct, bh) : + 0; +} + +static int nilfs_direct_assign_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( + &direct->d_bmap, &req); + if (ret < 0) + return ret; + direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap, + &req, blocknr); + + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_direct_assign_p(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(direct, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return direct->d_ops->dop_assign(direct, key, ptr, bh, + blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + .bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +static const struct nilfs_direct_operations nilfs_direct_ops_v = { + .dop_find_target = nilfs_direct_find_target_v, + .dop_set_target = nilfs_direct_set_target_v, + .dop_propagate = nilfs_direct_propagate_v, + .dop_assign = nilfs_direct_assign_v, +}; + +static const struct nilfs_direct_operations nilfs_direct_ops_p = { + .dop_find_target = NULL, + .dop_set_target = NULL, + .dop_propagate = NULL, + .dop_assign = nilfs_direct_assign_p, +}; + +int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + bmap->b_ops = &nilfs_direct_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + direct->d_ops = &nilfs_direct_ops_p; + break; + default: + direct->d_ops = &nilfs_direct_ops_v; + break; + } + + return 0; +} diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h new file mode 100644 index 000000000000..45d2c5cda812 --- /dev/null +++ b/fs/nilfs2/direct.h @@ -0,0 +1,78 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "bmap.h" + + +struct nilfs_direct; + +/** + * struct nilfs_direct_operations - direct mapping operation table + */ +struct nilfs_direct_operations { + __u64 (*dop_find_target)(const struct nilfs_direct *, __u64); + void (*dop_set_target)(struct nilfs_direct *, __u64, __u64); + int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *); + int (*dop_assign)(struct nilfs_direct *, __u64, __u64, + struct buffer_head **, sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** + * struct nilfs_direct - direct mapping + * @d_bmap: bmap structure + * @d_ops: direct mapping operation table + */ +struct nilfs_direct { + struct nilfs_bmap d_bmap; + + /* direct-mapping-specific members */ + const struct nilfs_direct_operations *d_ops; +}; + + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int, __u64, __u64); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c new file mode 100644 index 000000000000..6bd84a0d8238 --- /dev/null +++ b/fs/nilfs2/file.c @@ -0,0 +1,160 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. + */ + struct inode *inode = dentry->d_inode; + int err; + + if (!nilfs_inode_dirty(inode)) + return 0; + + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, + LLONG_MAX); + else + err = nilfs_construct_segment(inode->i_sb); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct nilfs_transaction_info ti; + int ret; + + if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) { + unlock_page(page); + goto mapped; + } + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + unlock_page(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + return VM_FAULT_SIGBUS; + + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); + if (unlikely(ret)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); + + mapped: + SetPageChecked(page); + wait_on_page_writeback(page); + return 0; +} + +struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &nilfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. + */ +struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +struct inode_operations nilfs_file_inode_operations = { + .truncate = nilfs_truncate, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +/* end of file */ diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c new file mode 100644 index 000000000000..93383c5cee90 --- /dev/null +++ b/fs/nilfs2/gcdat.c @@ -0,0 +1,84 @@ +/* + * gcdat.c - NILFS shadow DAT inode for GC + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. + * + */ + +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + +int nilfs_init_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + int err; + + gcdat->i_state = 0; + gcdat->i_blocks = dat->i_blocks; + gii->i_flags = dii->i_flags; + gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT); + gii->i_cno = 0; + nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap); + err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping); + if (unlikely(err)) + return err; + + return nilfs_copy_dirty_pages(&gii->i_btnode_cache, + &dii->i_btnode_cache); +} + +void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + struct address_space *mapping = dat->i_mapping; + struct address_space *gmapping = gcdat->i_mapping; + + down_write(&NILFS_MDT(dat)->mi_sem); + dat->i_blocks = gcdat->i_blocks; + dii->i_flags = gii->i_flags; + dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT); + + nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap); + + nilfs_clear_dirty_pages(mapping); + nilfs_copy_back_pages(mapping, gmapping); + /* note: mdt dirty flags should be cleared by segctor. */ + + nilfs_clear_dirty_pages(&dii->i_btnode_cache); + nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache); + + up_write(&NILFS_MDT(dat)->mi_sem); +} + +void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *gii = NILFS_I(gcdat); + + gcdat->i_state = I_CLEAR; + gii->i_flags = 0; + + truncate_inode_pages(gcdat->i_mapping, 0); + truncate_inode_pages(&gii->i_btnode_cache, 0); +} diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c new file mode 100644 index 000000000000..19d2102b6a69 --- /dev/null +++ b/fs/nilfs2/gcinode.c @@ -0,0 +1,288 @@ +/* + * gcinode.c - dummy inodes to buffer blocks for garbage collection + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * + */ +/* + * This file adds the cache of on-disk blocks to be moved in garbage + * collection. The disk blocks are held with dummy inodes (called + * gcinodes), and this file provides lookup function of the dummy + * inodes and their buffer read function. + * + * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it + * has to treat blocks that belong to a same file but have different + * checkpoint numbers. To avoid interference among generations, dummy + * inodes are managed separatly from actual inodes, and their lookup + * function (nilfs_gc_iget) is designed to be specified with a + * checkpoint number argument as well as an inode number. + * + * Buffers and pages held by the dummy inodes will be released each + * time after they are copied to a new log. Dirty blocks made on the + * current generation and the blocks to be moved by GC never overlap + * because the dirty blocks make a new generation; they rather must be + * written individually. + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" +#include "dat.h" +#include "ifile.h" + +static struct address_space_operations def_gcinode_aops = {}; +/* XXX need def_gcinode_iops/fops? */ + +/* + * nilfs_gccache_submit_read_data() - add data buffer and submit read request + * @inode - gc inode + * @blkoff - dummy offset treated as the key for the page cache + * @pbn - physical block number of the block + * @vbn - virtual block number of the block, 0 for non-virtual block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_data() registers the data buffer + * specified by @pbn to the GC pagecache with the key @blkoff. + * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. + */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat; + /* use original dat, not gc dat. */ + err = nilfs_dat_translate(dat_inode, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, out_bh, 0); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh)) + nilfs_btnode_mark_dirty(bh); + else + nilfs_mdt_mark_buffer_dirty(bh); + return 0; +} + +/* + * nilfs_init_gccache() - allocate and initialize gc_inode hash table + * @nilfs - the_nilfs + * + * Return Value: On success, 0. + * On error, a negative error code is returned. + */ +int nilfs_init_gccache(struct the_nilfs *nilfs) +{ + int loop; + + BUG_ON(nilfs->ns_gc_inodes_h); + + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + + nilfs->ns_gc_inodes_h = + kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE, + GFP_NOFS); + if (nilfs->ns_gc_inodes_h == NULL) + return -ENOMEM; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++) + INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]); + return 0; +} + +/* + * nilfs_destroy_gccache() - free gc_inode hash table + * @nilfs - the nilfs + */ +void nilfs_destroy_gccache(struct the_nilfs *nilfs) +{ + if (nilfs->ns_gc_inodes_h) { + nilfs_remove_all_gcinode(nilfs); + kfree(nilfs->ns_gc_inodes_h); + nilfs->ns_gc_inodes_h = NULL; + } +} + +static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, + __u64 cno) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS); + struct nilfs_inode_info *ii; + + if (!inode) + return NULL; + + inode->i_op = NULL; + inode->i_fop = NULL; + inode->i_mapping->a_ops = &def_gcinode_aops; + + ii = NILFS_I(inode); + ii->i_cno = cno; + ii->i_flags = 0; + ii->i_state = 1 << NILFS_I_GCINODE; + ii->i_bh = NULL; + nilfs_bmap_init_gc(ii->i_bmap); + + return inode; +} + +static unsigned long ihash(ino_t ino, __u64 cno) +{ + return hash_long((unsigned long)((ino << 2) + cno), + NILFS_GCINODE_HASH_BITS); +} + +/* + * nilfs_gc_iget() - find or create gc inode with specified (ino,cno) + */ +struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno); + struct hlist_node *node; + struct inode *inode; + + hlist_for_each_entry(inode, node, head, i_hash) { + if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno) + return inode; + } + + inode = alloc_gcinode(nilfs, ino, cno); + if (likely(inode)) { + hlist_add_head(&inode->i_hash, head); + list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); + } + return inode; +} + +/* + * nilfs_clear_gcinode() - clear and free a gc inode + */ +void nilfs_clear_gcinode(struct inode *inode) +{ + nilfs_mdt_clear(inode); + nilfs_mdt_destroy(inode); +} + +/* + * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs + */ +void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h; + struct hlist_node *node, *n; + struct inode *inode; + int loop; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { + hlist_for_each_entry_safe(inode, node, n, head, i_hash) { + hlist_del_init(&inode->i_hash); + list_del_init(&NILFS_I(inode)->i_dirty); + nilfs_clear_gcinode(inode); /* might sleep */ + } + } +} diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c new file mode 100644 index 000000000000..de86401f209f --- /dev/null +++ b/fs/nilfs2/ifile.c @@ -0,0 +1,150 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "ifile.h" + +/** + * nilfs_ifile_create_inode - create a new disk inode + * @ifile: ifile inode + * @out_ino: pointer to a variable to store inode number + * @out_bh: buffer_head contains newly allocated disk inode + * + * Return Value: On success, 0 is returned and the newly allocated inode + * number is stored in the place pointed by @ino, and buffer_head pointer + * that contains newly allocated disk inode structure is stored in the + * place pointed by @out_bh + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No inode left. + */ +int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, + struct buffer_head **out_bh) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = 0; /* 0 says find free inode from beginning of + a group. dull code!! */ + req.pr_entry_bh = NULL; + + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + nilfs_palloc_commit_alloc_entry(ifile, &req); + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + nilfs_mdt_mark_dirty(ifile); + *out_ino = (ino_t)req.pr_entry_nr; + *out_bh = req.pr_entry_bh; + return 0; +} + +/** + * nilfs_ifile_delete_inode - delete a disk inode + * @ifile: ifile inode + * @ino: inode number + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The inode number @ino have not been allocated. + */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", + (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) { + if (err == -EINVAL) + nilfs_error(sb, __func__, "ifile is broken"); + else + nilfs_warning(sb, __func__, + "unable to read inode: %lu", + (unsigned long) ino); + } + return err; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h new file mode 100644 index 000000000000..5d30a35679b5 --- /dev/null +++ b/fs/nilfs2/ifile.h @@ -0,0 +1,53 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "alloc.h" + +#define NILFS_IFILE_GFP NILFS_MDT_GFP + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +#endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c new file mode 100644 index 000000000000..49ab4a49bb4f --- /dev/null +++ b/fs/nilfs2/inode.c @@ -0,0 +1,785 @@ +/* + * inode.c - NILFS inode operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. + * Bulk read for direct-io is not supported yet. (should be supported) + */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + unsigned long blknum = 0; + int err = 0, ret; + struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* This exclusion control is a workaround; should be revised */ + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (ret == 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } else if (err == -EINVAL) { + nilfs_error(inode->i_sb, __func__, + "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. */ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty_buffers(page); + + if (ret) { + struct inode *inode = page->mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(sbi, inode, nr_dirty); + } + return ret; +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + *pagep = NULL; + err = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, nilfs_get_block); + if (unlikely(err)) + nilfs_transaction_abort(inode->i_sb); + return err; +} + +static int nilfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? : copied; +} + +static ssize_t +nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t size; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, nilfs_get_block, NULL); + return size; +} + +struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + /* .sync_page = nilfs_sync_page, */ + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, +}; + +struct inode *nilfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct inode *inode; + struct nilfs_inode_info *ii; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + + err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic_inc(&sbi->s_inodes_count); + + inode->i_uid = current_fsuid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + + inode->i_mode = mode; + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_bmap; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = NILFS_I(dir)->i_flags; + if (S_ISLNK(mode)) + ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL); + if (!S_ISDIR(mode)) + ii->i_flags &= ~NILFS_DIRSYNC_FL; + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; +#ifdef CONFIG_NILFS_FS_POSIX_ACL + ii->i_acl = NULL; + ii->i_default_acl = NULL; +#endif + ii->i_cno = 0; + nilfs_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_acl; /* never occur. When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + mark_inode_dirty(inode); + return inode; + + failed_acl: + failed_bmap: + inode->i_nlink = 0; + iput(inode); /* raw_inode will be deleted through + generic_delete_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + clear_inode(inode); + /* XXX: check error code? Is there any thing I can do? */ + (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino); + atomic_dec(&sbi->s_inodes_count); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC); + if (flags & NILFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & NILFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & NILFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; +#ifndef NILFS_ATIME_DISABLE + if (flags & NILFS_NOATIME_FL) +#endif + inode->i_flags |= S_NOATIME; + if (flags & NILFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); + inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) + return -EINVAL; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? + 0 : le32_to_cpu(raw_inode->i_dir_acl); +#endif + ii->i_cno = 0; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) { + err = nilfs_bmap_read(ii->i_bmap, raw_inode); + if (err < 0) + return err; + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + return 0; +} + +static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, + struct inode *inode) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct inode *dat = nilfs_dat_inode(sbi->s_nilfs); + struct buffer_head *bh; + struct nilfs_inode *raw_inode; + int err; + + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh); + if (unlikely(err)) + goto bad_inode; + + raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); + +#ifdef CONFIG_NILFS_FS_POSIX_ACL + ii->i_acl = NILFS_ACL_NOT_CACHED; + ii->i_default_acl = NILFS_ACL_NOT_CACHED; +#endif + if (nilfs_read_inode_common(inode, raw_inode)) + goto failed_unmap; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + } else { + inode->i_op = &nilfs_special_inode_operations; + init_special_inode( + inode, inode->i_mode, + new_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + } + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); + brelse(bh); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + nilfs_set_inode_flags(inode); + return 0; + + failed_unmap: + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); + brelse(bh); + + bad_inode: + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + return err; +} + +struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = __nilfs_read_inode(sb, ino, inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode, int has_bmap) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + raw_inode->i_uid = cpu_to_le32(inode->i_uid); + raw_inode->i_gid = cpu_to_le32(inode->i_gid); + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le64(inode->i_size); + raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); + + raw_inode->i_flags = cpu_to_le32(ii->i_flags); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + + if (has_bmap) + nilfs_bmap_write(ii->i_bmap, raw_inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(new_encode_dev(inode->i_rdev)); + /* When extending inode, nilfs->ns_inode_size should be checked + for substitutions of appended fields */ +} + +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +{ + ino_t ino = inode->i_ino; + struct nilfs_inode_info *ii = NILFS_I(inode); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_inode *raw_inode; + + raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); + + /* The buffer is guarded with lock_buffer() by the caller */ + if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); + set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + unsigned long b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + + failed: + if (ret == -EINVAL) + nilfs_error(ii->vfs_inode.i_sb, __func__, + "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); + else + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +void nilfs_delete_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (unlikely(is_bad_inode(inode))) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + nilfs_truncate_bmap(ii, 0); + nilfs_free_inode(inode); + /* nilfs_free_inode() marks inode buffer dirty */ + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. */ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + err = inode_setattr(inode, iattr); + if (!err && (iattr->ia_valid & ATTR_MODE)) + err = nilfs_acl_chmod(inode); + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, + struct buffer_head **pbh) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&sbi->s_inode_lock); + /* Caller of this function MUST lock s_inode_lock */ + if (ii->i_bh == NULL) { + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, + pbh); + if (unlikely(err)) + return err; + spin_lock(&sbi->s_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&sbi->s_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&sbi->s_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, + unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. */ + nilfs_warning(sbi->s_super, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&sbi->s_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_mark_inode_dirty(struct inode *inode) +{ + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(sbi, inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + lock_buffer(ibh); + nilfs_update_inode(inode, ibh); + unlock_buffer(ibh); + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c new file mode 100644 index 000000000000..108d281ebca5 --- /dev/null +++ b/fs/nilfs2/ioctl.c @@ -0,0 +1,654 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/wait.h> +#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ +#include <linux/capability.h> /* capable() */ +#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ +#include <linux/nilfs2_fs.h> +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + + +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? + argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cno, argp, sizeof(cno))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + nmembs); +} + +static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_cpinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs); +} + +static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_suinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs); +} + +static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_vinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + } + return nmembs; +} + +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + bh->b_private = vdesc; + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +static ssize_t +nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_gc_iget(nilfs, ino, cno); + if (unlikely(inode == NULL)) { + ret = -ENOMEM; + goto failed; + } + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) + goto failed; + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + if (ret == -EEXIST) { + vdesc = bh->b_private; + printk(KERN_CRIT + "%s: conflicting %s buffer: " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, + vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + } + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return ret; +} + +static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_move_blocks); +} + +static ssize_t +nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp, + int flags, void *buf, size_t size, + size_t nmembs) +{ + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_delete_checkpoints); +} + +static ssize_t +nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); + + return (ret < 0) ? ret : nmembs; +} + +static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_free_vblocknrs); +} + +static ssize_t +nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp, + int flags, void *buf, size_t size, + size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_mark_blocks_dirty); +} + +static ssize_t +nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs); + int ret; + + if (unlikely(!sbi)) + return -EROFS; + ret = nilfs_segctor_add_segments_to_be_freed( + NILFS_SC(sbi), buf, nmembs); + nilfs_put_writer(nilfs); + + return (ret < 0) ? ret : nmembs; +} + +static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_free_segments); +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + void __user *argp) +{ + struct nilfs_argv argv[5]; + const char *msg; + int dir, ret; + + if (copy_from_user(argv, argp, sizeof(argv))) + return -EFAULT; + + dir = _IOC_WRITE; + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir); + if (ret < 0) { + msg = "cannot read source blocks"; + goto failed; + } + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. + */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir); + if (ret < 0) { + /* + * can safely abort because this operation is atomic. + */ + msg = "cannot set segments to be freed"; + goto failed; + } + return 0; + + failed: + nilfs_remove_all_gcinode(nilfs); + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return nilfs_clean_segments(inode->i_sb, argp); +} + +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + if (argp != NULL) { + cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + void __user *argp = (void * __user *)arg; + + switch (cmd) { + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + /* XXX: rename to ??? */ + return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + default: + return -ENOTTY; + } +} diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c new file mode 100644 index 000000000000..47dd815433fd --- /dev/null +++ b/fs/nilfs2/mdt.c @@ -0,0 +1,563 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + +#define INIT_UNUSED_INODE_FIELDS + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr, KM_USER0); + + set_buffer_uptodate(bh); + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; + struct nilfs_sb_info *writer = NULL; + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + if (!sb) { + writer = nilfs_get_writer(nilfs); + if (!writer) { + err = -EROFS; + goto out; + } + sb = writer->s_super; + } + + nilfs_transaction_begin(sb, &ti, 0); + + err = -ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh) || buffer_mapped(bh)) + goto failed_bh; +#if 0 + /* The uptodate flag is not protected by the page lock, but + the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; +#endif + + bh->b_bdev = nilfs->ns_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + if (writer) + nilfs_put_writer(nilfs); + out: + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + unsigned long blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + if (!buffer_mapped(bh)) { /* unused buffer */ + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, + &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = blknum; + set_buffer_mapped(bh); + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; + int err; + + err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + if (err == -EEXIST) /* internal code */ + goto out; + + if (unlikely(err)) + goto failed; + + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; /* abort readahead if bmap lookup failed */ + + if (!buffer_locked(first_bh)) + goto out_no_wait; + } + + wait_on_buffer(first_bh); + + out_no_wait: + err = -EIO; + if (!buffer_uptodate(first_bh)) + goto failed_bh; + out: + *out_bh = first_bh; + return 0; + + failed_bh: + brelse(first_bh); + failed: + return err; +} + +/** + * nilfs_mdt_get_block - read or create a buffer on meta data file. + * @inode: inode of the meta data file + * @blkoff: block offset + * @create: create flag + * @init_block: initializer used for newly allocated block + * @out_bh: output of a pointer to the buffer_head + * + * nilfs_mdt_get_block() looks up the specified buffer and tries to create + * a new buffer if @create is not zero. On success, the returned buffer is + * assured to be either existing or formatted using a buffer lock on success. + * @out_bh is substituted only when zero is returned. + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + * + * %-EROFS - Read only filesystem (for create mode) + */ +int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **out_bh) +{ + int ret; + + /* Should be rewritten with merging nilfs_mdt_read_block() */ + retry: + ret = nilfs_mdt_read_block(inode, blkoff, out_bh); + if (!create || ret != -ENOENT) + return ret; + + ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); + if (unlikely(ret == -EEXIST)) { + /* create = 0; */ /* limit read-create loop retries */ + goto retry; + } + return ret; +} + +/** + * nilfs_mdt_delete_block - make a hole on the meta data file. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, zero is returned. + * On error, one of the following negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + */ +int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + err = nilfs_bmap_delete(ii->i_bmap, block); + if (likely(!err)) { + nilfs_mdt_mark_dirty(inode); + nilfs_mdt_forget_block(inode, block); + } + return err; +} + +/** + * nilfs_mdt_forget_block - discard dirty state and try to remove the page + * @inode: inode of the meta data file + * @block: block offset + * + * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and + * tries to release the page including the buffer from a page cache. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EBUSY - page has an active buffer. + * + * %-ENOENT - page cache has no page addressed by the offset. + */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, &bh); + if (unlikely(err)) + return err; + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = container_of(page->mapping, + struct inode, i_data); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *writer = NULL; + int err = 0; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (page->mapping->assoc_mapping) + return 0; /* Do not request flush for shadow page cache */ + if (!sb) { + writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); + if (!writer) + return -EROFS; + sb = writer->s_super; + } + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + if (writer) + nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + return err; +} + + +static struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static struct inode_operations def_mdt_iops; +static struct file_operations def_mdt_fops; + +/* + * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, + * ifile, or gcinodes. This allows the B-tree code and segment constructor + * to treat them like regular files, and this helps to simplify the + * implementation. + * On the other hand, some of the pseudo inodes have an irregular point: + * They don't have valid inode->i_sb pointer because their lifetimes are + * longer than those of the super block structs; they may continue for + * several consecutive mounts/umounts. This would need discussions. + */ +struct inode * +nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_alloc_inode(sb); + + if (!inode) + return NULL; + else { + struct address_space * const mapping = &inode->i_data; + struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS); + + if (!mi) { + nilfs_destroy_inode(inode); + return NULL; + } + mi->mi_nilfs = nilfs; + init_rwsem(&mi->mi_sem); + + inode->i_sb = sb; /* sb may be NULL for some meta data files */ + inode->i_blkbits = nilfs->ns_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_nlink = 1; + inode->i_ino = ino; + inode->i_mode = S_IFREG; + inode->i_private = mi; + +#ifdef INIT_UNUSED_INODE_FIELDS + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; +#ifdef CONFIG_QUOTA + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +#endif + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; +#ifdef CONFIG_SECURITY + inode->i_security = NULL; +#endif + inode->dirtied_when = 0; + + INIT_LIST_HEAD(&inode->i_list); + INIT_LIST_HEAD(&inode->i_sb_list); + inode->i_state = 0; +#endif + + spin_lock_init(&inode->i_lock); + mutex_init(&inode->i_mutex); + init_rwsem(&inode->i_alloc_sem); + + mapping->host = NULL; /* instead of inode */ + mapping->flags = 0; + mapping_set_gfp_mask(mapping, gfp_mask); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = nilfs->ns_bdi; + + inode->i_mapping = mapping; + } + + return inode; +} + +struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + + if (!inode) + return NULL; + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + return inode; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) +{ + shadow->i_mapping->assoc_mapping = orig->i_mapping; + NILFS_I(shadow)->i_btnode_cache.assoc_mapping = + &NILFS_I(orig)->i_btnode_cache; +} + +void nilfs_mdt_clear(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + invalidate_mapping_pages(inode->i_mapping, 0, -1); + truncate_inode_pages(inode->i_mapping, 0); + + nilfs_bmap_clear(ii->i_bmap); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +void nilfs_mdt_destroy(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + nilfs_destroy_inode(inode); +} diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h new file mode 100644 index 000000000000..df683e0bca6a --- /dev/null +++ b/fs/nilfs2/mdt.h @@ -0,0 +1,125 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include <linux/buffer_head.h> +#include <linux/blockgroup_lock.h> +#include "nilfs.h" +#include "page.h" + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_nilfs: back pointer to the_nilfs struct + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct the_nilfs *mi_nilfs; + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, + gfp_t); +struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, + ino_t, gfp_t); +void nilfs_mdt_destroy(struct inode *); +void nilfs_mdt_clear(struct inode *); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_set_shadow(struct inode *, struct inode *); + + +#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return NILFS_MDT(inode)->mi_nilfs->ns_cno; +} + +#define nilfs_mdt_bgl_lock(inode, bg) \ + (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) + + +static inline int +nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + return nilfs_read_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n)); +} + +static inline void +nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + nilfs_write_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n), 1); +} + +#endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c new file mode 100644 index 000000000000..df70dadb336f --- /dev/null +++ b/fs/nilfs2/namei.c @@ -0,0 +1,474 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/pagemap.h> +#include "nilfs.h" + + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. + */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = nilfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct dentry dotdot; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + + ino = nilfs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + inode = nilfs_iget(child->d_inode->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_obtain_alias(inode); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* nilfs_new_inode() and page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + if (inode->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + + err = nilfs_add_nondir(dentry, inode); + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (dir->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode_inc_link_count(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inode_inc_link_count(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = nilfs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct nilfs_dir_entry *de; + struct page *page; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOENT; + de = nilfs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + inode = dentry->d_inode; + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + inode_inc_link_count(old_inode); + nilfs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= NILFS_LINK_MAX) + goto out_dir; + } + inode_inc_link_count(old_inode); + err = nilfs_add_link(new_dentry, old_inode); + if (err) { + inode_dec_link_count(old_inode); + goto out_dir; + } + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + * inode_dec_link_count() will mark the inode dirty. + */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + inode_dec_link_count(old_inode); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h new file mode 100644 index 000000000000..7558c977db02 --- /dev/null +++ b/fs/nilfs2/nilfs.h @@ -0,0 +1,318 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net> + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include <linux/kernel.h> +#include <linux/buffer_head.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/nilfs2_fs.h> +#include "the_nilfs.h" +#include "sb.h" +#include "bmap.h" +#include "bmap_union.h" + +/* + * NILFS filesystem version + */ +#define NILFS_VERSION "2.0.5" + +/* + * nilfs inode data in memory + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + union nilfs_bmap_union i_bmap_union; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_NILFS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + struct buffer_head *i_bh; /* i_bh contains a new or dirty + disk inode */ + struct inode vfs_inode; +}; + +static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) +{ + return container_of(inode, struct nilfs_inode_info, vfs_inode); +} + +static inline struct nilfs_inode_info * +NILFS_BMAP_I(const struct nilfs_bmap *bmap) +{ + return container_of((union nilfs_bmap_union *)bmap, + struct nilfs_inode_info, + i_bmap_union); +} + +static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) +{ + struct nilfs_inode_info *ii = + container_of(btnc, struct nilfs_inode_info, i_btnode_cache); + return &ii->vfs_inode; +} + +static inline struct inode *NILFS_AS_I(struct address_space *mapping) +{ + return (mapping->host) ? : + container_of(mapping, struct inode, i_data); +} + +/* + * Dynamic state flags of NILFS on-memory inode (i_state) + */ +enum { + NILFS_I_NEW = 0, /* Inode is newly created */ + NILFS_I_DIRTY, /* The file is dirty */ + NILFS_I_QUEUED, /* inode is in dirty_files list */ + NILFS_I_BUSY, /* inode is grabbed by a segment + constructor */ + NILFS_I_COLLECTED, /* All dirty blocks are collected */ + NILFS_I_UPDATED, /* The file has been written back */ + NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_BMAP, /* has bmap and btnode_cache */ + NILFS_I_GCINODE, /* inode for GC, on memory only */ + NILFS_I_GCDAT, /* shadow DAT, on memory only */ +}; + +/* + * Macros to check inode numbers + */ +#define NILFS_MDT_INO_BITS \ + ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ + 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ + 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) + +#define NILFS_SYS_INO_BITS \ + ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) + +#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) + +#define NILFS_MDT_INODE(sb, ino) \ + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) +#define NILFS_VALID_INODE(sb, ino) \ + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + +/** + * struct nilfs_transaction_info: context information for synchronization + * @ti_magic: Magic number + * @ti_save: Backup of journal_info field of task_struct + * @ti_flags: Flags + * @ti_count: Nest level + * @ti_garbage: List of inode to be put when releasing semaphore + */ +struct nilfs_transaction_info { + u32 ti_magic; + void *ti_save; + /* This should never used. If this happens, + one of other filesystems has a bug. */ + unsigned short ti_flags; + unsigned short ti_count; + struct list_head ti_garbage; +}; + +/* ti_magic */ +#define NILFS_TI_MAGIC 0xd9e392fb + +/* ti_flags */ +#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ +#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the + end of transaction. */ +#define NILFS_TI_GC 0x0004 /* GC context */ +#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ +#define NILFS_TI_WRITER 0x0010 /* Constructor context */ + + +int nilfs_transaction_begin(struct super_block *, + struct nilfs_transaction_info *, int); +int nilfs_transaction_commit(struct super_block *); +void nilfs_transaction_abort(struct super_block *); + +static inline void nilfs_set_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= flag; +} + +static inline int nilfs_test_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) + return 0; + return !!(ti->ti_flags & flag); +} + +static inline int nilfs_doing_gc(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_GC); +} + +static inline int nilfs_doing_construction(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_WRITER); +} + +static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs) +{ + return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat; +} + +/* + * function prototype + */ +#ifdef CONFIG_NILFS_POSIX_ACL +#error "NILFS: not yet supported POSIX ACL" +extern int nilfs_permission(struct inode *, int, struct nameidata *); +extern int nilfs_acl_chmod(struct inode *); +extern int nilfs_init_acl(struct inode *, struct inode *); +#else +#define nilfs_permission NULL + +static inline int nilfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) +{ + inode->i_mode &= ~current_umask(); + return 0; +} +#endif + +#define NILFS_ATIME_DISABLE + +/* dir.c */ +extern int nilfs_add_link(struct dentry *, struct inode *); +extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); +extern int nilfs_make_empty(struct inode *, struct inode *); +extern struct nilfs_dir_entry * +nilfs_find_entry(struct inode *, struct dentry *, struct page **); +extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); +extern int nilfs_empty_dir(struct inode *); +extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); +extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct page *, struct inode *); + +/* file.c */ +extern int nilfs_sync_file(struct file *, struct dentry *, int); + +/* ioctl.c */ +long nilfs_ioctl(struct file *, unsigned int, unsigned long); +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *); + +/* inode.c */ +extern struct inode *nilfs_new_inode(struct inode *, int); +extern void nilfs_free_inode(struct inode *); +extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void nilfs_set_inode_flags(struct inode *); +extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); +extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +extern struct inode *nilfs_iget(struct super_block *, unsigned long); +extern void nilfs_update_inode(struct inode *, struct buffer_head *); +extern void nilfs_truncate(struct inode *); +extern void nilfs_delete_inode(struct inode *); +extern int nilfs_setattr(struct dentry *, struct iattr *); +extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, + struct buffer_head **); +extern int nilfs_inode_dirty(struct inode *); +extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, + unsigned); +extern int nilfs_mark_inode_dirty(struct inode *); +extern void nilfs_dirty_inode(struct inode *); + +/* namei.c */ +extern struct dentry *nilfs_get_parent(struct dentry *); + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern void nilfs_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void nilfs_warning(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_commit_super(struct nilfs_sb_info *, int); +extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); +extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gccache(struct the_nilfs *); +void nilfs_destroy_gccache(struct the_nilfs *); +void nilfs_clear_gcinode(struct inode *); +struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64); +void nilfs_remove_all_gcinode(struct the_nilfs *); + +/* gcdat.c */ +int nilfs_init_gcdat_inode(struct the_nilfs *); +void nilfs_commit_gcdat_inode(struct the_nilfs *); +void nilfs_clear_gcdat_inode(struct the_nilfs *); + +/* + * Inodes and files operations + */ +extern struct file_operations nilfs_dir_operations; +extern struct inode_operations nilfs_file_inode_operations; +extern struct file_operations nilfs_file_operations; +extern struct address_space_operations nilfs_aops; +extern struct inode_operations nilfs_dir_inode_operations; +extern struct inode_operations nilfs_special_inode_operations; +extern struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c new file mode 100644 index 000000000000..1bfbba9c0e9a --- /dev/null +++ b/fs/nilfs2/page.c @@ -0,0 +1,540 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. + */ + +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/bitops.h> +#include <linux/page-flags.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +/* + * Since the page cache of B-tree node pages or data page cache of pseudo + * inodes does not have a valid mapping->host pointer, calling + * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; + * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). + * To avoid this problem, the old style mark_buffer_dirty() is used instead. + */ +void nilfs_mark_buffer_dirty(struct buffer_head *bh) +{ + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) + __set_page_dirty_nobuffers(bh->b_page); +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page, *opage; + struct buffer_head *bh, *obh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) { + /* + * Shadow page cache uses assoc_mapping to point its original + * page cache. The following code tries the original cache + * if the given cache is a shadow and it didn't hit. + */ + opage = find_lock_page(mapping->assoc_mapping, index); + if (!opage) + return bh; + + obh = __nilfs_get_page_block(opage, blkoff, index, blkbits, + b_state); + if (buffer_uptodate(obh)) { + nilfs_copy_buffer(bh, obh); + if (buffer_dirty(obh)) { + nilfs_mark_buffer_dirty(bh); + if (!buffer_nilfs_node(bh) && NILFS_MDT(inode)) + nilfs_mdt_mark_dirty(inode); + } + } + brelse(obh); + unlock_page(opage); + page_cache_release(opage); + } + return bh; +} + +/** + * nilfs_forget_buffer - discard dirty state + * @inode: owner inode of the buffer + * @bh: buffer head of the buffer to be discarded + */ +void nilfs_forget_buffer(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + + lock_buffer(bh); + clear_buffer_nilfs_volatile(bh); + if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + bh->b_blocknr = -1; + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + unlock_buffer(bh); + brelse(bh); +} + +/** + * nilfs_copy_buffer -- copy buffer data and flags + * @dbh: destination buffer + * @sbh: source buffer + */ +void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) +{ + void *kaddr0, *kaddr1; + unsigned long bits; + struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct buffer_head *bh; + + kaddr0 = kmap_atomic(spage, KM_USER0); + kaddr1 = kmap_atomic(dpage, KM_USER1); + memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); + kunmap_atomic(kaddr1, KM_USER1); + kunmap_atomic(kaddr0, KM_USER0); + + dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + + bh = dbh; + bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + while ((bh = bh->b_this_page) != dbh) { + lock_buffer(bh); + bits &= bh->b_state; + unlock_buffer(bh); + } + if (bits & (1UL << BH_Uptodate)) + SetPageUptodate(dpage); + else + ClearPageUptodate(dpage); + if (bits & (1UL << BH_Mapped)) + SetPageMappedToDisk(dpage); + else + ClearPageMappedToDisk(dpage); +} + +/** + * nilfs_page_buffers_clean - check if a page has dirty buffers or not. + * @page: page to be checked + * + * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. + * Otherwise, it returns non-zero value. + */ +int nilfs_page_buffers_clean(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh)) + return 0; + bh = bh->b_this_page; + } while (bh != head); + return 1; +} + +void nilfs_page_bug(struct page *page) +{ + struct address_space *m; + unsigned long ino = 0; + + if (unlikely(!page)) { + printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + return; + } + + m = page->mapping; + if (m) { + struct inode *inode = NILFS_AS_I(m); + if (inode != NULL) + ino = inode->i_ino; + } + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + "mapping=%p ino=%lu\n", + page, atomic_read(&page->_count), + (unsigned long long)page->index, page->flags, m, ino); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int i = 0; + + bh = head = page_buffers(page); + do { + printk(KERN_CRIT + " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", + i++, bh, atomic_read(&bh->b_count), + (unsigned long long)bh->b_blocknr, bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + } +} + +/** + * nilfs_alloc_private_page - allocate a private page with buffer heads + * + * Return Value: On success, a pointer to the allocated page is returned. + * On error, NULL is returned. + */ +struct page *nilfs_alloc_private_page(struct block_device *bdev, int size, + unsigned long state) +{ + struct buffer_head *bh, *head, *tail; + struct page *page; + + page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */ + if (unlikely(!page)) + return NULL; + + lock_page(page); + head = alloc_page_buffers(page, size, 0); + if (unlikely(!head)) { + unlock_page(page); + __free_page(page); + return NULL; + } + + bh = head; + do { + bh->b_state = (1UL << BH_NILFS_Allocated) | state; + tail = bh; + bh->b_bdev = bdev; + bh = bh->b_this_page; + } while (bh); + + tail->b_this_page = head; + attach_page_buffers(page, head); + + return page; +} + +void nilfs_free_private_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(page->mapping); + + if (page_has_buffers(page) && !try_to_free_buffers(page)) + NILFS_PAGE_BUG(page, "failed to free page"); + + unlock_page(page); + __free_page(page); +} + +/** + * nilfs_copy_page -- copy the page with buffers + * @dst: destination page + * @src: source page + * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * + * This fuction is for both data pages and btnode pages. The dirty flag + * should be treated by caller. The page must not be under i/o. + * Both src and dst page must be locked + */ +static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +{ + struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + unsigned long mask = NILFS_BUFFER_INHERENT_BITS; + + BUG_ON(PageWriteback(dst)); + + sbh = sbufs = page_buffers(src); + if (!page_has_buffers(dst)) + create_empty_buffers(dst, sbh->b_size, 0); + + if (copy_dirty) + mask |= (1UL << BH_Dirty); + + dbh = dbufs = page_buffers(dst); + do { + lock_buffer(sbh); + lock_buffer(dbh); + dbh->b_state = sbh->b_state & mask; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); + + copy_highpage(dst, src); + + if (PageUptodate(src) && !PageUptodate(dst)) + SetPageUptodate(dst); + else if (!PageUptodate(src) && PageUptodate(dst)) + ClearPageUptodate(dst); + if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) + SetPageMappedToDisk(dst); + else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) + ClearPageMappedToDisk(dst); + + do { + unlock_buffer(sbh); + unlock_buffer(dbh); + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); +} + +int nilfs_copy_dirty_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + int err = 0; + + pagevec_init(&pvec, 0); +repeat: + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) + return 0; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + + lock_page(page); + if (unlikely(!PageDirty(page))) + NILFS_PAGE_BUG(page, "inconsistent dirty state"); + + dpage = grab_cache_page(dmap, page->index); + if (unlikely(!dpage)) { + /* No empty page is added to the page cache */ + err = -ENOMEM; + unlock_page(page); + break; + } + if (unlikely(!page_has_buffers(page))) + NILFS_PAGE_BUG(page, + "found empty page in dat page cache"); + + nilfs_copy_page(dpage, page, 1); + __set_page_dirty_nobuffers(dpage); + + unlock_page(dpage); + page_cache_release(dpage); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache + * @dmap: destination page cache + * @smap: source page cache + * + * No pages must no be added to the cache during this process. + * This must be ensured by the caller. + */ +void nilfs_copy_back_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i, n; + pgoff_t index = 0; + int err; + + pagevec_init(&pvec, 0); +repeat: + n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + if (!n) + return; + index = pvec.pages[n - 1]->index + 1; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + pgoff_t offset = page->index; + + lock_page(page); + dpage = find_lock_page(dmap, offset); + if (dpage) { + /* override existing page on the destination cache */ + WARN_ON(PageDirty(dpage)); + nilfs_copy_page(dpage, page, 0); + unlock_page(dpage); + page_cache_release(dpage); + } else { + struct page *page2; + + /* move the page to the destination cache */ + spin_lock_irq(&smap->tree_lock); + page2 = radix_tree_delete(&smap->page_tree, offset); + WARN_ON(page2 != page); + + smap->nrpages--; + spin_unlock_irq(&smap->tree_lock); + + spin_lock_irq(&dmap->tree_lock); + err = radix_tree_insert(&dmap->page_tree, offset, page); + if (unlikely(err < 0)) { + WARN_ON(err == -EEXIST); + page->mapping = NULL; + page_cache_release(page); /* for cache */ + } else { + page->mapping = dmap; + dmap->nrpages++; + if (PageDirty(page)) + radix_tree_tag_set(&dmap->page_tree, + offset, + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&dmap->tree_lock); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + goto repeat; +} + +void nilfs_clear_dirty_pages(struct address_space *mapping) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + bh = head = page_buffers(page); + do { + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + bh = bh->b_this_page; + } while (bh != head); + + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +unsigned nilfs_page_count_clean_buffers(struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + struct buffer_head *bh, *head; + unsigned nc = 0; + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + bh->b_size; + if (block_end > from && block_start < to && !buffer_dirty(bh)) + nc++; + } + return nc; +} + +/* + * NILFS2 needs clear_page_dirty() in the following two cases: + * + * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears + * page dirty flags when it copies back pages from the shadow cache + * (gcdat->{i_mapping,i_btnode_cache}) to its original cache + * (dat->{i_mapping,i_btnode_cache}). + * + * 2) Some B-tree operations like insertion or deletion may dispose buffers + * in dirty state, and this needs to cancel the dirty state of their pages. + */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h new file mode 100644 index 000000000000..8abca4d1c1f8 --- /dev/null +++ b/fs/nilfs2/page.h @@ -0,0 +1,76 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. + */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include <linux/buffer_head.h> +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, +}; + +BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) + + +void nilfs_mark_buffer_dirty(struct buffer_head *bh); +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); +struct page *nilfs_alloc_private_page(struct block_device *, int, + unsigned long); +void nilfs_free_private_page(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_pages(struct address_space *); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c new file mode 100644 index 000000000000..6ade0963fc1d --- /dev/null +++ b/fs/nilfs2/recovery.c @@ -0,0 +1,929 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "seglist.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SEGSUM, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + "NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SEGSUM: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment summary\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +static void store_segsum_info(struct nilfs_segsum_info *ssi, + struct nilfs_segment_summary *sum, + unsigned int blocksize) +{ + ssi->flags = le16_to_cpu(sum->ss_flags); + ssi->seg_seq = le64_to_cpu(sum->ss_seq); + ssi->ctime = le64_to_cpu(sum->ss_create); + ssi->next = le64_to_cpu(sum->ss_next); + ssi->nblocks = le32_to_cpu(sum->ss_nblocks); + ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo); + ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes); + + ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); + ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); +} + +/** + * calc_crc_cont - check CRC of blocks continuously + * @sbi: nilfs_sb_info + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: number of blocks to be checked + */ +static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, + u32 *sum, unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned long blocksize = sbi->s_super->s_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = crc32_le(sbi->s_nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh + = sb_bread(sbi->s_super, ++start); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @sb: super_block + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = sb_bread(sb, sr_block); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > sb->s_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, + sizeof(sr->sr_sum), bytes, sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * load_segment_summary - read segment summary of the specified partial segment + * @sbi: nilfs_sb_info + * @pseg_start: start disk block number of partial segment + * @seg_seq: sequence number requested + * @ssi: pointer to nilfs_segsum_info struct to store information + * @full_check: full check flag + * (0: only checks segment summary CRC, 1: data CRC) + */ +static int +load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, + u64 seg_seq, struct nilfs_segsum_info *ssi, + int full_check) +{ + struct buffer_head *bh_sum; + struct nilfs_segment_summary *sum; + unsigned long offset, nblock; + u64 check_bytes; + u32 crc, crc_sum; + int ret = NILFS_SEG_FAIL_IO; + + bh_sum = sb_bread(sbi->s_super, pseg_start); + if (!bh_sum) + goto out; + + sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + /* Check consistency of segment summary */ + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) { + ret = NILFS_SEG_FAIL_MAGIC; + goto failed; + } + store_segsum_info(ssi, sum, sbi->s_super->s_blocksize); + if (seg_seq != ssi->seg_seq) { + ret = NILFS_SEG_FAIL_SEQ; + goto failed; + } + if (full_check) { + offset = sizeof(sum->ss_datasum); + check_bytes = + ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits); + nblock = ssi->nblocks; + crc_sum = le32_to_cpu(sum->ss_datasum); + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + } else { /* only checks segment summary */ + offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum); + check_bytes = ssi->sumbytes; + nblock = ssi->nsumblk; + crc_sum = le32_to_cpu(sum->ss_sumsum); + ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM; + } + + if (unlikely(nblock == 0 || + nblock > sbi->s_nilfs->ns_blocks_per_segment)) { + /* This limits the number of blocks read in the CRC check */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, + pseg_start, nblock)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + if (crc == crc_sum) + ret = 0; + failed: + brelse(bh_sum); + out: + return ret; +} + +static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + 1); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return ptr; +} + +static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + bcnt); + } +} + +static int +collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, + struct nilfs_segsum_info *ssi, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + unsigned long nfinfo = ssi->nfinfo; + sector_t blocknr = sum_blocknr + ssi->nsumblk; + ino_t ino; + int err = -EIO; + + if (!nfinfo) + return 0; + + bh = sb_bread(sbi->s_super, sum_blocknr); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu( + ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = segsum_get(sbi->s_super, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for the data sync segments */ + segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb + = list_entry(head->next, + struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent + = list_entry(head->next, + struct nilfs_segment_entry, list); + list_del(&ent->list); + nilfs_free_segment_entry(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + time_t mtime; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + err = -ENOMEM; + for (i = 1; i < 4; i++) { + ent = nilfs_alloc_segment_entry(segnum[i]); + if (unlikely(!ent)) + goto failed; + list_add_tail(&ent->list, head); + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. + */ + mtime = get_seconds(); + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum == segnum[0]) { + list_del(&ent->list); + nilfs_free_segment_entry(ent); + continue; + } + err = nilfs_open_segment_entry(ent, sufile); + if (unlikely(err)) + goto failed; + if (!nilfs_segment_usage_dirty(ent->raw_su)) { + /* make the segment garbage */ + ent->raw_su->su_nblocks = cpu_to_le32(0); + ent->raw_su->su_lastmod = cpu_to_le32(mtime); + nilfs_segment_usage_set_dirty(ent->raw_su); + } + list_del(&ent->list); + nilfs_close_segment_entry(ent, sufile); + nilfs_free_segment_entry(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + return 0; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + return err; +} + +static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = sb_bread(sbi->s_super, rb->blocknr); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh_org); + return 0; +} + +static int recover_dsync_blocks(struct nilfs_sb_info *sbi, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = sbi->s_super->s_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sbi->s_super, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + page = NULL; + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, + 0, &page, NULL, nilfs_get_block); + if (unlikely(err)) + goto failed_inode; + + err = nilfs_recovery_copy_block(sbi, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(sbi, inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, rb->ino, (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @sbi: nilfs_sb_info + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + nilfs_attach_writer(nilfs, sbi); + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + if (unlikely(NILFS_SEG_HAS_SR(&ssi))) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + empty_seg = 0; + nilfs->ns_ctime = ssi.ctime; + if (!(ssi.flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = ssi.ctime; + + switch (state) { + case RF_INIT_ST: + if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!NILFS_SEG_DSYNC(&ssi)) + goto confused; + + err = collect_blocks_from_segsum( + sbi, pseg_start, &ssi, &dsync_blocks); + if (unlikely(err)) + goto failed; + if (NILFS_SEG_LOGEND(&ssi)) { + err = recover_dsync_blocks( + sbi, &dsync_blocks, &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sbi->s_super->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + dispose_recovery_list(&dsync_blocks); + nilfs_detach_writer(sbi->s_nilfs, sbi); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). ", + sbi->s_super->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_recover_logical_segments - salvage logical segments written after + * the latest super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_recover_logical_segments(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sbi, ri->ri_cno); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sbi, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_segment_constructor(sbi); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sbi->s_super); + nilfs_detach_segment_constructor(sbi); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, sbi, ri); + } + + nilfs_detach_checkpoint(sbi); + return 0; + + failed: + nilfs_detach_checkpoint(sbi); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_dat); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + */ +int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + struct nilfs_segment_entry *ent; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + for (;;) { + /* Load segment summary */ + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + pseg_end = pseg_start + ssi.nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + if (!NILFS_SEG_HAS_SR(&ssi)) { + if (!scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (NILFS_SEG_LOGEND(&ssi)) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. */ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) + + ssi.nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = ssi.ctime; + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + /* reset region for roll-forward */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. + */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ent = nilfs_alloc_segment_entry(segnum); + if (unlikely(!ent)) { + ret = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &segments); + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + list_splice(&segments, ri->ri_used_segments.prev); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? ret : nilfs_warn_segment_error(ret); +} diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h new file mode 100644 index 000000000000..adccd4fc654e --- /dev/null +++ b/fs/nilfs2/sb.h @@ -0,0 +1,102 @@ +/* + * sb.h - NILFS on-memory super block structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_SB +#define _NILFS_SB + +#include <linux/types.h> +#include <linux/fs.h> + +/* + * Mount options + */ +struct nilfs_mount_options { + unsigned long mount_opt; + __u64 snapshot_cno; +}; + +struct the_nilfs; +struct nilfs_sc_info; + +/* + * NILFS super-block data in memory + */ +struct nilfs_sb_info { + /* Snapshot status */ + __u64 s_snapshot_cno; /* Checkpoint number */ + atomic_t s_inodes_count; + atomic_t s_blocks_count; /* Reserved (might be deleted) */ + + /* Mount options */ + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + + unsigned long s_interval; /* construction interval */ + unsigned long s_watermark; /* threshold of data amount + for the segment construction */ + + /* Fundamental members */ + struct super_block *s_super; /* reverse pointer to super_block */ + struct the_nilfs *s_nilfs; + struct list_head s_list; /* list head for nilfs->ns_supers */ + + /* Segment constructor */ + struct list_head s_dirty_files; /* dirty files list */ + struct nilfs_sc_info *s_sc_info; /* segment constructor info */ + spinlock_t s_inode_lock; /* Lock for the nilfs inode. + It covers s_dirty_files list */ + + /* Metadata files */ + struct inode *s_ifile; /* index file inode */ + + /* Inode allocator */ + spinlock_t s_next_gen_lock; + u32 s_next_generation; +}; + +static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi) +{ + return sbi->s_sc_info; +} + +/* + * Bit operations for the mount option + */ +#define nilfs_clear_opt(sbi, opt) \ + do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(sbi, opt) \ + do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(sbi, mask, opt) \ + do { (sbi)->s_mount_opt = \ + (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +#endif /* _NILFS_SB */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c new file mode 100644 index 000000000000..1e68821b4a9b --- /dev/null +++ b/fs/nilfs2/segbuf.c @@ -0,0 +1,439 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/crc32.h> +#include "page.h" +#include "segbuf.h" +#include "seglist.h" + + +static struct kmem_cache *nilfs_segbuf_cachep; + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +int __init nilfs_init_segbuf_cache(void) +{ + nilfs_segbuf_cachep = + kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), + 0, SLAB_RECLAIM_ACCOUNT, + nilfs_segbuf_init_once); + + return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0; +} + +void nilfs_destroy_segbuf_cache(void) +{ + kmem_cache_destroy(nilfs_segbuf_cachep); +} + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + + segbuf->sb_io_error = 0; + return 0; +} + +/* + * Setup segument summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; +} + +/* + * CRC calculation routines + */ +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page, KM_USER0); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + if (buffer_nilfs_allocated(bh)) { + struct page *clone_page = bh->b_page; + + /* remove clone page */ + brelse(bh); + page_cache_release(clone_page); /* for each bh */ + if (page_count(clone_page) <= 2) { + lock_page(clone_page); + nilfs_free_private_page(clone_page); + } + continue; + } + brelse(bh); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_write_info *wi = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + bio_put(bio); + /* to be detected by submit_seg_bio() */ + } + + if (!uptodate) + atomic_inc(&wi->err); + + bio_put(bio); + complete(&wi->bio_event); +} + +static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { + wait_for_completion(&wi->bio_event); + wi->nbio--; + if (unlikely(atomic_read(&wi->err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = wi; + bio_get(bio); + submit_bio(mode, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + wi->nbio++; + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a bio for writing segment. + * @sb: super block + * @start: beginning disk block number of this BIO. + * @nr_vecs: request size of page vector. + * + * alloc_seg_bio() allocates a new BIO structure and initialize it. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned. + */ +static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = sb->s_bdev; + bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); + } + return bio; +} + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->nbio = 0; + wi->blocknr = segbuf->sb_pseg_start; + + atomic_set(&wi->err, 0); + init_completion(&wi->bio_event); +} + +static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, + int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_submit_seg_bio(wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + struct buffer_head *bh; + int res, rw = WRITE; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi->bio) { + /* + * Last BIO is always sent through the following + * submission. + */ + rw |= (1 << BIO_RW_SYNCIO); + res = nilfs_submit_seg_bio(wi, rw); + if (unlikely(res)) + goto failed_bio; + } + + res = 0; + out: + return res; + + failed_bio: + atomic_inc(&wi->err); + goto out; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @wi: nilfs_write_info + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + int err = 0; + + if (!wi->nbio) + return 0; + + do { + wait_for_completion(&wi->bio_event); + } while (--wi->nbio > 0); + + if (unlikely(atomic_read(&wi->err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + segbuf->sb_io_error = 1; + } + return err; +} diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h new file mode 100644 index 000000000000..0c3076f4e592 --- /dev/null +++ b/fs/nilfs2/segbuf.h @@ -0,0 +1,201 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + time_t ctime; + sector_t next; +}; + +/* macro for the flags */ +#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR) +#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN) +#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND) +#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT) +#define NILFS_SEG_SIMPLEX(sum) \ + (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \ + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) + +#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk) + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_io_error: I/O error status + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + + /* io status */ + int sb_io_error; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + + +int __init nilfs_init_segbuf_cache(void); +void nilfs_destroy_segbuf_cache(void); +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32); +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32); + +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_release_buffers(struct list_head *); + +static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); +} + +struct nilfs_write_info { + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; + + int nbio; + atomic_t err; + struct completion bio_event; + /* completion event of segment write */ + + /* + * The following fields must be set explicitly + */ + struct super_block *sb; + struct backing_dev_info *bdi; /* backing dev info */ + struct buffer_head *bh_sr; +}; + + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_wait(struct nilfs_segment_buffer *, + struct nilfs_write_info *); + +#endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h new file mode 100644 index 000000000000..d39df9144e99 --- /dev/null +++ b/fs/nilfs2/seglist.h @@ -0,0 +1,85 @@ +/* + * seglist.h - expediential structure and routines to handle list of segments + * (would be removed in a future release) + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGLIST_H +#define _NILFS_SEGLIST_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sufile.h" + +struct nilfs_segment_entry { + __u64 segnum; + +#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisonally. + It must be cancelled if + construction aborted */ + + unsigned flags; + struct list_head list; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; +}; + + +void nilfs_dispose_segment_list(struct list_head *); + +static inline struct nilfs_segment_entry * +nilfs_alloc_segment_entry(__u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (likely(ent)) { + ent->segnum = segnum; + ent->flags = 0; + ent->bh_su = NULL; + ent->raw_su = NULL; + INIT_LIST_HEAD(&ent->list); + } + return ent; +} + +static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + return nilfs_sufile_get_segment_usage(sufile, ent->segnum, + &ent->raw_su, &ent->bh_su); +} + +static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + if (!ent->bh_su) + return; + nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su); + ent->bh_su = NULL; + ent->raw_su = NULL; +} + +static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent) +{ + kfree(ent); +} + +#endif /* _NILFS_SEGLIST_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c new file mode 100644 index 000000000000..fb70ec3be20e --- /dev/null +++ b/fs/nilfs2/segment.c @@ -0,0 +1,2977 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/crc32.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "seglist.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Other definitions + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, + int); + +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +/* + * Transaction + */ +static struct kmem_cache *nilfs_transaction_cachep; + +/** + * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info + * + * nilfs_init_transaction_cache() creates a slab cache for the struct + * nilfs_transaction_info. + * + * Return Value: On success, it returns 0. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_init_transaction_cache(void) +{ + nilfs_transaction_cachep = + kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0; +} + +/** + * nilfs_detroy_transaction_cache - destroy the cache for transaction info + * + * nilfs_destroy_transaction_cache() frees the slab cache for the struct + * nilfs_transaction_info. + */ +void nilfs_destroy_transaction_cache(void) +{ + kmem_cache_destroy(nilfs_transaction_cachep); +} + +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). + */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make a segment construction and write tasks + * exclusive. The function is used with nilfs_transaction_commit() in pairs. + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; othewise a new struct is assigned from a slab. + * + * When @vacancy_check flag is set, this function will check the amount of + * free space, and will wait for the GC to reclaim disk space if low capacity. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct nilfs_sb_info *sbi; + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) + return 0; + + sbi = NILFS_SB(sb); + nilfs = sbi->s_nilfs; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin(). This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor. If a sync flag is set, it starts construction + * directly. + */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_sb_info *sbi; + struct nilfs_sc_info *sci; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + sbi = NILFS_SB(sb); + sci = NILFS_SC(sbi); + if (sci != NULL) { + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > + sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return err; +} + +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); +} + +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } + downgrade_write(&nilfs->ns_segctor_sem); +} + +static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + INIT_LIST_HEAD(&ti->ti_garbage); + current->journal_info = ti; + + for (;;) { + down_write(&sbi->s_nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + +static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + if (!list_empty(&ti->ti_garbage)) + nilfs_dispose_list(sbi, &ti->ti_garbage, 0); +} + +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC; + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* Size of finfo and binfo is enough small against blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + + if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(ii->i_cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + +static int nilfs_handle_bmap_error(int err, const char *fname, + struct inode *inode, struct super_block *sb) +{ + if (err == -EINVAL) { + nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + return 0; +} + +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +struct nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + +struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + +struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to per-page; extra dirty buffers + * may be included if blocksize < pagesize. + */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + if (mapping->host) { + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, + 1 << inode->i_blkbits, 0); + unlock_page(page); + } + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh)) + continue; + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) { + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +static void nilfs_dispose_list(struct nilfs_sb_info *sbi, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &sbi->s_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&sbi->s_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + +static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int ret = 0; + + if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if (ret || nilfs_doing_gc()) + if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + list_empty(&sci->sc_cleaning_segments) && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int ret = 0; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&sbi->s_inode_lock); + if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&sbi->s_inode_lock); + return ret; +} + +static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear_dirty(sbi->s_ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); +} + +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* The following code is duplicated with cpfile. But, it is + needed to collect the checkpoint even if it was not newly + created */ + nilfs_mdt_mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic_read(&sbi->s_inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic_read(&sbi->s_blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else + nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, + struct inode *ifile) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +/* + * CRC calculation routines + */ +static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed) +{ + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + u32 crc; + + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + +static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci, + u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_super_root) + nilfs_fill_in_super_root_crc(sci->sc_super_root, seed); + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr = sci->sc_super_root; + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + unsigned isz = nilfs->ns_inode_size; + + raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); + raw_sr->sr_nongc_ctime + = cpu_to_le64(nilfs_doing_gc() ? + nilfs->ns_nongc_ctime : sci->sc_seg_ctime); + raw_sr->sr_flags = 0; + + nilfs_mdt_write_inode_direct( + nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); + nilfs_mdt_write_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); + nilfs_mdt_write_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); +} + +static void nilfs_redirty_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (test_bit(NILFS_I_COLLECTED, &ii->i_state)) + clear_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +static void nilfs_drop_collected_inodes(struct list_head *head) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, head, i_dirty) { + if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) + continue; + + clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + set_bit(NILFS_I_UPDATED, &ii->i_state); + } +} + +static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci, + struct inode *sufile) + +{ + struct list_head *head = &sci->sc_cleaning_segments; + struct nilfs_segment_entry *ent; + int err; + + list_for_each_entry(ent, head, list) { + if (!(ent->flags & NILFS_SLH_FREED)) + break; + err = nilfs_sufile_cancel_free(sufile, ent->segnum); + WARN_ON(err); /* do not happen */ + ent->flags &= ~NILFS_SLH_FREED; + } +} + +static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct list_head *head = &sci->sc_cleaning_segments; + struct nilfs_segment_entry *ent; + int err; + + list_for_each_entry(ent, head, list) { + err = nilfs_sufile_free(sufile, ent->segnum); + if (unlikely(err)) + return err; + ent->flags |= NILFS_SLH_FREED; + } + return 0; +} + +static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, + struct inode *inode, + struct list_head *listp, + int (*collect)(struct nilfs_sc_info *, + struct buffer_head *, + struct inode *)) +{ + struct buffer_head *bh, *n; + int err = 0; + + if (collect) { + list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + err = collect(sci, bh, inode); + brelse(bh); + if (unlikely(err)) + goto dispose_buffers; + } + return 0; + } + + dispose_buffers: + while (!list_empty(listp)) { + bh = list_entry(listp->next, struct buffer_head, + b_assoc_buffers); + list_del_init(&bh->b_assoc_buffers); + brelse(bh); + } + return err; +} + +static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) +{ + /* Remaining number of blocks within segment buffer */ + return sci->sc_segbuf_nblocks - + (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks); +} + +static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, + struct inode *inode, + struct nilfs_sc_operations *sc_ops) +{ + LIST_HEAD(data_buffers); + LIST_HEAD(node_buffers); + int err; + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + size_t n, rest = nilfs_segctor_buffer_rest(sci); + + n = nilfs_lookup_dirty_data_buffers( + inode, &data_buffers, rest + 1, 0, LLONG_MAX); + if (n > rest) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, + sc_ops->collect_data); + BUG_ON(!err); /* always receive -E2BIG or true error */ + goto break_or_fail; + } + } + nilfs_lookup_dirty_node_buffers(inode, &node_buffers); + + if (!(sci->sc_stage.flags & NILFS_CF_NODE)) { + err = nilfs_segctor_apply_buffers( + sci, inode, &data_buffers, sc_ops->collect_data); + if (unlikely(err)) { + /* dispose node list */ + nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, NULL); + goto break_or_fail; + } + sci->sc_stage.flags |= NILFS_CF_NODE; + } + /* Collect node */ + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_node); + if (unlikely(err)) + goto break_or_fail; + + nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers); + err = nilfs_segctor_apply_buffers( + sci, inode, &node_buffers, sc_ops->collect_bmap); + if (unlikely(err)) + goto break_or_fail; + + nilfs_segctor_end_finfo(sci, inode); + sci->sc_stage.flags &= ~NILFS_CF_NODE; + + break_or_fail: + return err; +} + +static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci, + struct inode *inode) +{ + LIST_HEAD(data_buffers); + size_t n, rest = nilfs_segctor_buffer_rest(sci); + int err; + + n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1, + sci->sc_dsync_start, + sci->sc_dsync_end); + + err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers, + nilfs_collect_file_data); + if (!err) { + nilfs_segctor_end_finfo(sci, inode); + BUG_ON(n > rest); + /* always receive -E2BIG or true error if n > rest */ + } + return err; +} + +static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct list_head *head; + struct nilfs_inode_info *ii; + int err = 0; + + switch (sci->sc_stage.scnt) { + case NILFS_ST_INIT: + /* Pre-processes */ + sci->sc_stage.flags = 0; + + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) { + sci->sc_nblk_inc = 0; + sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; + if (mode == SC_LSEG_DSYNC) { + sci->sc_stage.scnt = NILFS_ST_DSYNC; + goto dsync_mode; + } + } + + sci->sc_stage.dirty_file_ptr = NULL; + sci->sc_stage.gc_inode_ptr = NULL; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DAT; + goto dat_stage; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_GC: + if (nilfs_doing_gc()) { + head = &sci->sc_gc_inodes; + ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr, + head, i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + err = nilfs_segctor_scan_file( + sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.gc_inode_ptr = list_entry( + ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } + sci->sc_stage.gc_inode_ptr = NULL; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_FILE: + head = &sci->sc_dirty_files; + ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, + i_dirty); + list_for_each_entry_continue(ii, head, i_dirty) { + clear_bit(NILFS_I_DIRTY, &ii->i_state); + + err = nilfs_segctor_scan_file(sci, &ii->vfs_inode, + &nilfs_sc_file_ops); + if (unlikely(err)) { + sci->sc_stage.dirty_file_ptr = + list_entry(ii->i_dirty.prev, + struct nilfs_inode_info, + i_dirty); + goto break_or_fail; + } + /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */ + /* XXX: required ? */ + } + sci->sc_stage.dirty_file_ptr = NULL; + if (mode == SC_FLUSH_FILE) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; + sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; + /* Fall through */ + case NILFS_ST_IFILE: + err = nilfs_segctor_scan_file(sci, sbi->s_ifile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; + /* Creating a checkpoint */ + err = nilfs_segctor_create_checkpoint(sci); + if (unlikely(err)) + break; + /* Fall through */ + case NILFS_ST_CPFILE: + err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SUFILE: + err = nilfs_segctor_prepare_free_segments(sci, + nilfs->ns_sufile); + if (unlikely(err)) + break; + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, + &nilfs_sc_file_ops); + if (unlikely(err)) + break; + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_DAT: + dat_stage: + err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs), + &nilfs_sc_dat_ops); + if (unlikely(err)) + break; + if (mode == SC_FLUSH_DAT) { + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + } + sci->sc_stage.scnt++; /* Fall through */ + case NILFS_ST_SR: + if (mode == SC_LSEG_SR) { + /* Appending a super root */ + err = nilfs_segctor_add_super_root(sci); + if (unlikely(err)) + break; + } + /* End of a logical segment */ + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DSYNC: + dsync_mode: + sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT; + ii = sci->sc_dsync_inode; + if (!test_bit(NILFS_I_BUSY, &ii->i_state)) + break; + + err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode); + if (unlikely(err)) + break; + sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; + sci->sc_stage.scnt = NILFS_ST_DONE; + return 0; + case NILFS_ST_DONE: + return 0; + default: + BUG(); + } + + break_or_fail: + return err; +} + +static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + int err; + + err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su); + if (unlikely(err)) + return err; + nilfs_mdt_mark_buffer_dirty(bh_su); + nilfs_mdt_mark_dirty(sufile); + nilfs_sufile_put_segment_usage(sufile, segnum, bh_su); + return 0; +} + +static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf, *n; + __u64 nextnum; + int err; + + if (list_empty(&sci->sc_segbufs)) { + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + list_add(&segbuf->sb_list, &sci->sc_segbufs); + } else + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, + nilfs); + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; + + err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); + if (unlikely(err)) + return err; + + if (nilfs->ns_segnum == nilfs->ns_nextnum) { + /* Start from the head of a new full segment */ + err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); + if (unlikely(err)) + return err; + } else + nextnum = nilfs->ns_nextnum; + + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); + + /* truncating segment buffers */ + list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, + sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + return 0; +} + +static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int nadd) +{ + struct nilfs_segment_buffer *segbuf, *prev, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 nextnextnum; + LIST_HEAD(list); + int err, ret, i; + + prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + /* + * Since the segment specified with nextnum might be allocated during + * the previous construction, the buffer including its segusage may + * not be dirty. The following call ensures that the buffer is dirty + * and will pin the buffer on memory until the sufile is written. + */ + err = nilfs_touch_segusage(sufile, prev->sb_nextnum); + if (unlikely(err)) + return err; + + for (i = 0; i < nadd; i++) { + /* extend segment info */ + err = -ENOMEM; + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + goto failed; + + /* map this buffer to region of segment on-disk */ + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks; + + /* allocate the next next full segment */ + err = nilfs_sufile_alloc(sufile, &nextnextnum); + if (unlikely(err)) + goto failed_segbuf; + + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1; + nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs); + + list_add_tail(&segbuf->sb_list, &list); + prev = segbuf; + } + list_splice(&list, sci->sc_segbufs.prev); + return 0; + + failed_segbuf: + nilfs_segbuf_free(segbuf); + failed: + list_for_each_entry_safe(segbuf, n, &list, sb_list) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + return err; +} + +static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct nilfs_segment_buffer *segbuf; + int ret, done = 0; + + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + if (nilfs->ns_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + } + if (segbuf->sb_io_error) { + /* Case 1: The first segment failed */ + if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) + /* Case 1a: Partial segment appended into an existing + segment */ + nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, + segbuf->sb_fseg_end); + else /* Case 1b: New full segment */ + set_nilfs_discontinued(nilfs); + done++; + } + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ + if (!done && segbuf->sb_io_error) { + if (segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(nilfs->ns_sufile, + segbuf->sb_segnum); + done++; + } + } +} + +static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) + nilfs_segbuf_clear(segbuf); + sci->sc_super_root = NULL; +} + +static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + + while (!list_empty(&sci->sc_segbufs)) { + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + list_del_init(&segbuf->sb_list); + nilfs_segbuf_free(segbuf); + } + /* sci->sc_curseg = NULL; */ +} + +static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + if (unlikely(err)) { + nilfs_segctor_free_incomplete_segments(sci, nilfs); + nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + } + nilfs_segctor_clear_segment_buffers(sci); +} + +static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + unsigned long live_blocks; + int ret; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeed because bh_su is dirty */ + live_blocks = segbuf->sb_sum.nblocks + + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); + raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); + raw_su->su_nblocks = cpu_to_le32(live_blocks); + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, + bh_su); + } +} + +static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; + int ret; + + segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeed because bh_su is dirty */ + raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - + segbuf->sb_fseg_start); + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su); + + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { + ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, + &raw_su, &bh_su); + WARN_ON(ret); /* always succeed */ + raw_su->su_nblocks = 0; + nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, + bh_su); + } +} + +static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *last, + struct inode *sufile) +{ + struct nilfs_segment_buffer *segbuf = last, *n; + int ret; + + list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, + sb_list) { + list_del_init(&segbuf->sb_list); + sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); + nilfs_segbuf_free(segbuf); + } +} + + +static int nilfs_segctor_collect(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int mode) +{ + struct nilfs_cstage prev_stage = sci->sc_stage; + int err, nadd = 1; + + /* Collection retry loop */ + for (;;) { + sci->sc_super_root = NULL; + sci->sc_nblk_this_inc = 0; + sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + + err = nilfs_segctor_reset_segment_buffer(sci); + if (unlikely(err)) + goto failed; + + err = nilfs_segctor_collect_blocks(sci, mode); + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (!err) + break; + + if (unlikely(err != -E2BIG)) + goto failed; + + /* The current segment is filled up */ + if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + break; + + nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + nilfs_segctor_clear_segment_buffers(sci); + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); + sci->sc_stage = prev_stage; + } + nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); + return 0; + + failed: + return err; +} + +static void nilfs_list_replace_buffer(struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + BUG_ON(!list_empty(&new_bh->b_assoc_buffers)); + + list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers); + /* The caller must release old_bh */ +} + +static int +nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, + struct nilfs_segment_buffer *segbuf, + int mode) +{ + struct inode *inode = NULL; + sector_t blocknr; + unsigned long nfinfo = segbuf->sb_sum.nfinfo; + unsigned long nblocks = 0, ndatablk = 0; + struct nilfs_sc_operations *sc_op = NULL; + struct nilfs_segsum_pointer ssp; + struct nilfs_finfo *finfo = NULL; + union nilfs_binfo binfo; + struct buffer_head *bh, *bh_org; + ino_t ino = 0; + int err = 0; + + if (!nfinfo) + goto out; + + blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk; + ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + ssp.offset = sizeof(struct nilfs_segment_summary); + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + if (bh == sci->sc_super_root) + break; + if (!finfo) { + finfo = nilfs_segctor_map_segsum_entry( + sci, &ssp, sizeof(*finfo)); + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + + if (buffer_nilfs_node(bh)) + inode = NILFS_BTNC_I(bh->b_page->mapping); + else + inode = NILFS_AS_I(bh->b_page->mapping); + + if (mode == SC_LSEG_DSYNC) + sc_op = &nilfs_sc_dsync_ops; + else if (ino == NILFS_DAT_INO) + sc_op = &nilfs_sc_dat_ops; + else /* file blocks */ + sc_op = &nilfs_sc_file_ops; + } + bh_org = bh; + get_bh(bh_org); + err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr, + &binfo); + if (bh != bh_org) + nilfs_list_replace_buffer(bh_org, bh); + brelse(bh_org); + if (unlikely(err)) + goto failed_bmap; + + if (ndatablk > 0) + sc_op->write_data_binfo(sci, &ssp, &binfo); + else + sc_op->write_node_binfo(sci, &ssp, &binfo); + + blocknr++; + if (--nblocks == 0) { + finfo = NULL; + if (--nfinfo == 0) + break; + } else if (ndatablk > 0) + ndatablk--; + } + out: + return 0; + + failed_bmap: + err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super); + return err; +} + +static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode); + if (unlikely(err)) + return err; + nilfs_segbuf_fill_in_segsum(segbuf); + } + return 0; +} + +static int +nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) +{ + struct page *clone_page; + struct buffer_head *bh, *head, *bh2; + void *kaddr; + + bh = head = page_buffers(page); + + clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0); + if (unlikely(!clone_page)) + return -ENOMEM; + + bh2 = page_buffers(clone_page); + kaddr = kmap_atomic(page, KM_USER0); + do { + if (list_empty(&bh->b_assoc_buffers)) + continue; + get_bh(bh2); + page_cache_get(clone_page); /* for each bh */ + memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size); + bh2->b_blocknr = bh->b_blocknr; + list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers); + list_add_tail(&bh->b_assoc_buffers, out); + } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head); + kunmap_atomic(kaddr, KM_USER0); + + if (!TestSetPageWriteback(clone_page)) + inc_zone_page_state(clone_page, NR_WRITEBACK); + unlock_page(clone_page); + + return 0; +} + +static int nilfs_test_page_to_be_frozen(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode)) + return 0; + + if (page_mapped(page)) { + ClearPageChecked(page); + return 1; + } + return PageChecked(page); +} + +static int nilfs_begin_page_io(struct page *page, struct list_head *out) +{ + if (!page || PageWriteback(page)) + /* For split b-tree node pages, this function may be called + twice. We ignore the 2nd or later calls by this check. */ + return 0; + + lock_page(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); + + if (nilfs_test_page_to_be_frozen(page)) { + int err = nilfs_copy_replace_page_buffers(page, out); + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, + struct page **failed_page) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct list_head *list = &sci->sc_copied_buffers; + int err; + + *failed_page = NULL; + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + err = nilfs_begin_page_io(fs_page, list); + if (unlikely(err)) { + *failed_page = fs_page; + goto out; + } + fs_page = bh->b_page; + } + } + } + if (bd_page) { + lock_page(bd_page); + clear_page_dirty_for_io(bd_page); + set_page_writeback(bd_page); + unlock_page(bd_page); + } + err = nilfs_begin_page_io(fs_page, list); + if (unlikely(err)) + *failed_page = fs_page; + out: + return err; +} + +static int nilfs_segctor_write(struct nilfs_sc_info *sci, + struct backing_dev_info *bdi) +{ + struct nilfs_segment_buffer *segbuf; + struct nilfs_write_info wi; + int err, res; + + wi.sb = sci->sc_super; + wi.bh_sr = sci->sc_super_root; + wi.bdi = bdi; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_prepare_write(segbuf, &wi); + err = nilfs_segbuf_write(segbuf, &wi); + + res = nilfs_segbuf_wait(segbuf, &wi); + err = unlikely(err) ? : res; + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_page_has_uncleared_buffer(struct page *page) +{ + struct buffer_head *head, *bh; + + head = bh = page_buffers(page); + do { + if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers)) + return 1; + bh = bh->b_this_page; + } while (bh != head); + return 0; +} + +static void __nilfs_end_page_io(struct page *page, int err) +{ + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + if (buffer_nilfs_allocated(page_buffers(page))) { + if (TestClearPageWriteback(page)) + dec_zone_page_state(page, NR_WRITEBACK); + } else + end_page_writeback(page); +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && + nilfs_page_has_uncleared_buffer(page)) + /* For b-tree node pages, this function may be called twice + or more because they might be split in a segment. + This check assures that cleanup has been done for all + buffers in a split btnode page. */ + return; + + __nilfs_end_page_io(page, err); +} + +static void nilfs_clear_copied_buffers(struct list_head *list, int err) +{ + struct buffer_head *bh, *head; + struct page *page; + + while (!list_empty(list)) { + bh = list_entry(list->next, struct buffer_head, + b_assoc_buffers); + page = bh->b_page; + page_cache_get(page); + head = bh = page_buffers(page); + do { + if (!list_empty(&bh->b_assoc_buffers)) { + list_del_init(&bh->b_assoc_buffers); + if (!err) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + } + brelse(bh); /* for b_assoc_buffers */ + } + } while ((bh = bh->b_this_page) != head); + + __nilfs_end_page_io(page, err); + page_cache_release(page); + } +} + +static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, + struct page *failed_page, int err) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, err); + if (unlikely(fs_page == failed_page)) + goto done; + fs_page = bh->b_page; + } + } + } + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, err); + done: + nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); +} + +static void nilfs_set_next_segment(struct the_nilfs *nilfs, + struct nilfs_segment_buffer *segbuf) +{ + nilfs->ns_segnum = segbuf->sb_segnum; + nilfs->ns_nextnum = segbuf->sb_nextnum; + nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start + + segbuf->sb_sum.nblocks; + nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq; + nilfs->ns_ctime = segbuf->sb_sum.ctime; +} + +static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf; + struct page *bd_page = NULL, *fs_page = NULL; + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int update_sr = (sci->sc_super_root != NULL); + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + struct buffer_head *bh; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + if (bh->b_page != bd_page) { + if (bd_page) + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + } + /* + * We assume that the buffers which belong to the same page + * continue over the buffer list. + * Under this assumption, the last BHs of pages is + * identifiable by the discontinuity of bh->b_page + * (page != fs_page). + * + * For B-tree node blocks, however, this assumption is not + * guaranteed. The cleanup code of B-tree node pages needs + * special care. + */ + list_for_each_entry(bh, &segbuf->sb_payload_buffers, + b_assoc_buffers) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + if (bh == sci->sc_super_root) { + if (bh->b_page != bd_page) { + end_page_writeback(bd_page); + bd_page = bh->b_page; + } + break; + } + if (bh->b_page != fs_page) { + nilfs_end_page_io(fs_page, 0); + fs_page = bh->b_page; + } + } + + if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) { + if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) { + set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + sci->sc_lseg_stime = jiffies; + } + if (NILFS_SEG_LOGEND(&segbuf->sb_sum)) + clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); + } + } + /* + * Since pages may continue over multiple segment buffers, + * end of the last page must be checked outside of the loop. + */ + if (bd_page) + end_page_writeback(bd_page); + + nilfs_end_page_io(fs_page, 0); + + nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0); + + nilfs_drop_collected_inodes(&sci->sc_dirty_files); + + if (nilfs_doing_gc()) { + nilfs_drop_collected_inodes(&sci->sc_gc_inodes); + if (update_sr) + nilfs_commit_gcdat_inode(nilfs); + } else + nilfs->ns_nongc_ctime = sci->sc_seg_ctime; + + sci->sc_nblk_inc += sci->sc_nblk_this_inc; + + segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + nilfs_set_next_segment(nilfs, segbuf); + + if (update_sr) { + nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, + segbuf->sb_sum.seg_seq, nilfs->ns_cno++); + sbi->s_super->s_dirt = 1; + + clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); + set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + } else + clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); +} + +static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, + struct nilfs_sb_info *sbi) +{ + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + retry: + list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { + if (!ii->i_bh) { + struct buffer_head *ibh; + int err; + + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block( + sbi->s_ifile, ii->vfs_inode.i_ino, &ibh); + if (unlikely(err)) { + nilfs_warning(sbi->s_super, __func__, + "failed to get inode block.\n"); + return err; + } + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + spin_lock(&sbi->s_inode_lock); + if (likely(!ii->i_bh)) + ii->i_bh = ibh; + else + brelse(ibh); + goto retry; + } + ii->i_cno = cno; + + clear_bit(NILFS_I_QUEUED, &ii->i_state); + set_bit(NILFS_I_BUSY, &ii->i_state); + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); + } + spin_unlock(&sbi->s_inode_lock); + + NILFS_I(sbi->s_ifile)->i_cno = cno; + + return 0; +} + +static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, + struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) { + /* The current checkpoint number (=nilfs->ns_cno) is + changed between check-in and check-out only if the + super root is written out. So, we can update i_cno + for the inodes that remain in the dirty list. */ + ii->i_cno = cno; + continue; + } + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &ti->ti_garbage); + } + spin_unlock(&sbi->s_inode_lock); +} + +/* + * Main procedure of segment constructor + */ +static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct page *failed_page; + int err, has_sr = 0; + + sci->sc_stage.scnt = NILFS_ST_INIT; + + err = nilfs_segctor_check_in_files(sci, sbi); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + has_sr = (sci->sc_super_root != NULL); + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { + nilfs_segctor_end_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); + + if (has_sr) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_make_up; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + err = nilfs_segctor_prepare_write(sci, &failed_page); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs->ns_bdi); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_complete_write(sci); + + /* Commit segments */ + if (has_sr) { + nilfs_segctor_commit_free_segments(sci); + nilfs_segctor_clear_metadata_dirty(sci); + } + + nilfs_segctor_end_construction(sci, nilfs, 0); + + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_destroy_segment_buffers(sci); + nilfs_segctor_check_out_files(sci, sbi); + return err; + + failed_to_write: + nilfs_segctor_abort_write(sci, failed_page, err); + nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile); + + failed_to_make_up: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_end_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_secgtor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore. + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer->expires = jiffies + sci->sc_interval; + add_timer(sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. + */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); + /* assign bit 0 to data files */ +} + +int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci, + __u64 *segnum, size_t nsegs) +{ + struct nilfs_segment_entry *ent; + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct inode *sufile = nilfs->ns_sufile; + LIST_HEAD(list); + __u64 *pnum; + size_t i; + int err; + + for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) { + ent = nilfs_alloc_segment_entry(*pnum); + if (unlikely(!ent)) { + err = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &list); + + err = nilfs_open_segment_entry(ent, sufile); + if (unlikely(err)) + goto failed; + + if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su))) + printk(KERN_WARNING "NILFS: unused segment is " + "requested to be cleaned (segnum=%llu)\n", + (unsigned long long)ent->segnum); + nilfs_close_segment_entry(ent, sufile); + } + list_splice(&list, sci->sc_cleaning_segments.prev); + return 0; + + failed: + nilfs_dispose_segment_list(&list); + return err; +} + +void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + nilfs_test_opt(sbi, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(sbi->s_nilfs)) { + nilfs_transaction_unlock(sbi); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&sbi->s_inode_lock); + nilfs_transaction_unlock(sbi); + return 0; + } + spin_unlock(&sbi->s_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + + nilfs_transaction_unlock(sbi); + return err; +} + +struct nilfs_segctor_req { + int mode; + __u32 seq_accepted; + int sc_err; /* construction failure */ + int sb_err; /* super block writeback failure */ +}; + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +static void nilfs_segctor_accept(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + req->sc_err = req->sb_err = 0; + spin_lock(&sci->sc_state_lock); + req->seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + if (sci->sc_timer) + del_timer_sync(sci->sc_timer); +} + +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + + if (req->mode == SC_LSEG_SR) { + sci->sc_seq_done = req->seq_accepted; + nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); + sci->sc_flush_request = 0; + } else if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + spin_unlock(&sci->sc_state_lock); +} + +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + if (nilfs_discontinued(nilfs)) + req->mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) { + err = nilfs_segctor_do_construct(sci, req->mode); + req->sc_err = err; + } + if (likely(!err)) { + if (req->mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + req->sb_err = nilfs_commit_super(sbi, 0); + up_write(&nilfs->ns_sem); + } + } + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + hlist_del_init(&ii->vfs_inode.i_hash); + list_del_init(&ii->i_dirty); + nilfs_clear_gcinode(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, void __user *argp) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 1); + + err = nilfs_init_gcdat_inode(nilfs); + if (unlikely(err)) + goto out_unlock; + err = nilfs_ioctl_prepare_clean_segments(nilfs, argp); + if (unlikely(err)) + goto out_unlock; + + list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); + + for (;;) { + nilfs_segctor_accept(sci, &req); + err = nilfs_segctor_construct(sci, &req); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + nilfs_segctor_notify(sci, &req); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + + out_unlock: + nilfs_clear_gcdat_inode(nilfs); + nilfs_transaction_unlock(sbi); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = mode }; + + nilfs_transaction_lock(sbi, &ti, 0); + + nilfs_segctor_accept(sci, &req); + nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + + /* + * Unclosed segment should be retried. We do this using sc_timer. + * Timeout of sc_timer will invoke complete construction which leads + * to close the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sbi); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? + SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? + ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. + * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct timer_list timer; + int timeout = 0; + + init_timer(&timer); + timer.data = (unsigned long)current; + timer.function = nilfs_construction_timeout; + sci->sc_timer = &timer; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + refrigerator(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer->expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer->expires)); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + del_timer_sync(sci->sc_timer); + sci->sc_timer = NULL; + + /* end sync. */ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +static int nilfs_segctor_init(struct nilfs_sc_info *sci) +{ + sci->sc_seq_done = sci->sc_seq_request; + + return nilfs_segctor_start_thread(sci); +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) +{ + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_sbi = sbi; + sci->sc_super = sbi->s_super; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + INIT_LIST_HEAD(&sci->sc_cleaning_segments); + INIT_LIST_HEAD(&sci->sc_copied_buffers); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (sbi->s_interval) + sci->sc_interval = sbi->s_interval; + if (sbi->s_watermark) + sci->sc_watermark = sbi->s_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + + /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + + nilfs_transaction_lock(sbi, &ti, 0); + nilfs_segctor_accept(sci, &req); + ret = nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + nilfs_transaction_unlock(sbi); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. + */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int flag; + + up_write(&sbi->s_nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flag || nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + WARN_ON(!list_empty(&sci->sc_copied_buffers)); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sbi->s_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); + } + + if (!list_empty(&sci->sc_cleaning_segments)) + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); + + WARN_ON(!list_empty(&sci->sc_segbufs)); + + down_write(&sbi->s_nilfs->ns_segctor_sem); + + kfree(sci); +} + +/** + * nilfs_attach_segment_constructor - attach a segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, + * initilizes it, and starts the segment constructor. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + + /* Each field of nilfs_segctor is cleared through the initialization + of super-block info */ + sbi->s_sc_info = nilfs_segctor_new(sbi); + if (!sbi->s_sc_info) + return -ENOMEM; + + nilfs_attach_writer(nilfs, sbi); + err = nilfs_segctor_init(NILFS_SC(sbi)); + if (err) { + nilfs_detach_writer(nilfs, sbi); + kfree(sbi->s_sc_info); + sbi->s_sc_info = NULL; + } + return err; +} + +/** + * nilfs_detach_segment_constructor - destroy the segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_detach_segment_constructor() kills the segment constructor daemon, + * frees the struct nilfs_sc_info, and destroy the dirty file list. + */ +void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (NILFS_SC(sbi)) { + nilfs_segctor_destroy(NILFS_SC(sbi)); + sbi->s_sc_info = NULL; + } + + /* Force to free the list of dirty files */ + spin_lock(&sbi->s_inode_lock); + if (!list_empty(&sbi->s_dirty_files)) { + list_splice_init(&sbi->s_dirty_files, &garbage_list); + nilfs_warning(sbi->s_super, __func__, + "Non empty dirty list after the last " + "segment construction\n"); + } + spin_unlock(&sbi->s_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(sbi, &garbage_list, 1); + nilfs_detach_writer(nilfs, sbi); +} diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h new file mode 100644 index 000000000000..a98fc1ed0bbb --- /dev/null +++ b/fs/nilfs2/segment.h @@ -0,0 +1,243 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sb.h" + +/** + * struct nilfs_recovery_info - Recovery infomation + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lseg_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be mark active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_sbi: Back pointer to nilfs_sb_info struct + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_cleaning_segments: List of segments to be freed through construction + * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_segbuf_nblocks: Number of available blocks in segment buffers. + * @sc_curseg: Current segment buffer + * @sc_super_root: Pointer to the super root buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_sb_info *sc_sbi; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + struct list_head sc_cleaning_segments; + struct list_head sc_copied_buffers; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + struct buffer_head *sc_super_root; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; + time_t sc_seg_ctime; + + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list *sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. + It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. + */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + + +/* segment.c */ +extern int nilfs_init_transaction_cache(void); +extern void nilfs_destroy_transaction_cache(void); +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, void __user *); + +extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *, + __u64 *, size_t); +extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *); + +extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); +extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct super_block *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, + struct nilfs_recovery_info *); +extern int nilfs_recover_logical_segments(struct the_nilfs *, + struct nilfs_sb_info *, + struct nilfs_recovery_info *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c new file mode 100644 index 000000000000..c774cf397e2f --- /dev/null +++ b/fs/nilfs2/sufile.c @@ -0,0 +1,640 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "sufile.h" + + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static inline struct nilfs_sufile_header * +nilfs_sufile_block_get_header(const struct inode *sufile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct the_nilfs *nilfs; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus; + int ret, i, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nilfs = NILFS_MDT(sufile)->mi_nilfs; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr, KM_USER0); + + nsegments = nilfs_sufile_get_nsegments(sufile); + segnum = last_alloc + 1; + maxsegnum = nsegments - 1; + for (i = 0; i < nsegments; i += nsus) { + if (segnum >= nsegments) { + /* wrap around */ + segnum = 0; + maxsegnum = last_alloc; + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header( + sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_cancel_free - + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *header_bh, *su_bh; + struct the_nilfs *nilfs; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nilfs = NILFS_MDT(sufile)->mi_nilfs; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); + if (ret < 0) + goto out_header; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + goto out_su_bh; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + + out_su_bh: + brelse(su_bh); + out_header: + brelse(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_freev - free segments + * @sufile: inode of segment usage file + * @segnum: array of segment numbers + * @nsegs: number of segments + * + * Description: nilfs_sufile_freev() frees segments specified by @segnum and + * @nsegs, which must have been returned by a previous call to + * nilfs_sufile_alloc(). + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +#define NILFS_SUFILE_FREEV_PREALLOC 16 +int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs) +{ + struct buffer_head *header_bh, **su_bh, + *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC]; + struct the_nilfs *nilfs; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + void *kaddr; + int ret, i; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nilfs = NILFS_MDT(sufile)->mi_nilfs; + + /* prepare resources */ + if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC) + su_bh = su_bh_prealloc; + else { + su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS); + if (su_bh == NULL) { + ret = -ENOMEM; + goto out_sem; + } + } + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_su_bh; + for (i = 0; i < nsegs; i++) { + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i], + 0, &su_bh[i]); + if (ret < 0) + goto out_bh; + } + + /* free segments */ + for (i = 0; i < nsegs; i++) { + kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum[i], su_bh[i], kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(su_bh[i]); + } + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, nsegs); + le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + + out_bh: + for (i--; i >= 0; i--) + brelse(su_bh[i]); + brelse(header_bh); + + out_su_bh: + if (su_bh != su_bh_prealloc) + kfree(su_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_free - + * @sufile: + * @segnum: + */ +int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_freev(sufile, &segnum, 1); +} + +/** + * nilfs_sufile_get_segment_usage - get a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @sup: pointer to segment usage + * @bhp: pointer to buffer head + * + * Description: nilfs_sufile_get_segment_usage() acquires the segment usage + * specified by @segnum. + * + * Return Value: On success, 0 is returned, and the segment usage and the + * buffer head of the buffer on which the segment usage is located are stored + * in the place pointed by @sup and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, + struct nilfs_segment_usage **sup, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + /* segnum is 0 origin */ + if (segnum >= nilfs_sufile_get_nsegments(sufile)) + return -EINVAL; + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap(bh->b_page); + brelse(bh); + ret = -EINVAL; + goto out_sem; + } + + if (sup != NULL) + *sup = su; + *bhp = bh; + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_put_segment_usage - put a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @bh: buffer head + * + * Description: nilfs_sufile_put_segment_usage() releases the segment usage + * specified by @segnum. @bh must be the buffer head which have been returned + * by a previous call to nilfs_sufile_get_segment_usage() with @segnum. + */ +void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @stat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @stat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_ncleansegs - get the number of clean segments + * @sufile: inode of segment usage file + * @nsegsp: pointer to the number of clean segments + * + * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean + * segments. + * + * Return Value: On success, 0 is returned and the number of clean segments is + * stored in the place pointed by @nsegsp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) +{ + struct nilfs_sustat sustat; + int ret; + + ret = nilfs_sufile_get_stat(sufile, &sustat); + if (ret == 0) + *nsegsp = sustat.ss_ncleansegs; + return ret; +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. + */ +int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_segment_usage *su; + struct nilfs_sufile_header *header; + void *kaddr; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); + if (ret < 0) + goto out_header; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + goto out_header; + } + + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ndirtysegs, -1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_suinfo - + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @si: array of suinfo + * @nsi: size of suinfo array + * + * Description: + * + * Return Value: On success, 0 is returned and .... On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, + struct nilfs_suinfo *si, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); + continue; + } + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; j++, su = (void *)su + susz) { + si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); + si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); + si[i + j].sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + i + j)) + si[i + j].sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h new file mode 100644 index 000000000000..d595f33a768d --- /dev/null +++ b/fs/nilfs2/sufile.h @@ -0,0 +1,54 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" + +#define NILFS_SUFILE_GFP NILFS_MDT_GFP + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; +} + +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_cancel_free(struct inode *, __u64); +int nilfs_sufile_freev(struct inode *, __u64 *, size_t); +int nilfs_sufile_free(struct inode *, __u64); +int nilfs_sufile_get_segment_usage(struct inode *, __u64, + struct nilfs_segment_usage **, + struct buffer_head **); +void nilfs_sufile_put_segment_usage(struct inode *, __u64, + struct buffer_head *); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); +int nilfs_sufile_set_error(struct inode *, __u64); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, + size_t); + + +#endif /* _NILFS_SUFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c new file mode 100644 index 000000000000..e117e1ea9bff --- /dev/null +++ b/fs/nilfs2/super.c @@ -0,0 +1,1323 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/random.h> +#include <linux/crc32.h> +#include <linux/smp_lock.h> +#include <linux/vfs.h> +#include <linux/writeback.h> +#include <linux/kobject.h> +#include <linux/exportfs.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "page.h" +#include "cpfile.h" +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_VERSION(NILFS_VERSION); +MODULE_LICENSE("GPL"); + +static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags); + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * incoherences or defects of meta data on disk. As for sustainable + * errors such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it can + * kill itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!nilfs_test_opt(sbi, ERRORS_CONT)) + nilfs_detach_segment_constructor(sbi); + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + nilfs->ns_sbp[0]->s_state |= + cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, 1); + } + up_write(&nilfs->ns_sem); + + if (nilfs_test_opt(sbi, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "NILFS warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +static struct kmem_cache *nilfs_inode_cachep; + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->vfs_inode.i_version = 1; + nilfs_btnode_cache_init(&ii->i_btnode_cache); + return &ii->vfs_inode; +} + +void nilfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +static void init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + inode_init_once(&ii->vfs_inode); +} + +static int nilfs_init_inode_cache(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once); + + return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0; +} + +static inline void nilfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(nilfs_inode_cachep); +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + +#ifdef CONFIG_NILFS_POSIX_ACL + if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_acl); + ii->i_acl = NILFS_ACL_NOT_CACHED; + } + if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_default_acl); + ii->i_default_acl = NILFS_ACL_NOT_CACHED; + } +#endif + /* + * Free resources allocated in nilfs_read_inode(), here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + int barrier_done = 0; + + if (nilfs_test_opt(sbi, BARRIER)) { + set_buffer_ordered(nilfs->ns_sbh[0]); + barrier_done = 1; + } + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + if (err == -EOPNOTSUPP && barrier_done) { + nilfs_warning(sbi->s_super, __func__, + "barrier-based sync failed. " + "disabling barriers\n"); + nilfs_clear_opt(sbi, BARRIER); + barrier_done = 0; + clear_buffer_ordered(nilfs->ns_sbh[0]); + goto retry; + } + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + /* + * The latest segment becomes trailable from the position + * written in superblock. + */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + sbp = NULL; + if (dupsb) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (!sync_dirty_buffer(nilfs->ns_sbh[1])) + sbp = nilfs->ns_sbp[1]; + } + } + if (sbp) { + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + } + + return err; +} + +int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + sector_t nfreeblocks; + time_t t; + int err; + + /* nilfs->sem must be locked by the caller. */ + if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { + if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + nilfs_swap_super_block(nilfs); + else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sbi->s_super->s_id); + return -EIO; + } + } + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: failed to count free blocks\n"); + return err; + } + spin_lock(&nilfs->ns_last_segment_lock); + sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); + + t = get_seconds(); + nilfs->ns_sbwtime[0] = t; + sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (dupsb && sbp[1]) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + nilfs->ns_sbwtime[1] = t; + } + sbi->s_super->s_dirt = 0; + return nilfs_sync_super(sbi, dupsb); +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_detach_segment_constructor(sbi); + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } + + nilfs_detach_checkpoint(sbi); + put_nilfs(sbi->s_nilfs); + sbi->s_super = NULL; + sb->s_fs_info = NULL; + kfree(sbi); +} + +/** + * nilfs_write_super - write super block(s) of NILFS + * @sb: super_block + * + * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and + * clears s_dirt. This function is called in the section protected by + * lock_super(). + * + * The s_dirt flag is managed by each filesystem and we protect it by ns_sem + * of the struct the_nilfs. Lock order must be as follows: + * + * 1. lock_super() + * 2. down_write(&nilfs->ns_sem) + * + * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer + * of the super block (nilfs->ns_sbp[]). + * + * In most cases, VFS functions call lock_super() before calling these + * methods. So we must be careful not to bring on deadlocks when using + * lock_super(); see generic_shutdown_super(), write_super(), and so on. + * + * Note that order of lock_kernel() and lock_super() depends on contexts + * of VFS. We should also note that lock_kernel() can be used in its + * protective section and only the outermost one has an effect. + */ +static void nilfs_write_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + down_write(&nilfs->ns_sem); + if (!(sb->s_flags & MS_RDONLY)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 t = get_seconds(); + int dupsb; + + if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && + t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { + up_write(&nilfs->ns_sem); + return; + } + dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + nilfs_commit_super(sbi, dupsb); + } + sb->s_dirt = 0; + up_write(&nilfs->ns_sem); +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + int err = 0; + + /* This function is called when super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + return err; +} + +int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err; + + down_write(&nilfs->ns_sem); + list_add(&sbi->s_list, &nilfs->ns_supers); + up_write(&nilfs->ns_sem); + + sbi->s_ifile = nilfs_mdt_new( + nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + if (!sbi->s_ifile) + return -ENOMEM; + + err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size); + if (unlikely(err)) + goto failed; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); + if (unlikely(err)) + goto failed_bh; + atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); + atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + return 0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); + + return err; +} + +void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear(sbi->s_ifile); + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); +} + +static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + nilfs->ns_mount_state |= NILFS_VALID_FS; + err = nilfs_commit_super(sbi, 1); + if (likely(!err)) + printk(KERN_INFO "NILFS: recovery complete.\n"); + } + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before first segment and after last segment + * are excluded. + */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing meta data blocks outside semgent structure, + * We must count them as the overhead. + */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = atomic_read(&sbi->s_inodes_count); + buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_namelen = NILFS_NAME_LEN; + return 0; +} + +static struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + /* .write_inode = nilfs_write_inode, */ + /* .put_inode = nilfs_put_inode, */ + /* .drop_inode = nilfs_drop_inode, */ + .delete_inode = nilfs_delete_inode, + .put_super = nilfs_put_super, + .write_super = nilfs_write_super, + .sync_fs = nilfs_sync_fs, + /* .write_super_lockfs */ + /* .unlockfs */ + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + .clear_inode = nilfs_clear_inode, + /* .umount_begin */ + /* .show_options */ +}; + +static struct inode * +nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO && + ino != NILFS_SKETCH_INO) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry * +nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct dentry * +nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct export_operations nilfs_export_ops = { + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_snapshot, Opt_order, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier=%s"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_err, NULL} +}; + +static int match_bool(substring_t *s, int *result) +{ + int len = s->to - s->from; + + if (strncmp(s->from, "on", len) == 0) + *result = 1; + else if (strncmp(s->from, "off", len) == 0) + *result = 0; + else + return 1; + return 0; +} + +static int parse_options(char *options, struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + if (match_bool(&args[0], &option)) + return 0; + if (option) + nilfs_set_opt(sbi, BARRIER); + else + nilfs_clear_opt(sbi, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(sbi, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(sbi, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (match_int(&args[0], &option) || option <= 0) + return 0; + if (!(sb->s_flags & MS_RDONLY)) + return 0; + sbi->s_snapshot_cno = option; + nilfs_set_opt(sbi, SNAPSHOT); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct nilfs_sb_info *sbi, + struct nilfs_super_block *sbp) +{ + sbi->s_mount_opt = + NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); + int mnt_count = le16_to_cpu(sbp->s_mnt_count); + + /* nilfs->sem must be locked by the caller. */ + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); + sbp->s_mtime = cpu_to_le64(get_seconds()); + return nilfs_commit_super(sbi, 1); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + nilfs_set_default_options(sbi, sbp); + + sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); + sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); + sbi->s_interval = le32_to_cpu(sbp->s_c_interval); + sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb) ? -EINVAL : 0 ; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * @nilfs: the_nilfs struct + * + * This function is called exclusively by bd_mount_mutex. + * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent, + struct the_nilfs *nilfs) +{ + struct nilfs_sb_info *sbi; + struct inode *root; + __u64 cno; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + get_nilfs(nilfs); + sbi->s_nilfs = nilfs; + sbi->s_super = sb; + + err = init_nilfs(nilfs, sbi, (char *)data); + if (err) + goto failed_sbi; + + spin_lock_init(&sbi->s_inode_lock); + INIT_LIST_HEAD(&sbi->s_dirty_files); + INIT_LIST_HEAD(&sbi->s_list); + + /* + * Following initialization is overlapped because + * nilfs_sb_info structure has been cleared at the beginning. + * But we reserve them to keep our interest and make ready + * for the future change. + */ + get_random_bytes(&sbi->s_next_generation, + sizeof(sbi->s_next_generation)); + spin_lock_init(&sbi->s_next_gen_lock); + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + + if (!nilfs_loaded(nilfs)) { + err = load_nilfs(nilfs, sbi); + if (err) + goto failed_sbi; + } + cno = nilfs_last_cno(nilfs); + + if (sb->s_flags & MS_RDONLY) { + if (nilfs_test_opt(sbi, SNAPSHOT)) { + err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, + sbi->s_snapshot_cno); + if (err < 0) + goto failed_sbi; + if (!err) { + printk(KERN_ERR + "NILFS: The specified checkpoint is " + "not a snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)sbi->s_snapshot_cno); + err = -EINVAL; + goto failed_sbi; + } + cno = sbi->s_snapshot_cno; + } else + /* Read-only mount */ + sbi->s_snapshot_cno = cno; + } + + err = nilfs_attach_checkpoint(sbi, cno); + if (err) { + printk(KERN_ERR "NILFS: error loading a checkpoint" + " (checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_sbi; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto failed_checkpoint; + } + + root = nilfs_iget(sb, NILFS_ROOT_INO); + if (IS_ERR(root)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + err = PTR_ERR(root); + goto failed_segctor; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + err = -EINVAL; + goto failed_segctor; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + printk(KERN_ERR "NILFS: get root dentry failed\n"); + err = -ENOMEM; + goto failed_segctor; + } + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + } + + err = nilfs_mark_recovery_complete(sbi); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: recovery failed.\n"); + goto failed_root; + } + + return 0; + + failed_root: + dput(sb->s_root); + sb->s_root = NULL; + + failed_segctor: + nilfs_detach_segment_constructor(sbi); + + failed_checkpoint: + nilfs_detach_checkpoint(sbi); + + failed_sbi: + put_nilfs(nilfs); + sb->s_fs_info = NULL; + kfree(sbi); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_super_block *sbp; + struct the_nilfs *nilfs = sbi->s_nilfs; + unsigned long old_sb_flags; + struct nilfs_mount_options old_opts; + int err; + + old_sb_flags = sb->s_flags; + old_opts.mount_opt = sbi->s_mount_opt; + old_opts.snapshot_cno = sbi->s_snapshot_cno; + + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + + if ((*flags & MS_RDONLY) && + sbi->s_snapshot_cno != old_opts.snapshot_cno) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount to a different snapshot. \n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down the segment constructor */ + nilfs_detach_segment_constructor(sbi); + sb->s_flags |= MS_RDONLY; + + sbi->s_snapshot_cno = nilfs_last_cno(nilfs); + /* nilfs_set_opt(sbi, SNAPSHOT); */ + + /* + * Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + sbp = nilfs->ns_sbp[0]; + if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && + (nilfs->ns_mount_state & NILFS_VALID_FS)) + sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); + sbp->s_mtime = cpu_to_le64(get_seconds()); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } else { + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) + */ + down(&sb->s_bdev->bd_mount_sem); + /* Check existing RW-mount */ + if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because a RW-mount exists.\n", + sb->s_id); + err = -EBUSY; + goto rw_remount_failed; + } + if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the current RO-mount is not " + "the latest one.\n", + sb->s_id); + err = -EINVAL; + goto rw_remount_failed; + } + sb->s_flags &= ~MS_RDONLY; + nilfs_clear_opt(sbi, SNAPSHOT); + sbi->s_snapshot_cno = 0; + + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto rw_remount_failed; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + + up(&sb->s_bdev->bd_mount_sem); + } + out: + return 0; + + rw_remount_failed: + up(&sb->s_bdev->bd_mount_sem); + restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.mount_opt; + sbi->s_snapshot_cno = old_opts.snapshot_cno; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int option, token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) + ret++; + else { + ret = match_int(&args[0], &option); + if (!ret) { + if (option > 0) + sd->cno = option; + else + ret++; + } + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + s->s_bdev = sd->bdev; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + return s->s_bdev == sd->bdev; +} + +static int nilfs_test_bdev_super2(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + + if (!((s->s_flags | sd->flags) & MS_RDONLY)) + return 1; /* Reuse an old R/W-mode super_block */ + + if (s->s_flags & sd->flags & MS_RDONLY) { + if (down_read_trylock(&s->s_umount)) { + ret = s->s_root && + (sd->cno == NILFS_SB(s)->s_snapshot_cno); + up_read(&s->s_umount); + /* + * This path is locked with sb_lock by sget(). + * So, drop_super() causes deadlock. + */ + return ret; + } + } + return 0; +} + +static int +nilfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct nilfs_super_data sd; + struct super_block *s, *s2; + struct the_nilfs *nilfs = NULL; + int err, need_to_close = 1; + + sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); + if (IS_ERR(sd.bdev)) + return PTR_ERR(sd.bdev); + + /* + * To get mount instance using sget() vfs-routine, NILFS needs + * much more information than normal filesystems to identify mount + * instance. For snapshot mounts, not only a mount type (ro-mount + * or rw-mount) but also a checkpoint number is required. + * The results are passed in sget() using nilfs_super_data. + */ + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + down(&sd.bdev->bd_mount_sem); + if (!sd.cno && + (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { + err = (err < 0) ? : -EBUSY; + goto failed_unlock; + } + + /* + * Phase-1: search any existent instance and get the_nilfs + */ + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); + if (IS_ERR(s)) + goto error_s; + + if (!s->s_root) { + err = -ENOMEM; + nilfs = alloc_nilfs(sd.bdev); + if (!nilfs) + goto cancel_new; + } else { + struct nilfs_sb_info *sbi = NILFS_SB(s); + + /* + * s_umount protects super_block from unmount process; + * It covers pointers of nilfs_sb_info and the_nilfs. + */ + nilfs = sbi->s_nilfs; + get_nilfs(nilfs); + up_write(&s->s_umount); + + /* + * Phase-2: search specified snapshot or R/W mode super_block + */ + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); + + s2 = sget(fs_type, nilfs_test_bdev_super2, + nilfs_set_bdev_super, &sd); + deactivate_super(s); + /* + * Although deactivate_super() invokes close_bdev_exclusive() at + * kill_block_super(). Here, s is an existent mount; we need + * one more close_bdev_exclusive() call. + */ + s = s2; + if (IS_ERR(s)) + goto error_s; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); + if (err) + goto cancel_new; + + s->s_flags |= MS_ACTIVE; + need_to_close = 0; + } else if (!(s->s_flags & MS_RDONLY)) { + err = -EBUSY; + } + + up(&sd.bdev->bd_mount_sem); + put_nilfs(nilfs); + if (need_to_close) + close_bdev_exclusive(sd.bdev, flags); + simple_set_mnt(mnt, s); + return 0; + + error_s: + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + close_bdev_exclusive(sd.bdev, flags); + return PTR_ERR(s); + + failed_unlock: + up(&sd.bdev->bd_mount_sem); + failed: + close_bdev_exclusive(sd.bdev, flags); + + return err; + + cancel_new: + /* Abandoning the newly allocated superblock */ + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + up_write(&s->s_umount); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive(). + * We must finish all post-cleaning before this call; + * put_nilfs() and unlocking bd_mount_sem need the block device. + */ + return err; +} + +static int nilfs_test_bdev_super3(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + if (down_read_trylock(&s->s_umount)) { + ret = (s->s_flags & MS_RDONLY) && s->s_root && + nilfs_test_opt(NILFS_SB(s), SNAPSHOT); + up_read(&s->s_umount); + if (ret) + return 0; /* ignore snapshot mounts */ + } + return !((sd->flags ^ s->s_flags) & MS_RDONLY); +} + +static int __false_bdev_super(struct super_block *s, void *data) +{ +#if 0 /* XXX: workaround for lock debug. This is not good idea */ + up_write(&s->s_umount); +#endif + return -EFAULT; +} + +/** + * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. + * fs_type: filesystem type + * bdev: block device + * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount) + * res: pointer to an integer to store result + * + * This function must be called within a section protected by bd_mount_mutex. + */ +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags) +{ + struct super_block *s; + struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; + + s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); + if (IS_ERR(s)) { + if (PTR_ERR(s) != -EFAULT) + return PTR_ERR(s); + return 0; /* Not found */ + } + up_write(&s->s_umount); + deactivate_super(s); + return 1; /* Found */ +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .get_sb = nilfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_inode_cache(); + if (err) + goto failed; + + err = nilfs_init_transaction_cache(); + if (err) + goto failed_inode_cache; + + err = nilfs_init_segbuf_cache(); + if (err) + goto failed_transaction_cache; + + err = nilfs_btree_path_cache_init(); + if (err) + goto failed_segbuf_cache; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto failed_btree_path_cache; + + return 0; + + failed_btree_path_cache: + nilfs_btree_path_cache_destroy(); + + failed_segbuf_cache: + nilfs_destroy_segbuf_cache(); + + failed_transaction_cache: + nilfs_destroy_transaction_cache(); + + failed_inode_cache: + nilfs_destroy_inode_cache(); + + failed: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_segbuf_cache(); + nilfs_destroy_transaction_cache(); + nilfs_destroy_inode_cache(); + nilfs_btree_path_cache_destroy(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c new file mode 100644 index 000000000000..33400cf0bbe2 --- /dev/null +++ b/fs/nilfs2/the_nilfs.c @@ -0,0 +1,637 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "seglist.h" +#include "segbuf.h" + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate the_nilfs structure + * @bdev: block device to which the_nilfs is related + * + * alloc_nilfs() allocates memory for the_nilfs and + * initializes its reference count and locks. + * + * Return Value: On success, pointer to the_nilfs is returned. + * On error, NULL is returned. + */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_count, 1); + atomic_set(&nilfs->ns_writer_refcount, -1); + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_writer_mutex); + INIT_LIST_HEAD(&nilfs->ns_supers); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_gc_inodes_h = NULL; + init_rwsem(&nilfs->ns_segctor_sem); + + return nilfs; +} + +/** + * put_nilfs - release a reference to the_nilfs + * @nilfs: the_nilfs structure to be released + * + * put_nilfs() decrements a reference counter of the_nilfs. + * If the reference count reaches zero, the_nilfs is freed. + */ +void put_nilfs(struct the_nilfs *nilfs) +{ + if (!atomic_dec_and_test(&nilfs->ns_count)) + return; + /* + * Increment of ns_count never occur below because the caller + * of get_nilfs() holds at least one reference to the_nilfs. + * Thus its exclusion control is not required here. + */ + might_sleep(); + if (nilfs_loaded(nilfs)) { + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_dat); + /* XXX: how and when to clear nilfs->ns_gc_dat? */ + nilfs_mdt_destroy(nilfs->ns_gc_dat); + } + if (nilfs_init(nilfs)) { + nilfs_destroy_gccache(nilfs); + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, sector_t sr_block) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + err = -ENOMEM; + nilfs->ns_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_dat)) + goto failed; + + nilfs->ns_gc_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_gc_dat)) + goto failed_dat; + + nilfs->ns_cpfile = nilfs_mdt_new( + nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + if (unlikely(!nilfs->ns_cpfile)) + goto failed_gc_dat; + + nilfs->ns_sufile = nilfs_mdt_new( + nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + if (unlikely(!nilfs->ns_sufile)) + goto failed_cpfile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); + nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, + sizeof(struct nilfs_cpfile_header)); + nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + return err; + + failed_sufile: + nilfs_mdt_destroy(nilfs->ns_sufile); + + failed_cpfile: + nilfs_mdt_destroy(nilfs->ns_cpfile); + + failed_gc_dat: + nilfs_mdt_destroy(nilfs->ns_gc_dat); + + failed_dat: + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be released + * @sbi: nilfs_sb_info used to recover past segment + * + * load_nilfs() searches and load the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must call this exclusively for simultaneous mounts. + */ +int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sbi->s_super->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + unsigned valid_fs; + int err = 0; + + nilfs_init_recovery_info(&ri); + + down_write(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_write(&nilfs->ns_sem); + + if (!valid_fs && (s_flags & MS_RDONLY)) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed; + } + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + sbi->s_super->s_flags &= ~MS_RDONLY; + } + + err = nilfs_search_super_root(nilfs, sbi, &ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + } + + err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (!valid_fs) { + err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + if (unlikely(err)) { + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; + } + if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) + sbi->s_super->s_dirt = 1; + } + + set_nilfs_loaded(nilfs); + + failed: + nilfs_clear_recovery_info(&ri); + sbi->s_super->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { + printk(KERN_ERR "NILFS: revision mismatch " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment. \n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + nilfs->ns_nrsvsegs = + max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nilfs->ns_nsegments * + nilfs->ns_r_segments_percentage, 100)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock\n"); + } else if (!sbp[1]) + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock\n"); + + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && + (!valid[0] || + le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (swp) { + printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock.\n"); + nilfs_swap_super_block(nilfs); + } + + nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sbi: nilfs_sb_info + * @sb: super block + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). It takes on some portion of the jobs + * typically done by a fill_super() routine. This division arises from + * the nature that multiple NILFS instances may be simultaneously + * mounted on a device. + * For multiple mounts on the same device, only the first mount + * invokes these tasks. + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. + */ +int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) +{ + struct super_block *sb = sbi->s_super; + struct nilfs_super_block *sbp; + struct backing_dev_info *bdi; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + if (nilfs_init(nilfs)) { + /* Load values from existing the_nilfs */ + sbp = nilfs->ns_sbp[0]; + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto out; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + printk(KERN_ERR "NILFS: blocksize %d unfit to device\n", + blocksize); + err = -EINVAL; + } + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + goto out; + } + + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails. */ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; + if (!bdi) + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + nilfs->ns_bdi = bdi ? : &default_backing_dev_info; + + /* Finding last segment */ + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + err = -EINVAL; + goto failed_sbh; + } + /* Dummy values */ + nilfs->ns_free_segments_count = + nilfs->ns_nsegments - (nilfs->ns_segnum + 1); + + /* Initialize gcinode cache */ + err = nilfs_init_gccache(nilfs); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + unsigned long ncleansegs; + int err; + + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (likely(!err)) + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return err; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + struct inode *sufile = nilfs->ns_sufile; + unsigned long ncleansegs, nincsegs; + int ret; + + ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); + if (likely(!ret)) { + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) + ret++; + } + return ret; +} + +int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, + int snapshot_mount) +{ + struct nilfs_sb_info *sbi; + int ret = 0; + + down_read(&nilfs->ns_sem); + if (cno == 0 || cno > nilfs->ns_cno) + goto out_unlock; + + list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { + if (sbi->s_snapshot_cno == cno && + (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { + /* exclude read-only mounts */ + ret++; + break; + } + } + /* for protecting recent checkpoints */ + if (cno >= nilfs_last_cno(nilfs)) + ret++; + + out_unlock: + up_read(&nilfs->ns_sem); + return ret; +} diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h new file mode 100644 index 000000000000..30fe58778d05 --- /dev/null +++ b/fs/nilfs2/the_nilfs.h @@ -0,0 +1,298 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include "sb.h" + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_LOADED, /* Roll-back/roll-forward has done and + the latest checkpoint was loaded */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_count: reference count + * @ns_bdev: block device + * @ns_bdi: backing dev info + * @ns_writer: back pointer to writable nilfs_sb_info + * @ns_sem: semaphore for shared states + * @ns_writer_mutex: mutex protecting ns_writer attach/detach + * @ns_writer_refcount: number of referrers on ns_writer + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super blocks + * @ns_sbsize: size of valid data in super block + * @ns_supers: list of nilfs super block structs + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment. + * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: Number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_free_segments_count: counter of free segments + * @ns_segctor_sem: segment constructor semaphore + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_gc_dat: shadow inode of the DAT file inode for GC + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks + * @ns_blocksize_bits: bit length of block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + */ +struct the_nilfs { + unsigned long ns_flags; + atomic_t ns_count; + + struct block_device *ns_bdev; + struct backing_dev_info *ns_bdi; + struct nilfs_sb_info *ns_writer; + struct rw_semaphore ns_sem; + struct mutex ns_writer_mutex; + atomic_t ns_writer_refcount; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. + * - protecting s_dirt in the super_block struct + * (see nilfs_write_super) and the following fields. + */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime[2]; + unsigned ns_sbsize; + unsigned ns_mount_state; + struct list_head ns_supers; + + /* + * Following fields are dedicated to a writable FS-instance. + * Except for the period seeking checkpoint, code outside the segment + * constructor must lock a segment semaphore while accessing these + * fields. + * The writable FS-instance is sole during a lifetime of the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + unsigned long ns_free_segments_count; + + struct rw_semaphore ns_segctor_sem; + + /* + * Following fields are lock free except for the period before + * the_nilfs is initialized. + */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + struct inode *ns_gc_dat; + + /* GC inode list and hash table head */ + struct list_head ns_gc_inodes; + struct hlist_head *ns_gc_inodes_h; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; +}; + +#define NILFS_GCINODE_HASH_BITS 8 +#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS) + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(LOADED, loaded) +THE_NILFS_FNS(DISCONTINUED, discontinued) + +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 +#define NILFS_ALTSB_FREQ 60 /* spare superblock */ + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *); +void put_nilfs(struct the_nilfs *); +int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); +int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void get_nilfs(struct the_nilfs *nilfs) +{ + /* Caller must have at least one reference of the_nilfs. */ + atomic_inc(&nilfs->ns_count); +} + +static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) +{ + if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) + mutex_lock(&nilfs->ns_writer_mutex); + return nilfs->ns_writer; +} + +static inline void nilfs_put_writer(struct the_nilfs *nilfs) +{ + if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + nilfs->ns_writer = sbi; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + if (sbi == nilfs->ns_writer) + nilfs->ns_writer = NULL; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +#endif /* _THE_NILFS_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a5887df2cd8a..8672b9536039 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - inode_double_lock(inode, pipe->inode); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { @@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, goto out_unlock; } + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); out_unlock: ocfs2_rw_unlock(inode, 1); out: - inode_double_unlock(inode, pipe->inode); + mutex_unlock(&inode->i_mutex); mlog_exit(ret); return ret; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b0ae0be4801f..39e4ad4f59f4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct file *file = vma->vm_file; int flags = vma->vm_flags; unsigned long ino = 0; + unsigned long long pgoff = 0; dev_t dev = 0; int len; @@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", @@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); /* diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 863464d5519c..64a72e2e7650 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -126,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct file *file; dev_t dev = 0; int flags, len; + unsigned long long pgoff = 0; flags = vma->vm_flags; file = vma->vm_file; @@ -134,6 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } seq_printf(m, @@ -144,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a404fb88e456..3a6b193d8444 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) save_mount_options(sb, data); fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; if (!fsi) { err = -ENOMEM; goto fail; } - sb->s_fs_info = fsi; err = ramfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = RAMFS_MAGIC; - sb->s_op = &ramfs_ops; - sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; @@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) } root = d_alloc_root(inode); + sb->s_root = root; if (!root) { err = -ENOMEM; goto fail; } - sb->s_root = root; + return 0; fail: kfree(fsi); + sb->s_fs_info = NULL; iput(inode); return err; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 1e548a4975ba..10ca7d984a8b 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -408,12 +408,17 @@ static void romfs_destroy_inode(struct inode *inode) */ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; buf->f_bsize = ROMBSIZE; buf->f_bfree = buf->f_bavail = buf->f_ffree; buf->f_blocks = (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } diff --git a/fs/splice.c b/fs/splice.c index dd727d43e5b7..c18aa7e03e2b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -737,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * ->write_end. Most of the time, these expect i_mutex to * be held. Since this may result in an ABBA deadlock with * pipe->inode, we have to order lock acquiry here. + * + * Outer lock must be inode->i_mutex, as pipe_wait() will + * release and reacquire pipe->inode->i_mutex, AND inode must + * never be a pipe. */ - inode_double_lock(inode, pipe->inode); + WARN_ON(S_ISFIFO(inode->i_mode)); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = __splice_from_pipe(pipe, &sd, actor); - inode_double_unlock(inode, pipe->inode); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + mutex_unlock(&inode->i_mutex); return ret; } @@ -831,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - inode_double_lock(inode, pipe->inode); + WARN_ON(S_ISFIFO(inode->i_mode)); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); ret = file_remove_suid(out); - if (likely(!ret)) + if (likely(!ret)) { + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - inode_double_unlock(inode, pipe->inode); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + } + mutex_unlock(&inode->i_mutex); if (ret > 0) { unsigned long nr_pages; diff --git a/fs/super.c b/fs/super.c index 77cb4ec919b9..786fe7d72790 100644 --- a/fs/super.c +++ b/fs/super.c @@ -771,6 +771,46 @@ void kill_litter_super(struct super_block *sb) EXPORT_SYMBOL(kill_litter_super); +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + up_write(&sb->s_umount); + deactivate_super(sb); + return err; + } + + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + return 0; +} + +EXPORT_SYMBOL(get_sb_ns); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { |