diff options
Diffstat (limited to 'fs')
308 files changed, 9702 insertions, 4556 deletions
diff --git a/fs/Makefile b/fs/Makefile index 4030cbfbc9af..90c88529892b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o + stack.o fs_struct.o statfs.o fs_pin.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index c770337c4b45..24575d9d882d 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -153,6 +153,7 @@ extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigne extern unsigned int adfs_map_free(struct super_block *sb); /* Misc */ +__printf(3, 4) void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...); #define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 0d138c0de293..51c279a29845 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -138,7 +138,7 @@ adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_inf goto out; if (ADFS_I(inode)->parent_id != dir.parent_id) { - adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n", + adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n", ADFS_I(inode)->parent_id, dir.parent_id); ret = -EIO; goto free_out; diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index d9e3bee4e653..f2ba88ab4aed 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -55,10 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct } size >>= sb->s_blocksize_bits; - if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) { + if (size > ARRAY_SIZE(dir->bh)) { /* this directory is too big for fixed bh set, must allocate */ struct buffer_head **bh_fplus = - kzalloc(size * sizeof(struct buffer_head *), + kcalloc(size, sizeof(struct buffer_head *), GFP_KERNEL); if (!bh_fplus) { adfs_error(sb, "not enough memory for" @@ -79,9 +79,8 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct dir->bh_fplus[blk] = sb_bread(sb, block); if (!dir->bh_fplus[blk]) { - adfs_error(sb, "dir object %X failed read for" - " offset %d, mapped block %X", - id, blk, block); + adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", + id, blk, block); goto out; } diff --git a/fs/afs/main.c b/fs/afs/main.c index 42dd2e499ed8..35de0c04729f 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void) afs_uuid.time_low = uuidtime; afs_uuid.time_mid = uuidtime >> 32; afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; - afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; + afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME; get_random_bytes(&clockseq, 2); afs_uuid.clock_seq_low = clockseq; afs_uuid.clock_seq_hi_and_reserved = (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; - afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; + afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD; _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", afs_uuid.time_low, @@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work) aio_free_ring(ctx); free_percpu(ctx->cpu); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } @@ -715,8 +717,8 @@ err_ctx: err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); - free_percpu(ctx->reqs.pcpu_count); - free_percpu(ctx->users.pcpu_count); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -830,16 +832,20 @@ void exit_aio(struct mm_struct *mm) static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } + local_irq_restore(flags); preempt_enable(); } @@ -847,10 +853,12 @@ static bool get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -869,6 +877,7 @@ static bool get_reqs_available(struct kioctx *ctx) ret = true; kcpu->reqs_available--; out: + local_irq_restore(flags); preempt_enable(); return ret; } @@ -1021,6 +1030,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); + put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test @@ -1062,6 +1072,9 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) goto out; + head %= ctx->nr_events; + tail %= ctx->nr_events; + while (ret < nr) { long avail; struct io_event *ev; @@ -1100,8 +1113,6 @@ static long aio_read_events_ring(struct kioctx *ctx, flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); - - put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index acf32054edd8..9e359fb20c0a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -143,20 +143,6 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } -/* Does a dentry have some pending activity? */ -static inline int autofs4_ispending(struct dentry *dentry) -{ - struct autofs_info *inf = autofs4_dentry_ino(dentry); - - if (inf->flags & AUTOFS_INF_PENDING) - return 1; - - if (inf->flags & AUTOFS_INF_EXPIRING) - return 1; - - return 0; -} - struct inode *autofs4_get_inode(struct super_block *, umode_t); void autofs4_free_ino(struct autofs_info *); @@ -191,55 +177,6 @@ extern const struct file_operations autofs4_root_operations; extern const struct dentry_operations autofs4_dentry_operations; /* VFS automount flags management functions */ - -static inline void __managed_dentry_set_automount(struct dentry *dentry) -{ - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; -} - -static inline void managed_dentry_set_automount(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_set_automount(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_clear_automount(struct dentry *dentry) -{ - dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; -} - -static inline void managed_dentry_clear_automount(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_automount(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_set_transit(struct dentry *dentry) -{ - dentry->d_flags |= DCACHE_MANAGE_TRANSIT; -} - -static inline void managed_dentry_set_transit(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_set_transit(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_clear_transit(struct dentry *dentry) -{ - dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; -} - -static inline void managed_dentry_clear_transit(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_transit(dentry); - spin_unlock(&dentry->d_lock); -} - static inline void __managed_dentry_set_managed(struct dentry *dentry) { dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 394e90b02c5e..a7be57e39be7 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -333,7 +333,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, if (ino->flags & AUTOFS_INF_PENDING) goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { - struct autofs_info *ino = autofs4_dentry_ino(root); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d7bd395ab586..1c55388ae633 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int pgrp; + int pgrp = 0; bool pgrp_set = false; int ret = -EINVAL; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index cc87c1abac97..cdb25ebccc4c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -166,8 +166,10 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&sbi->lookup_lock); head = &sbi->active_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); list_for_each(p, head) { struct autofs_info *ino; struct dentry *active; @@ -218,8 +220,10 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); list_for_each(p, head) { struct autofs_info *ino; struct dentry *expiring; @@ -373,7 +377,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * this because the leaves of the directory tree under the * mount never trigger mounts themselves (they have an autofs * trigger mount mounted on them). But v4 pseudo direct mounts - * do need the leaves to to trigger mounts. In this case we + * do need the leaves to trigger mounts. In this case we * have no choice but to use the list_empty() check and * require user space behave. */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 7c93953030fb..afd2b4408adf 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, return -EIO; } -static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { return -EIO; } @@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops = .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, - .rename = bad_inode_rename, + .rename2 = bad_inode_rename2, .readlink = bad_inode_readlink, /* follow_link must be no-op, otherwise unmounting this inode won't work */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a16fbd4e8241..4cf61ec6b7a8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -799,13 +799,11 @@ befs_fill_super(struct super_block *sb, void *data, int silent) befs_debug(sb, "---> %s", __func__); -#ifndef CONFIG_BEFS_RW if (!(sb->s_flags & MS_RDONLY)) { befs_warning(sb, "No write support. Marking filesystem read-only"); sb->s_flags |= MS_RDONLY; } -#endif /* CONFIG_BEFS_RW */ /* * Set dummy blocksize to read super block. @@ -834,16 +832,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent) (befs_super_block *) ((void *) bh->b_data + x86_sb_off); } - if (befs_load_sb(sb, disk_sb) != BEFS_OK) + if ((befs_load_sb(sb, disk_sb) != BEFS_OK) || + (befs_check_sb(sb) != BEFS_OK)) goto unacquire_bh; befs_dump_super_block(sb, disk_sb); brelse(bh); - if (befs_check_sb(sb) != BEFS_OK) - goto unacquire_priv_sbp; - if( befs_sb->num_blocks > ~((sector_t)0) ) { befs_error(sb, "blocks count: %llu " "is larger than the host can use", diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index f7f87e233dd9..f40006db36df 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -46,6 +46,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode) /* inode.c */ extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); +extern void bfs_dump_imap(const char *, struct super_block *); /* file.c */ extern const struct inode_operations bfs_file_inops; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index a399e6d9dc74..08063ae0a17c 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,8 +75,6 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -extern void dump_imap(const char *, struct super_block *); - static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -110,7 +108,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, BFS_I(inode)->i_eblock = 0; insert_inode_hash(inode); mark_inode_dirty(inode); - dump_imap("create", s); + bfs_dump_imap("create", s); err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 7041ac35ace8..90bc079d9982 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -30,8 +30,6 @@ MODULE_LICENSE("GPL"); #define dprintf(x...) #endif -void dump_imap(const char *prefix, struct super_block *s); - struct inode *bfs_iget(struct super_block *sb, unsigned long ino) { struct bfs_inode *di; @@ -194,7 +192,7 @@ static void bfs_evict_inode(struct inode *inode) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); - dump_imap("delete_inode", s); + bfs_dump_imap("delete_inode", s); } /* @@ -297,7 +295,7 @@ static const struct super_operations bfs_sops = { .statfs = bfs_statfs, }; -void dump_imap(const char *prefix, struct super_block *s) +void bfs_dump_imap(const char *prefix, struct super_block *s) { #ifdef DEBUG int i; @@ -443,7 +441,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) } brelse(bh); brelse(sbh); - dump_imap("read_super", s); + bfs_dump_imap("read_super", s); return 0; out3: diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 92371c414228..1daea0b47187 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2af6e66fe788..eea26e1b2fda 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); @@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8bb4aa19898f..08e65e9cf2aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -369,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); - btrfs_tree_read_unlock_blocking(eb); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -2904,7 +2905,9 @@ retry_root_backup: if (ret) goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99c253918208..813537f362f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5678,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; down_write(&fs_info->commit_root_sem); @@ -5701,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->commit_root_sem); - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); - update_global_block_rsv(fs_info); } @@ -5741,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a389820d158b..3e11aab9f391 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3437,16 +3437,10 @@ done_unlocked: return 0; } -static int eb_wait(void *word) -{ - io_schedule(); - return 0; -} - void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { - wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); } static noinline_for_stack int diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3668048e16f8..3183742d6f0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8476,6 +8476,16 @@ out_notrans: return ret; } +static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; @@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, - .rename = btrfs_rename, + .rename2 = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0d321c23069a..47aceb494d1d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -136,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* @@ -3139,7 +3142,6 @@ out: static void clone_update_extent_map(struct inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, const u64 hole_offset, const u64 hole_len) { @@ -3154,7 +3156,11 @@ static void clone_update_extent_map(struct inode *inode, return; } - if (fi) { + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); btrfs_extent_item_to_extent_map(inode, path, fi, false, em); em->generation = -1; if (btrfs_file_extent_type(path->nodes[0], fi) == @@ -3508,18 +3514,15 @@ process_slot: btrfs_item_ptr_offset(leaf, slot), size); inode_add_bytes(inode, datal); - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); } /* If we have an implicit hole (NO_HOLES feature). */ if (drop_start < new_key.offset) clone_update_extent_map(inode, trans, - path, NULL, drop_start, + NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, - extent, 0, 0); + clone_update_extent_map(inode, trans, path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3562,12 +3565,10 @@ process_slot: btrfs_end_transaction(trans, root); goto out; } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen); - if (ret) - goto out; - clone_update_extent_map(inode, trans, path, NULL, last_dest_end, - destoff + len - last_dest_end); } out: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e12441c7cf1d..7187b14faa6c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -484,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) log_list); list_del_init(&ordered->log_list); spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6efd70d3b64f..9626b4ad3b9a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4055291a523e..4a88f073fdd7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4662d92a4b73..67b48b9a03e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -522,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_ssd_spread: btrfs_set_and_info(root, SSD_SPREAD, "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_clear_and_info(root, NOSSD, + btrfs_set_and_info(root, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; @@ -850,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -921,13 +921,7 @@ setup_root: return dget(sb->s_root); } - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_DISCONNECTED; - spin_unlock(&dentry->d_lock); - } - return dentry; + return d_obtain_root(inode); } static int btrfs_fill_super(struct super_block *sb, @@ -1467,7 +1461,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; @@ -1808,6 +1804,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index df39458f1487..78699364f537 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -605,14 +605,37 @@ static void init_feature_attrs(void) } } -static int add_device_membership(struct btrfs_fs_info *fs_info) +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *dev; - fs_info->device_dir_kobj = kobject_create_and_add("devices", + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) return -ENOMEM; @@ -623,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) if (!dev->bdev) continue; + if (one_device && one_device != dev) + continue; + disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; @@ -666,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = add_device_membership(fs_info); + error = btrfs_kobj_add_device(fs_info, NULL); if (error) goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9ab576318a84..ac46df37504c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 511839c04f11..5f379affdf23 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -386,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info && - current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -491,6 +493,7 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } @@ -1615,11 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ if (ret) return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c83b24251e53..6cb82f62cb7c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -1677,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev == root->fs_info->fs_devices->latest_bdev) root->fs_info->fs_devices->latest_bdev = next_device->bdev; - if (device->bdev) + if (device->bdev) { device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + } call_rcu(&device->rcu, free_device); @@ -2143,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2156,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { @@ -2205,6 +2224,7 @@ error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 4f196314c0c1..b67d8fc81277 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -EIO; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this diff --git a/fs/buffer.c b/fs/buffer.c index eba6e4f621ce..8f05111bbb8b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -61,16 +61,9 @@ inline void touch_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(touch_buffer); -static int sleep_on_buffer(void *word) -{ - io_schedule(); - return 0; -} - void __lock_buffer(struct buffer_head *bh) { - wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, - TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); @@ -123,7 +116,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback); */ void __wait_on_buffer(struct buffer_head * bh) { - wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 469f2e8657e8..cebf2ebefb55 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -172,14 +172,24 @@ out: int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; + umode_t new_mode = inode->i_mode; int error; - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + error = posix_acl_create(dir, &new_mode, &default_acl, &acl); if (error) return error; - if (!default_acl && !acl) + if (!default_acl && !acl) { cache_no_acl(inode); + if (new_mode != inode->i_mode) { + struct iattr newattrs = { + .ia_mode = new_mode, + .ia_valid = ATTR_MODE, + }; + error = ceph_setattr(dentry, &newattrs); + } + return error; + } if (default_acl) { error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1fde164b74b5..6d1cd45dca89 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3277,7 +3277,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); - rel->issue_seq = cpu_to_le32(cap->issue_seq), + rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 302085100c28..2eb02f80a0ab 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -423,6 +423,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + if (!len) + return 0; /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, @@ -470,8 +473,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, size_t left = ret; while (left) { - int copy = min_t(size_t, PAGE_SIZE, left); - l = copy_page_to_iter(pages[k++], 0, copy, i); + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, + PAGE_SIZE - page_off, left); + l = copy_page_to_iter(pages[k++], page_off, + copy, i); off += l; left -= l; if (l < copy) @@ -531,7 +537,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -547,7 +553,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - loff_t pos = iocb->ki_pos; size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -646,7 +651,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) +static ssize_t +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -663,7 +669,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - loff_t pos = iocb->ki_pos; size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -918,9 +923,9 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (file->f_flags & O_DIRECT) - written = ceph_sync_direct_write(iocb, &data); + written = ceph_sync_direct_write(iocb, &data, pos); else - written = ceph_sync_write(iocb, &data); + written = ceph_sync_write(iocb, &data, pos); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1177,6 +1182,9 @@ static long ceph_fallocate(struct file *file, int mode, loff_t endoff = 0; loff_t size; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 92a2548278fc..bad07c09f91e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1904,6 +1904,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (req->r_got_unsafe) { + void *p; /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. @@ -1924,8 +1925,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* remove cap/dentry releases from message */ rhead->num_releases = 0; - msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); - msg->front.iov_len = req->r_request_release_offset; + + /* time stamp */ + p = msg->front.iov_base + req->r_request_release_offset; + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return 0; } @@ -2061,11 +2067,12 @@ static void __wake_requests(struct ceph_mds_client *mdsc, static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_mds_request *req; - struct rb_node *p; + struct rb_node *p = rb_first(&mdsc->request_tree); dout("kick_requests mds%d\n", mds); - for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); if (req->r_got_unsafe) continue; if (req->r_session && @@ -2248,6 +2255,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); + req->r_resend_mds = -1; if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 06150fd745ac..f6e12377335c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, goto out; } } else { - root = d_obtain_alias(inode); + root = d_obtain_root(inode); } ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index c9c2b887381e..12f58d22e017 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -592,12 +592,12 @@ start: xattr_version = ci->i_xattrs.version; spin_unlock(&ci->i_ceph_lock); - xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), GFP_NOFS); err = -ENOMEM; if (!xattrs) goto bad_lock; - memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); + for (i = 0; i < numattr; i++) { xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index f3ac4154cbb6..44ec72684df5 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) tcon->nativeFileSystem); } seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" - "\n\tPathComponentMax: %d Status: 0x%d", + "\n\tPathComponentMax: %d Status: %d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0227b45ef00a..15e9505aa35f 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -290,7 +290,8 @@ int cifsConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapChars) { - int i, j, charlen; + int i, charlen; + int j = 0; char src_char; __le16 dst_char; wchar_t tmp; @@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, if (!mapChars) return cifs_strtoUTF16(target, source, PATH_MAX, cp); - for (i = 0, j = 0; i < srclen; j++) { + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; switch (src_char) { case 0: - put_unaligned(0, &target[j]); goto ctoUTF16_out; case ':': dst_char = cpu_to_le16(UNI_COLON); @@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, } ctoUTF16_out: + put_unaligned(0, &target[j]); /* Null terminate target unicode string */ return j; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2c90d07c0b3a..ac4f260155c8 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -725,6 +725,19 @@ out_nls: goto out; } +static ssize_t +cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t rc; + struct inode *inode = file_inode(iocb->ki_filp); + + rc = cifs_revalidate_mapping(inode); + if (rc) + return rc; + + return generic_file_read_iter(iocb, iter); +} + static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -835,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = { .link = cifs_hardlink, .mkdir = cifs_mkdir, .rmdir = cifs_rmdir, - .rename = cifs_rename, + .rename2 = cifs_rename2, .permission = cifs_permission, /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, @@ -881,7 +894,7 @@ const struct inode_operations cifs_symlink_inode_ops = { const struct file_operations cifs_file_ops = { .read = new_sync_read, .write = new_sync_write, - .read_iter = generic_file_read_iter, + .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, @@ -939,7 +952,7 @@ const struct file_operations cifs_file_direct_ops = { const struct file_operations cifs_file_nobrl_ops = { .read = new_sync_read, .write = new_sync_write, - .read_iter = generic_file_read_iter, + .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 70f178a7c759..b0fafa499505 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename(struct inode *, struct dentry *, struct inode *, - struct dentry *); +extern int cifs_rename2(struct inode *, struct dentry *, struct inode *, + struct dentry *, unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.03" +#define CIFS_VERSION "2.04" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index de6aed8c78e5..0012e1e291d4 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -404,6 +404,11 @@ struct smb_version_operations { const struct cifs_fid *, u32 *); int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, int); + /* writepages retry size */ + unsigned int (*wp_retry_size)(struct inode *); + /* get mtu credits */ + int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, + unsigned int *, unsigned int *); }; struct smb_version_values { @@ -640,6 +645,16 @@ add_credits(struct TCP_Server_Info *server, const unsigned int add, } static inline void +add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add, + const int optype) +{ + if (add) { + server->ops->add_credits(server, add, optype); + wake_up(&server->request_q); + } +} + +static inline void set_credits(struct TCP_Server_Info *server, const int val) { server->ops->set_credits(server, val); @@ -1044,6 +1059,7 @@ struct cifs_readdata { struct address_space *mapping; __u64 offset; unsigned int bytes; + unsigned int got_bytes; pid_t pid; int result; struct work_struct work; @@ -1053,6 +1069,7 @@ struct cifs_readdata { struct kvec iov; unsigned int pagesz; unsigned int tailsz; + unsigned int credits; unsigned int nr_pages; struct page *pages[]; }; @@ -1073,6 +1090,7 @@ struct cifs_writedata { int result; unsigned int pagesz; unsigned int tailsz; + unsigned int credits; unsigned int nr_pages; struct page *pages[]; }; @@ -1398,6 +1416,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define CIFS_OBREAK_OP 0x0100 /* oplock break request */ #define CIFS_NEG_OP 0x0200 /* negotiate request */ #define CIFS_OP_MASK 0x0380 /* mask request type */ +#define CIFS_HAS_CREDITS 0x0400 /* already has credits */ /* Security Flags: indicate type of session setup needed */ #define CIFSSEC_MAY_SIGN 0x00001 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ca7980a1e303..c31ce98c1704 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -36,6 +36,7 @@ extern struct smb_hdr *cifs_buf_get(void); extern void cifs_buf_release(void *); extern struct smb_hdr *cifs_small_buf_get(void); extern void cifs_small_buf_release(void *); +extern void free_rsp_buf(int, void *); extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, struct kvec *iov); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, @@ -89,6 +90,9 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *, struct smb_rqst *); extern int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); +extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server, + unsigned int size, unsigned int *num, + unsigned int *credits); extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */ , const int flags); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6ce4e0954b98..66f65001a6d8 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -196,10 +196,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) if (rc) goto out; - /* - * FIXME: check if wsize needs updated due to negotiated smb buffer - * size shrinking - */ atomic_inc(&tconInfoReconnectCount); /* tell server Unix caps we support */ @@ -1517,7 +1513,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; - rdata->bytes = length; cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", server->total_read, buflen, data_len); @@ -1560,12 +1555,18 @@ cifs_readv_callback(struct mid_q_entry *mid) rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: rdata->result = -EIO; @@ -1734,10 +1735,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if (*buf) { - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); } else if (resp_buf_type != CIFS_NO_BUFFER) { /* return buffer to caller to free */ *buf = iov[0].iov_base; @@ -1899,28 +1897,80 @@ cifs_writedata_release(struct kref *refcount) static void cifs_writev_requeue(struct cifs_writedata *wdata) { - int i, rc; + int i, rc = 0; struct inode *inode = wdata->cfile->dentry->d_inode; struct TCP_Server_Info *server; + unsigned int rest_len; - for (i = 0; i < wdata->nr_pages; i++) { - lock_page(wdata->pages[i]); - clear_page_dirty_for_io(wdata->pages[i]); - } - + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; do { - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } while (rc == -EAGAIN); + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_CACHE_SIZE; + if (!nr_pages) { + rc = -ENOTSUPP; + break; + } + cur_len = nr_pages * PAGE_CACHE_SIZE; + tailsz = PAGE_CACHE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE; + } - for (i = 0; i < wdata->nr_pages; i++) { - unlock_page(wdata->pages[i]); - if (rc != 0) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - page_cache_release(wdata->pages[i]); + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; } - } + + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_CACHE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + wdata2->cfile = find_writable_file(CIFS_I(inode), false); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + break; + } + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, cifs_writedata_release); + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && rc != -EAGAIN) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + page_cache_release(wdata2->pages[j]); + } + } + + if (rc) { + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc == -EAGAIN) + continue; + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages); mapping_set_error(inode->i_mapping, rc); kref_put(&wdata->refcount, cifs_writedata_release); @@ -2203,10 +2253,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, } /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -2451,10 +2498,7 @@ plk_err_exit: if (pSMB) cifs_small_buf_release(pSMB); - if (resp_buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(resp_buf_type, iov[0].iov_base); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -3838,10 +3882,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, } } qsec_out: - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(iov[0].iov_base); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); + free_rsp_buf(buf_type, iov[0].iov_base); /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 20d75b8ddb26..03ed8a09581c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -557,7 +557,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, try_to_freeze(); if (server_unresponsive(server)) { - total_read = -EAGAIN; + total_read = -ECONNABORTED; break; } @@ -571,7 +571,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, break; } else if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - total_read = -EAGAIN; + total_read = -ECONNABORTED; break; } else if (length == -ERESTARTSYS || length == -EAGAIN || @@ -588,7 +588,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, cifs_dbg(FYI, "Received no data or error: expecting %d\n" "got %d", to_read, length); cifs_reconnect(server); - total_read = -EAGAIN; + total_read = -ECONNABORTED; break; } } @@ -786,7 +786,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); - return -EAGAIN; + return -ECONNABORTED; } /* switch to large buffer if too big for a small one */ @@ -3934,13 +3934,6 @@ cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); } -static int -cifs_sb_tcon_pending_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; -} - /* find and return a tlink with given uid */ static struct tcon_link * tlink_rb_search(struct rb_root *root, kuid_t uid) @@ -4039,11 +4032,10 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, - cifs_sb_tcon_pending_wait, TASK_INTERRUPTIBLE); if (ret) { cifs_put_tlink(tlink); - return ERR_PTR(ret); + return ERR_PTR(-ERESTARTSYS); } /* if it's good, return it */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e90a1e9aa627..4ab2f79ffa7a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1670,8 +1670,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, break; } - len = min((size_t)cifs_sb->wsize, - write_size - total_written); + len = min(server->ops->wp_retry_size(dentry->d_inode), + (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; iov[1].iov_len = len; @@ -1878,15 +1878,163 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } +static struct cifs_writedata * +wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, + pgoff_t end, pgoff_t *index, + unsigned int *found_pages) +{ + unsigned int nr_pages; + struct page **pages; + struct cifs_writedata *wdata; + + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); + if (!wdata) + return NULL; + + /* + * find_get_pages_tag seems to return a max of 256 on each + * iteration, so we must call it several times in order to + * fill the array or the wsize is effectively limited to + * 256 * PAGE_CACHE_SIZE. + */ + *found_pages = 0; + pages = wdata->pages; + do { + nr_pages = find_get_pages_tag(mapping, index, + PAGECACHE_TAG_DIRTY, tofind, + pages); + *found_pages += nr_pages; + tofind -= nr_pages; + pages += nr_pages; + } while (nr_pages && tofind && *index <= end); + + return wdata; +} + +static unsigned int +wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, + struct address_space *mapping, + struct writeback_control *wbc, + pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) +{ + unsigned int nr_pages = 0, i; + struct page *page; + + for (i = 0; i < found_pages; i++) { + page = wdata->pages[i]; + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + if (nr_pages == 0) + lock_page(page); + else if (!trylock_page(page)) + break; + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + break; + } + + if (!wbc->range_cyclic && page->index > end) { + *done = true; + unlock_page(page); + break; + } + + if (*next && (page->index != *next)) { + /* Not next consecutive page */ + unlock_page(page); + break; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + break; + } + + /* + * This actually clears the dirty bit in the radix tree. + * See cifs_writepage() for more commentary. + */ + set_page_writeback(page); + if (page_offset(page) >= i_size_read(mapping->host)) { + *done = true; + unlock_page(page); + end_page_writeback(page); + break; + } + + wdata->pages[i] = page; + *next = page->index + 1; + ++nr_pages; + } + + /* reset index to refind any pages skipped */ + if (nr_pages == 0) + *index = wdata->pages[0]->index + 1; + + /* put any pages we aren't going to use */ + for (i = nr_pages; i < found_pages; i++) { + page_cache_release(wdata->pages[i]); + wdata->pages[i] = NULL; + } + + return nr_pages; +} + +static int +wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, + struct address_space *mapping, struct writeback_control *wbc) +{ + int rc = 0; + struct TCP_Server_Info *server; + unsigned int i; + + wdata->sync_mode = wbc->sync_mode; + wdata->nr_pages = nr_pages; + wdata->offset = page_offset(wdata->pages[0]); + wdata->pagesz = PAGE_CACHE_SIZE; + wdata->tailsz = min(i_size_read(mapping->host) - + page_offset(wdata->pages[nr_pages - 1]), + (loff_t)PAGE_CACHE_SIZE); + wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz; + + if (wdata->cfile != NULL) + cifsFileInfo_put(wdata->cfile); + wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handles for inode\n"); + rc = -EBADF; + } else { + wdata->pid = wdata->cfile->pid; + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + rc = server->ops->async_writev(wdata, cifs_writedata_release); + } + + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); + + return rc; +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; - struct TCP_Server_Info *server; - struct page *page; int rc = 0; /* @@ -1906,152 +2054,50 @@ static int cifs_writepages(struct address_space *mapping, range_whole = true; scanned = true; } + server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages; - pgoff_t next = 0, tofind; - struct page **pages; + unsigned int i, nr_pages, found_pages, wsize, credits; + pgoff_t next = 0, tofind, saved_index = index; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; - tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, - end - index) + 1; + tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1; - wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); + wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, + &found_pages); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; } - /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_CACHE_SIZE. - */ - found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, &index, - PAGECACHE_TAG_DIRTY, - tofind, pages); - found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && index <= end); - if (found_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); break; } - nr_pages = 0; - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } - - if (!wbc->range_cyclic && page->index > end) { - done = true; - unlock_page(page); - break; - } - - if (next && (page->index != next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; - } - - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. - */ - set_page_writeback(page); - - if (page_offset(page) >= i_size_read(mapping->host)) { - done = true; - unlock_page(page); - end_page_writeback(page); - break; - } - - wdata->pages[i] = page; - next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - page_cache_release(wdata->pages[i]); - wdata->pages[i] = NULL; - } + nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, + end, &index, &next, &done); /* nothing to write? */ if (nr_pages == 0) { kref_put(&wdata->refcount, cifs_writedata_release); + add_credits_and_wake_if(server, credits, 0); continue; } - wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_CACHE_SIZE; - wdata->tailsz = - min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_CACHE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + - wdata->tailsz; - - do { - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), - false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; - } - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, - cifs_writedata_release); - } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); + wdata->credits = credits; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); /* send failure -- clean up the mess */ if (rc != 0) { + add_credits_and_wake_if(server, wdata->credits, 0); for (i = 0; i < nr_pages; ++i) { if (rc == -EAGAIN) redirty_page_for_writepage(wbc, @@ -2066,6 +2112,11 @@ retry: } kref_put(&wdata->refcount, cifs_writedata_release); + if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { + index = saved_index; + continue; + } + wbc->nr_to_write -= nr_pages; if (wbc->nr_to_write <= 0) done = true; @@ -2362,123 +2413,109 @@ cifs_uncached_writev_complete(struct work_struct *work) kref_put(&wdata->refcount, cifs_uncached_writedata_release); } -/* attempt to send write to server, retry on any -EAGAIN errors */ static int -cifs_uncached_retry_writev(struct cifs_writedata *wdata) +wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, + size_t *len, unsigned long *num_pages) { - int rc; - struct TCP_Server_Info *server; + size_t save_len, copied, bytes, cur_len = *len; + unsigned long i, nr_pages = *num_pages; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + bytes = min_t(const size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + cur_len -= copied; + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; + } + cur_len = save_len - cur_len; + *len = cur_len; - do { - if (wdata->cfile->invalidHandle) { - rc = cifs_reopen_file(wdata->cfile, false); - if (rc != 0) - continue; - } - rc = server->ops->async_writev(wdata, - cifs_uncached_writedata_release); - } while (rc == -EAGAIN); + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Return -EFAULT and let + * the caller free anything we allocated and bail out. + */ + if (!cur_len) + return -EFAULT; - return rc; + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. + */ + *num_pages = i + 1; + return 0; } -static ssize_t -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +static int +cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, + struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) { - unsigned long nr_pages, i; - size_t bytes, copied, len, cur_len; - ssize_t total_written = 0; - loff_t offset; - struct cifsFileInfo *open_file; - struct cifs_tcon *tcon; - struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - int rc; + int rc = 0; + size_t cur_len; + unsigned long nr_pages, num_pages, i; + struct cifs_writedata *wdata; + struct iov_iter saved_from; + loff_t saved_offset = offset; pid_t pid; - - len = iov_iter_count(from); - rc = generic_write_checks(file, poffset, &len, 0); - if (rc) - return rc; - - if (!len) - return 0; - - iov_iter_truncate(from, len); - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; - - offset = *poffset; + struct TCP_Server_Info *server; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; + server = tlink_tcon(open_file->tlink)->ses->server; + memcpy(&saved_from, from, sizeof(struct iov_iter)); + do { - size_t save_len; + unsigned int wsize, credits; - nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + nr_pages = get_numpages(wsize, len, &cur_len); wdata = cifs_writedata_alloc(nr_pages, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; } rc = cifs_write_allocate_pages(wdata->pages, nr_pages); if (rc) { kfree(wdata); + add_credits_and_wake_if(server, credits, 0); break; } - save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, - from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Set the rc to -EFAULT, - * free anything we allocated and bail out. - */ - if (!cur_len) { + num_pages = nr_pages; + rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages); + if (rc) { for (i = 0; i < nr_pages; i++) put_page(wdata->pages[i]); kfree(wdata); - rc = -EFAULT; + add_credits_and_wake_if(server, credits, 0); break; } /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. Bring nr_pages down to that, and free - * any pages that we didn't use. + * Bring nr_pages down to the number of pages we actually used, + * and free any pages that we didn't use. */ - for ( ; nr_pages > i + 1; nr_pages--) + for ( ; nr_pages > num_pages; nr_pages--) put_page(wdata->pages[nr_pages - 1]); wdata->sync_mode = WB_SYNC_ALL; @@ -2489,18 +2526,69 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); - rc = cifs_uncached_retry_writev(wdata); + wdata->credits = credits; + + if (!wdata->cfile->invalidHandle || + !cifs_reopen_file(wdata->cfile, false)) + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); + if (rc == -EAGAIN) { + memcpy(from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(from, offset - saved_offset); + continue; + } break; } - list_add_tail(&wdata->list, &wdata_list); + list_add_tail(&wdata->list, wdata_list); offset += cur_len; len -= cur_len; } while (len > 0); + return rc; +} + +static ssize_t +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +{ + size_t len; + ssize_t total_written = 0; + struct cifsFileInfo *open_file; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + struct iov_iter saved_from; + int rc; + + len = iov_iter_count(from); + rc = generic_write_checks(file, poffset, &len, 0); + if (rc) + return rc; + + if (!len) + return 0; + + iov_iter_truncate(from, len); + + INIT_LIST_HEAD(&wdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + memcpy(&saved_from, from, sizeof(struct iov_iter)); + + rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb, + &wdata_list); + /* * If at least one write was successfully sent, then discard any rc * value from the later writes. If the other write succeeds, then @@ -2529,7 +2617,25 @@ restart_loop: /* resend call if it's a retryable error */ if (rc == -EAGAIN) { - rc = cifs_uncached_retry_writev(wdata); + struct list_head tmp_list; + struct iov_iter tmp_from; + + INIT_LIST_HEAD(&tmp_list); + list_del_init(&wdata->list); + + memcpy(&tmp_from, &saved_from, + sizeof(struct iov_iter)); + iov_iter_advance(&tmp_from, + wdata->offset - *poffset); + + rc = cifs_write_from_iter(wdata->offset, + wdata->bytes, &tmp_from, + open_file, cifs_sb, &tmp_list); + + list_splice(&tmp_list, &wdata_list); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); goto restart_loop; } } @@ -2722,26 +2828,6 @@ cifs_uncached_readdata_release(struct kref *refcount) cifs_readdata_release(refcount); } -static int -cifs_retry_async_readv(struct cifs_readdata *rdata) -{ - int rc; - struct TCP_Server_Info *server; - - server = tlink_tcon(rdata->cfile->tlink)->ses->server; - - do { - if (rdata->cfile->invalidHandle) { - rc = cifs_reopen_file(rdata->cfile, true); - if (rc != 0) - continue; - } - rc = server->ops->async_readv(rdata); - } while (rc == -EAGAIN); - - return rc; -} - /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data @@ -2754,7 +2840,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) static int cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - size_t remaining = rdata->bytes; + size_t remaining = rdata->got_bytes; unsigned int i; for (i = 0; i < rdata->nr_pages; i++) { @@ -2782,11 +2868,12 @@ static int cifs_uncached_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; struct kvec iov; + rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@ -2820,55 +2907,45 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, if (result < 0) break; - total_read += result; + rdata->got_bytes += result; } - return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; } -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static int +cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len, cur_len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; - unsigned int npages; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; + struct cifs_readdata *rdata; + unsigned int npages, rsize, credits; + size_t cur_len; + int rc; pid_t pid; + struct TCP_Server_Info *server; - len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + server = tlink_tcon(open_file->tlink)->ses->server; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); - do { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); /* allocate a readdata struct */ rdata = cifs_readdata_alloc(npages, cifs_uncached_readv_complete); if (!rdata) { + add_credits_and_wake_if(server, credits, 0); rc = -ENOMEM; break; } @@ -2884,44 +2961,113 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) rdata->pid = pid; rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->credits = credits; - rc = cifs_retry_async_readv(rdata); + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); error: if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); + if (rc == -EAGAIN) + continue; break; } - list_add_tail(&rdata->list, &rdata_list); + list_add_tail(&rdata->list, rdata_list); offset += cur_len; len -= cur_len; } while (len > 0); + return rc; +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *open_file; + struct cifs_readdata *rdata, *tmp; + struct list_head rdata_list; + + len = iov_iter_count(to); + if (!len) + return 0; + + INIT_LIST_HEAD(&rdata_list); + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + /* if at least one read request send succeeded, then reset rc */ if (!list_empty(&rdata_list)) rc = 0; len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ +again: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { - again: if (!rc) { /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) { - rc = rdata->result; + else if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto again; + struct list_head tmp_list; + unsigned int got_bytes = rdata->got_bytes; + + list_del_init(&rdata->list); + INIT_LIST_HEAD(&tmp_list); + + /* + * Got a part of data and then reconnect has + * happened -- fill the buffer and continue + * reading. + */ + if (got_bytes && got_bytes < rdata->bytes) { + rc = cifs_readdata_to_iov(rdata, to); + if (rc) { + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + continue; + } } - } else { + + rc = cifs_send_async_read( + rdata->offset + got_bytes, + rdata->bytes - got_bytes, + rdata->cfile, cifs_sb, + &tmp_list); + + list_splice(&tmp_list, &rdata_list); + + kref_put(&rdata->refcount, + cifs_uncached_readdata_release); + goto again; + } else if (rdata->result) + rc = rdata->result; + else rc = cifs_readdata_to_iov(rdata, to); - } + /* if there was a short read -- discard anything left */ + if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) + rc = -ENODATA; } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@ -3030,18 +3176,19 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { - current_read_size = min_t(uint, read_size - total_read, rsize); - /* - * For windows me and 9x we do not want to request more than it - * negotiated since it will refuse the read then. - */ - if ((tcon->ses) && !(tcon->ses->capabilities & + do { + current_read_size = min_t(uint, read_size - total_read, + rsize); + /* + * For windows me and 9x we do not want to request more + * than it negotiated since it will refuse the read + * then. + */ + if ((tcon->ses) && !(tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)) { - current_read_size = min_t(uint, current_read_size, - CIFSMaxBufSize); - } - rc = -EAGAIN; - while (rc == -EAGAIN) { + current_read_size = min_t(uint, + current_read_size, CIFSMaxBufSize); + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) @@ -3054,7 +3201,8 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) rc = server->ops->sync_read(xid, open_file, &io_parms, &bytes_read, &cur_offset, &buf_type); - } + } while (rc == -EAGAIN); + if (rc || (bytes_read == 0)) { if (total_read) { break; @@ -3133,25 +3281,30 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) static void cifs_readv_complete(struct work_struct *work) { - unsigned int i; + unsigned int i, got_bytes; struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work); + got_bytes = rdata->got_bytes; for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; lru_cache_add_file(page); - if (rdata->result == 0) { + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); SetPageUptodate(page); } unlock_page(page); - if (rdata->result == 0) + if (rdata->result == 0 || + (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); + got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes); + page_cache_release(page); rdata->pages[i] = NULL; } @@ -3162,7 +3315,7 @@ static int cifs_readpages_read_into_pages(struct TCP_Server_Info *server, struct cifs_readdata *rdata, unsigned int len) { - int total_read = 0, result = 0; + int result = 0; unsigned int i; u64 eof; pgoff_t eof_index; @@ -3174,6 +3327,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); + rdata->got_bytes = 0; rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; @@ -3228,10 +3382,70 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, if (result < 0) break; - total_read += result; + rdata->got_bytes += result; } - return total_read > 0 ? total_read : result; + return rdata->got_bytes > 0 && result != -ECONNABORTED ? + rdata->got_bytes : result; +} + +static int +readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + unsigned int rsize, struct list_head *tmplist, + unsigned int *nr_pages, loff_t *offset, unsigned int *bytes) +{ + struct page *page, *tpage; + unsigned int expected_index; + int rc; + + INIT_LIST_HEAD(tmplist); + + page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + return rc; + } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + *bytes = PAGE_CACHE_SIZE; + *nr_pages = 1; + list_move_tail(&page->lru, tmplist); + + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) + break; + + /* would this page push the read over the rsize? */ + if (*bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, page->index, + GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_CACHE_SIZE; + expected_index++; + (*nr_pages)++; + } + return rc; } static int cifs_readpages(struct file *file, struct address_space *mapping, @@ -3241,19 +3455,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - unsigned int rsize = cifs_sb->rsize; + struct TCP_Server_Info *server; pid_t pid; /* - * Give up immediately if rsize is too small to read an entire page. - * The VFS will fall back to readpage. We should never reach this - * point however since we set ra_pages to 0 when the rsize is smaller - * than a cache page. - */ - if (unlikely(rsize < PAGE_CACHE_SIZE)) - return 0; - - /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative * @@ -3271,7 +3476,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, pid = current->tgid; rc = 0; - INIT_LIST_HEAD(&tmplist); + server = tlink_tcon(open_file->tlink)->ses->server; cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", __func__, file, mapping, num_pages); @@ -3288,58 +3493,35 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * the rdata->pages, then we want them in increasing order. */ while (!list_empty(page_list)) { - unsigned int i; - unsigned int bytes = PAGE_CACHE_SIZE; - unsigned int expected_index; - unsigned int nr_pages = 1; + unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; + unsigned credits; - page = list_entry(page_list->prev, struct page, lru); + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; /* - * Lock the page and put it in the cache. Since no one else - * should have access to this page, we're safe to simply set - * PG_locked without checking it first. + * Give up immediately if rsize is too small to read an entire + * page. The VFS will fall back to readpage. We should never + * reach this point however since we set ra_pages to 0 when the + * rsize is smaller than a cache page. */ - __set_page_locked(page); - rc = add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL); + if (unlikely(rsize < PAGE_CACHE_SIZE)) { + add_credits_and_wake_if(server, credits, 0); + return 0; + } - /* give up if we can't stick it in the cache */ + rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + &nr_pages, &offset, &bytes); if (rc) { - __clear_page_locked(page); + add_credits_and_wake_if(server, credits, 0); break; } - /* move first page to the tmplist */ - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - list_move_tail(&page->lru, &tmplist); - - /* now try and add more pages onto the request */ - expected_index = page->index + 1; - list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { - /* discontinuity ? */ - if (page->index != expected_index) - break; - - /* would this page push the read over the rsize? */ - if (bytes + PAGE_CACHE_SIZE > rsize) - break; - - __set_page_locked(page); - if (add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL)) { - __clear_page_locked(page); - break; - } - list_move_tail(&page->lru, &tmplist); - bytes += PAGE_CACHE_SIZE; - expected_index++; - nr_pages++; - } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); if (!rdata) { /* best to give up if we're out of mem */ @@ -3350,6 +3532,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, page_cache_release(page); } rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); break; } @@ -3360,21 +3543,32 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->pid = pid; rdata->pagesz = PAGE_CACHE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; + rdata->credits = credits; list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; } - rc = cifs_retry_async_readv(rdata); - if (rc != 0) { + if (!rdata->cfile->invalidHandle || + !cifs_reopen_file(rdata->cfile, true)) + rc = server->ops->async_readv(rdata); + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); unlock_page(page); page_cache_release(page); + if (rc == -EAGAIN) + list_add_tail(&page->lru, &tmplist); } kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) { + /* Re-add pages to the page_list and retry */ + list_splice(&tmplist, page_list); + continue; + } break; } @@ -3618,13 +3812,6 @@ static int cifs_launder_page(struct page *page) return rc; } -static int -cifs_pending_writers_wait(void *unused) -{ - schedule(); - return 0; -} - void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3636,7 +3823,7 @@ void cifs_oplock_break(struct work_struct *work) int rc = 0; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, - cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); server->ops->downgrade_oplock(server, cinode, test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a174605f6afa..426d6c6ad8bf 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1627,8 +1627,9 @@ do_rename_exit: } int -cifs_rename(struct inode *source_dir, struct dentry *source_dentry, - struct inode *target_dir, struct dentry *target_dentry) +cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry, + unsigned int flags) { char *from_name = NULL; char *to_name = NULL; @@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, unsigned int xid; int rc, tmprc; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + cifs_sb = CIFS_SB(source_dir->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); + /* + * No-replace is the natural behavior for CIFS, so skip unlink hacks. + */ + if (flags & RENAME_NOREPLACE) + goto cifs_rename_exit; + if (rc == -EEXIST && tcon->unix_ext) { /* * Are src and dst hardlinks of same inode? We can only tell @@ -1780,7 +1790,7 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(void *word) +cifs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -1794,8 +1804,8 @@ cifs_revalidate_mapping(struct inode *inode) int rc; unsigned long *flags = &CIFS_I(inode)->flags; - rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 264ece71bdb2..68559fd557fb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -374,7 +374,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; oparms.create_options = create_options; - oparms.disposition = FILE_OPEN; + oparms.disposition = FILE_CREATE; oparms.path = path; oparms.fid = &fid; oparms.reconnect = false; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3b0c62e622da..81340c6253eb 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -226,6 +226,15 @@ cifs_small_buf_release(void *buf_to_free) return; } +void +free_rsp_buf(int resp_buftype, void *rsp) +{ + if (resp_buftype == CIFS_SMALL_BUFFER) + cifs_small_buf_release(rsp); + else if (resp_buftype == CIFS_LARGE_BUFFER) + cifs_buf_release(rsp); +} + /* NB: MID can not be set if treeCon not passed in, in that case it is responsbility of caller to set the mid */ void @@ -414,7 +423,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) return true; } if (pSMBr->hdr.Status.CifsError) { - cifs_dbg(FYI, "notify err 0x%d\n", + cifs_dbg(FYI, "notify err 0x%x\n", pSMBr->hdr.Status.CifsError); return true; } @@ -441,7 +450,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (pSMB->hdr.WordCount != 8) return false; - cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n", + cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n", pSMB->LockType, pSMB->OplockLevel); if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; @@ -582,7 +591,7 @@ int cifs_get_writer(struct cifsInodeInfo *cinode) start: rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, - cifs_oplock_break_wait, TASK_KILLABLE); + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index e87387dbf39f..39ee32688eac 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -520,382 +520,559 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) } } -int -CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct sess_data *); + int result; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[3]; +}; + +static int +sess_alloc_buffer(struct sess_data *sess_data, int wct) { - int rc = 0; - int wct; + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb_hdr *smb_buf; - char *bcc_ptr; - char *str_area; - SESSION_SETUP_ANDX *pSMB; - __u32 capabilities; - __u16 count; - int resp_buf_type; - struct kvec iov[3]; - enum securityEnum type; - __u16 action, bytes_remaining; - struct key *spnego_key = NULL; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - u16 blob_len; - char *ntlmsspblob = NULL; - if (ses == NULL) { - WARN(1, "%s: ses == NULL!", __func__); - return -EINVAL; - } + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); - type = select_sectype(ses->server, ses->sectype); - cifs_dbg(FYI, "sess setup type %d\n", type); - if (type == Unspecified) { - cifs_dbg(VFS, - "Unable to select appropriate authentication method!"); - return -EINVAL; + if (rc) + return rc; + + sess_data->iov[0].iov_base = (char *)smb_buf; + sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL); + if (!sess_data->iov[2].iov_base) { + rc = -ENOMEM; + goto out_free_smb_buf; } - if (type == RawNTLMSSP) { - /* if memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = false; + return 0; + +out_free_smb_buf: + kfree(smb_buf); + sess_data->iov[0].iov_base = NULL; + sess_data->iov[0].iov_len = 0; + sess_data->buf0_type = CIFS_NO_BUFFER; + return rc; +} + +static void +sess_free_buffer(struct sess_data *sess_data) +{ + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; + kfree(sess_data->iov[2].iov_base); +} + +static int +sess_establish_session(struct sess_data *sess_data) +{ + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + mutex_unlock(&ses->server->srv_mutex); + return -ENOMEM; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; } + mutex_unlock(&ses->server->srv_mutex); -ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); - if (type == LANMAN) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if ((type == NTLM) || (type == NTLMv2)) { - /* For NTLMv2 failures eventually may need to retry NTLM */ - wct = 13; /* old style NTLM sessionsetup */ - } else /* same size: negotiate or auth, NTLMSSP or extended security */ - wct = 12; + return 0; +} - rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, - (void **)&smb_buf); - if (rc) - return rc; +static int +sess_sendreceive(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base; + __u16 count; - pSMB = (SESSION_SETUP_ANDX *)smb_buf; + count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + put_bcc(count, smb_buf); + + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 3 /* num_iovecs */, + &sess_data->buf0_type, + CIFS_LOG_ERROR); + + return rc; +} +/* + * LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 + */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + __u32 capabilities; + __u16 bytes_remaining; + + /* lanman 2 style sessionsetup */ + /* wct = 10 */ + rc = sess_alloc_buffer(sess_data, 10); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; capabilities = cifs_ssetup_hdr(ses, pSMB); - /* we will send the SMB in three pieces: - a fixed length beginning part, an optional - SPNEGO blob (which can be zero length), and a - last part which will include the strings - and rest of bcc area. This allows us to avoid - a large buffer 17K allocation */ - iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; - - /* setting this here allows the code at the end of the function - to free the request buffer if there's an error */ - resp_buf_type = CIFS_SMALL_BUFFER; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; - /* 2000 big enough to fit max user, domain, NOS name etc. */ - str_area = kmalloc(2000, GFP_KERNEL); - if (str_area == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - bcc_ptr = str_area; + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - iov[1].iov_base = NULL; - iov[1].iov_len = 0; + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); - if (type == LANMAN) { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + /* + * can not sign if LANMAN negotiated so no need + * to calculate signing key? but what if server + * changed to do higher than lanman dialect and + * we reconnected would we ever calc signing_key? + */ - pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); - /* no capabilities flags in old lanman negotiation */ + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + rc = sess_sendreceive(sess_data); + if (rc) + goto out; - /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); + /* lanman response has a word count of 3 */ + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* can not sign if LANMAN negotiated so no need - to calculate signing key? but what if server - changed to do higher than lanman dialect and - we reconnected would we ever calc signing_key? */ + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); - cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); +} + +#else + +static void +sess_auth_lanman(struct sess_data *sess_data) +{ + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} #endif - } else if (type == NTLM) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = + +static void +sess_auth_ntlm(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = + pSMB->req_no_secext.CaseSensitivePasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", rc); - goto ssetup_exit; - } + goto out; + } - /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == NTLMv2) { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - - /* LM2 password would be here if we supported it */ - pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", - rc); - goto ssetup_exit; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (sess_data->iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; } - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } else { + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } - if (ses->capabilities & CAP_UNICODE) { - if (iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg; - spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; - msg = spnego_key->payload.data; - /* check version field to make sure that cifs.upcall is - sending us a response in an expected form */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, "incorrect version of cifs.upcall " - "expected %d but got %d)", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } + rc = sess_sendreceive(sess_data); + if (rc) + goto out; - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities = cpu_to_le32(capabilities); - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = msg->secblob_len; - pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len); - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_oslm_strings(&bcc_ptr, nls_cp); - unicode_domain_string(&bcc_ptr, ses, nls_cp); - } else - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -#else /* ! CONFIG_CIFS_UPCALL */ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - rc = -ENOSYS; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (type == RawNTLMSSP) { - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - rc = -ENOSYS; - goto ssetup_exit; - } + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities |= cpu_to_le32(capabilities); - switch(phase) { - case NtLmNegotiate: - build_ntlmssp_negotiate_blob( - pSMB->req.SecurityBlob, ses); - iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); - iov[1].iov_base = pSMB->req.SecurityBlob; - pSMB->req.SecurityBlobLength = - cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); - break; - case NtLmAuthenticate: - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. - */ - ntlmsspblob = kzalloc( - 5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto ssetup_exit; - } + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } - rc = build_ntlmssp_auth_blob(ntlmsspblob, - &blob_len, ses, nls_cp); - if (rc) - goto ssetup_exit; - iov[1].iov_len = blob_len; - iov[1].iov_base = ntlmsspblob; - pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); - /* - * Make sure that we tell the server that we are using - * the uid that it just gave us back on the response - * (challenge) - */ - smb_buf->Uid = ses->Suid; - break; - default: - cifs_dbg(VFS, "invalid phase %d\n", phase); - rc = -ENOSYS; - goto ssetup_exit; + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* unicode strings must be word aligned */ - if ((iov[0].iov_len + iov[1].iov_len) % 2) { + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +static void +sess_auth_ntlmv2(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + + /* old style NTLM sessionsetup */ + /* wct = 13 */ + rc = sess_alloc_buffer(sess_data, 13); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } + + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + + if (ses->capabilities & CAP_UNICODE) { + if (sess_data->iov[0].iov_len % 2) { *bcc_ptr = 0; bcc_ptr++; } - unicode_oslm_strings(&bcc_ptr, nls_cp); + unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } else { - cifs_dbg(VFS, "secType %d not supported!\n", type); - rc = -ENOSYS; - goto ssetup_exit; + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); } - iov[2].iov_base = str_area; - iov[2].iov_len = (long) bcc_ptr - (long) str_area; - count = iov[1].iov_len + iov[2].iov_len; - smb_buf->smb_buf_length = - cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; - put_bcc(count, smb_buf); + rc = sess_sendreceive(sess_data); + if (rc) + goto out; - rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, - CIFS_LOG_ERROR); - /* SMB request buf freed in SendReceive2 */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 3) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; - smb_buf = (struct smb_hdr *)iov[0].iov_base; + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); - if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && - (smb_buf->Status.CifsError == - cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; } - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROC rc is not an error here, but expected */ + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } + + rc = sess_establish_session(sess_data); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + int rc = 0; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + char *bcc_ptr; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + __u16 bytes_remaining; + struct key *spnego_key = NULL; + struct cifs_spnego_msg *msg; + u16 blob_len; + + /* extended security */ + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); if (rc) - goto ssetup_exit; + goto out; - if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + bcc_ptr = sess_data->iov[2].iov_base; + capabilities = cifs_ssetup_hdr(ses, pSMB); + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; + } + + msg = spnego_key->payload.data; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "incorrect version of cifs.upcall (expected %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); + + if (ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); + } else { + /* BB: is this right? */ + ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + } + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto ssetup_exit; + goto out_put_spnego_key; } - action = le16_to_cpu(pSMB->resp.Action); - if (action & GUEST_LOGIN) + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - /* response can have either 3 or 4 word count - Samba sends 3 */ - /* and lanman response is 3 */ + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf); - if (smb_buf->WordCount == 4) { - blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); - if (blob_len > bytes_remaining) { - cifs_dbg(VFS, "bad security blob length %d\n", - blob_len); - rc = -EINVAL; - goto ssetup_exit; - } - if (phase == NtLmChallenge) { - rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); - /* now goto beginning for ntlmssp authenticate phase */ - if (rc) - goto ssetup_exit; - } - bcc_ptr += blob_len; - bytes_remaining -= blob_len; + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_put_spnego_key; } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; /* BB check if Unicode and decode strings */ if (bytes_remaining == 0) { @@ -906,60 +1083,371 @@ ssetup_ntlmssp_authenticate: ++bcc_ptr; --bytes_remaining; } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); } -ssetup_exit: - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + rc = sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + sess_free_buffer(sess_data); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; +} + +#else + +static void +sess_auth_kerberos(struct sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -ENOSYS; + sess_data->func = NULL; +} +#endif /* ! CONFIG_CIFS_UPCALL */ + +/* + * The required kvec buffers have to be allocated before calling this + * function. + */ +static int +_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) +{ + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u32 capabilities; + char *bcc_ptr; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); + return -ENOSYS; } - kfree(str_area); - kfree(ntlmsspblob); - ntlmsspblob = NULL; - if (resp_buf_type == CIFS_SMALL_BUFFER) { - cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); - cifs_small_buf_release(iov[0].iov_base); - } else if (resp_buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(iov[0].iov_base); - /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities |= cpu_to_le32(capabilities); + + bcc_ptr = sess_data->iov[2].iov_base; + /* unicode strings must be word aligned */ + if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + + sess_data->iov[2].iov_len = (long) bcc_ptr - + (long) sess_data->iov[2].iov_base; + + return 0; +} + +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data); + +static void +sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n"); + + /* + * if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out; + } + ses->ntlmssp->sesskey_per_smbsess = false; + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + + /* Build security blob before we assemble the request */ + build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses); + sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + sess_data->iov[1].iov_base = pSMB->req.SecurityBlob; + pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE)); + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out; + + rc = sess_sendreceive(sess_data); + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + smb_buf->Status.CifsError == + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)) + rc = 0; + + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out; + } + + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out; + } + + rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); +out: + sess_free_buffer(sess_data); if (!rc) { - mutex_lock(&ses->server->srv_mutex); - if (!ses->server->session_estab) { - if (ses->server->sign) { - ses->server->session_key.response = - kmemdup(ses->auth_key.response, - ses->auth_key.len, GFP_KERNEL); - if (!ses->server->session_key.response) { - rc = -ENOMEM; - mutex_unlock(&ses->server->srv_mutex); - goto keycp_exit; - } - ses->server->session_key.len = - ses->auth_key.len; - } - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; - } - mutex_unlock(&ses->server->srv_mutex); + sess_data->func = sess_auth_rawntlmssp_authenticate; + return; + } + + /* Else error. Cleanup */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} - cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); +static void +sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) +{ + int rc; + struct smb_hdr *smb_buf; + SESSION_SETUP_ANDX *pSMB; + struct cifs_ses *ses = sess_data->ses; + __u16 bytes_remaining; + char *bcc_ptr; + char *ntlmsspblob = NULL; + u16 blob_len; + + cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); + + /* wct = 12 */ + rc = sess_alloc_buffer(sess_data, 12); + if (rc) + goto out; + + /* Build security blob before we assemble the request */ + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)pSMB; + /* + * 5 is an empirical value, large enough to hold + * authenticate message plus max 10 of av paris, + * domain, user, workstation names, flags, etc. + */ + ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + rc = -ENOMEM; + goto out; } -keycp_exit: + rc = build_ntlmssp_auth_blob(ntlmsspblob, + &blob_len, ses, sess_data->nls_cp); + if (rc) + goto out_free_ntlmsspblob; + sess_data->iov[1].iov_len = blob_len; + sess_data->iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); + /* + * Make sure that we tell the server that we are using + * the uid that it just gave us back on the response + * (challenge) + */ + smb_buf->Uid = ses->Suid; + + rc = _sess_auth_rawntlmssp_assemble_req(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + rc = sess_sendreceive(sess_data); + if (rc) + goto out_free_ntlmsspblob; + + pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; + smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; + if (smb_buf->WordCount != 4) { + rc = -EIO; + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); + goto out_free_ntlmsspblob; + } + + if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + + bytes_remaining = get_bcc(smb_buf); + bcc_ptr = pByteArea(smb_buf); + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + if (blob_len > bytes_remaining) { + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); + rc = -EINVAL; + goto out_free_ntlmsspblob; + } + bcc_ptr += blob_len; + bytes_remaining -= blob_len; + + + /* BB check if Unicode and decode strings */ + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + /* unicode string area must be word-aligned */ + if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + ++bcc_ptr; + --bytes_remaining; + } + decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } else { + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, + sess_data->nls_cp); + } + +out_free_ntlmsspblob: + kfree(ntlmsspblob); +out: + sess_free_buffer(sess_data); + + if (!rc) + rc = sess_establish_session(sess_data); + + /* Cleanup */ kfree(ses->auth_key.response); ses->auth_key.response = NULL; kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + + sess_data->func = NULL; + sess_data->result = rc; +} + +static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) +{ + int type; + + type = select_sectype(ses->server, ses->sectype); + cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); + return -EINVAL; + } + + switch (type) { + case LANMAN: + /* LANMAN and plaintext are less secure and off by default. + * So we make this explicitly be turned on in kconfig (in the + * build) and turned on at runtime (changed from the default) + * in proc/fs/cifs or via mount parm. Unfortunately this is + * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + sess_data->func = sess_auth_lanman; + break; +#else + return -EOPNOTSUPP; +#endif + case NTLM: + sess_data->func = sess_auth_ntlm; + break; + case NTLMv2: + sess_data->func = sess_auth_ntlmv2; + break; + case Kerberos: +#ifdef CONFIG_CIFS_UPCALL + sess_data->func = sess_auth_kerberos; + break; +#else + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + return -ENOSYS; + break; +#endif /* CONFIG_CIFS_UPCALL */ + case RawNTLMSSP: + sess_data->func = sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", type); + return -ENOSYS; + } + + return 0; +} + +int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct sess_data *sess_data; + + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); + return -EINVAL; + } + + sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = select_sec(ses, sess_data); + if (rc) + goto out; + + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + /* Store result before we free sess_data */ + rc = sess_data->result; +out: + kfree(sess_data); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d1fdfa848703..5e8c22d6c7b9 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock) return oplock == OPLOCK_READ; } +static unsigned int +cifs_wp_retry_size(struct inode *inode) +{ + return CIFS_SB(inode->i_sb)->wsize; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1019,6 +1025,7 @@ struct smb_version_operations smb1_operations = { .set_credits = cifs_set_credits, .get_credits_field = cifs_get_credits_field, .get_credits = cifs_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = cifs_get_next_mid, .read_data_offset = cifs_read_data_offset, .read_data_length = cifs_read_data_length, @@ -1078,6 +1085,7 @@ struct smb_version_operations smb1_operations = { .query_mf_symlink = cifs_query_mf_symlink, .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, + .wp_retry_size = cifs_wp_retry_size, #ifdef CONFIG_CIFS_XATTR .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 84c012a6aba0..0150182a4494 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -91,7 +91,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, case SMB2_OP_SET_EOF: tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, fid.volatile_fid, current->tgid, - (__le64 *)data); + (__le64 *)data, false); break; case SMB2_OP_SET_INFO: tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 94bd4fbb13d3..e31a9dfdcd39 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -605,7 +605,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"}, {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"}, {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"}, - {STATUS_CANNOT_DELETE, -EIO, "STATUS_CANNOT_DELETE"}, + {STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"}, {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"}, {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"}, {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"}, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b8021fde987d..f2e6ac29a8d6 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -437,7 +437,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, continue; cifs_dbg(FYI, "found in the open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); server->ops->set_oplock_level(cinode, lease_state, 0, NULL); @@ -467,7 +467,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, } cifs_dbg(FYI, "found in the pending open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); open->oplock = lease_state; @@ -546,7 +546,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) return false; } - cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); + cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 787844bde384..77f8aeb9c2fc 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/vfs.h> +#include <linux/falloc.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -112,6 +113,53 @@ smb2_get_credits(struct mid_q_entry *mid) return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); } +static int +smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + int rc = 0; + unsigned int scredits; + + spin_lock(&server->req_lock); + while (1) { + if (server->credits <= 0) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable(server->request_q, + has_credits(server, &server->credits)); + cifs_num_waiters_dec(server); + if (rc) + return rc; + spin_lock(&server->req_lock); + } else { + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->req_lock); + return -ENOENT; + } + + scredits = server->credits; + /* can deadlock with reopen */ + if (scredits == 1) { + *num = SMB2_MAX_BUFFER_SIZE; + *credits = 0; + break; + } + + /* leave one credit for a possible reopen */ + scredits--; + *num = min_t(unsigned int, size, + scredits * SMB2_MAX_BUFFER_SIZE); + + *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + server->credits -= *credits; + server->in_flight++; + break; + } + } + spin_unlock(&server->req_lock); + return rc; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@ -182,8 +230,9 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* set it to the maximum buffer size value we can send with 1 credit */ - wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); return wsize; } @@ -197,8 +246,9 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* set it to the maximum buffer size value we can send with 1 credit */ - rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); return rsize; } @@ -687,7 +737,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, { __le64 eof = cpu_to_le64(size); return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, &eof, false); } static int @@ -1104,6 +1154,13 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch) return le32_to_cpu(lc->lcontext.LeaseState); } +static unsigned int +smb2_wp_retry_size(struct inode *inode) +{ + return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize, + SMB2_MAX_BUFFER_SIZE); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -1113,6 +1170,7 @@ struct smb_version_operations smb20_operations = { .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@ -1177,6 +1235,7 @@ struct smb_version_operations smb20_operations = { .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, }; struct smb_version_operations smb21_operations = { @@ -1188,6 +1247,7 @@ struct smb_version_operations smb21_operations = { .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@ -1252,6 +1312,7 @@ struct smb_version_operations smb21_operations = { .create_lease_buf = smb2_create_lease_buf, .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, + .wp_retry_size = smb2_wp_retry_size, }; struct smb_version_operations smb30_operations = { @@ -1263,6 +1324,7 @@ struct smb_version_operations smb30_operations = { .set_credits = smb2_set_credits, .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, @@ -1330,6 +1392,7 @@ struct smb_version_operations smb30_operations = { .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, .validate_negotiate = smb3_validate_negotiate, + .wp_retry_size = smb2_wp_retry_size, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b0b260dbb19d..42ebc1a8be6c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -108,7 +108,6 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , if (!tcon) goto out; - /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */ /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ if ((tcon->ses) && @@ -245,10 +244,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (rc) goto out; atomic_inc(&tconInfoReconnectCount); - /* - * BB FIXME add code to check if wsize needs update due to negotiated - * smb buffer size shrinking. - */ out: /* * Check if handle based operation so we know whether we can continue @@ -309,16 +304,6 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; } -static void -free_rsp_buf(int resp_buftype, void *rsp) -{ - if (resp_buftype == CIFS_SMALL_BUFFER) - cifs_small_buf_release(rsp); - else if (resp_buftype == CIFS_LARGE_BUFFER) - cifs_buf_release(rsp); -} - - /* * * SMB2 Worker functions follow: @@ -1738,12 +1723,18 @@ smb2_readv_callback(struct mid_q_entry *mid) rc); } /* FIXME: should this be counted toward the initiating task? */ - task_io_account_read(rdata->bytes); - cifs_stats_bytes_read(tcon, rdata->bytes); + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: rdata->result = -EAGAIN; + if (server->sign && rdata->got_bytes) + /* reset bytes number since we can not check a sign */ + rdata->got_bytes = 0; + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->got_bytes); + cifs_stats_bytes_read(tcon, rdata->got_bytes); break; default: if (rdata->result != -ENODATA) @@ -1762,11 +1753,12 @@ smb2_readv_callback(struct mid_q_entry *mid) int smb2_async_readv(struct cifs_readdata *rdata) { - int rc; + int rc, flags = 0; struct smb2_hdr *buf; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; + struct TCP_Server_Info *server; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -1777,18 +1769,41 @@ smb2_async_readv(struct cifs_readdata *rdata) io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; io_parms.pid = rdata->pid; + + server = io_parms.tcon->ses->server; + rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); - if (rc) + if (rc) { + if (rc == -EAGAIN && rdata->credits) { + /* credits was reset by reconnect */ + rdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } return rc; + } buf = (struct smb2_hdr *)rdata->iov.iov_base; /* 4 for rfc1002 length field */ rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; + if (rdata->credits) { + buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += rdata->credits - + le16_to_cpu(buf->CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - rdata, 0); + rdata, flags); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@ -1906,15 +1921,25 @@ int smb2_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)) { - int rc = -EACCES; + int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov; struct smb_rqst rqst; rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); - if (rc) + if (rc) { + if (rc == -EAGAIN && wdata->credits) { + /* credits was reset by reconnect */ + wdata->credits = 0; + /* reduce in_flight value since we won't send the req */ + spin_lock(&server->req_lock); + server->in_flight--; + spin_unlock(&server->req_lock); + } goto async_writev_out; + } req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); @@ -1947,9 +1972,20 @@ smb2_async_writev(struct cifs_writedata *wdata, inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); + if (wdata->credits) { + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, + SMB2_MAX_BUFFER_SIZE)); + spin_lock(&server->req_lock); + server->credits += wdata->credits - + le16_to_cpu(req->hdr.CreditCharge); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + flags = CIFS_HAS_CREDITS; + } + kref_get(&wdata->refcount); - rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - smb2_writev_callback, wdata, 0); + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata, + flags); if (rc) { kref_put(&wdata->refcount, release); @@ -2325,7 +2361,7 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof) + u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc) { struct smb2_file_eof_info info; void *data; @@ -2336,8 +2372,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, data = &info; size = sizeof(struct smb2_file_eof_info); - return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, - FILE_END_OF_FILE_INFORMATION, 1, &data, &size); + if (is_falloc) + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size); + else + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size); } int diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 0ce48db20a65..67e8ce8055de 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -139,7 +139,7 @@ extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, __le16 *target_file); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, - __le64 *eof); + __le64 *eof, bool is_fallocate); extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 59c748ce872f..5111e7272db6 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -466,7 +466,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) static inline void smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) { + unsigned int i, num = le16_to_cpu(hdr->CreditCharge); + hdr->MessageId = get_next_mid64(server); + /* skip message numbers according to CreditCharge field */ + for (i = 1; i < num; i++) + get_next_mid(server); } static struct mid_q_entry * diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 18cd5650a5fc..9d087f4e7d4e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -448,6 +448,15 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout, return wait_for_free_credits(server, timeout, val); } +int +cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, + unsigned int *num, unsigned int *credits) +{ + *num = size; + *credits = 0; + return 0; +} + static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { @@ -531,20 +540,23 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, { int rc, timeout, optype; struct mid_q_entry *mid; + unsigned int credits = 0; timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; - rc = wait_for_free_request(server, timeout, optype); - if (rc) - return rc; + if ((flags & CIFS_HAS_CREDITS) == 0) { + rc = wait_for_free_request(server, timeout, optype); + if (rc) + return rc; + credits = 1; + } mutex_lock(&server->srv_mutex); mid = server->ops->setup_async_request(server, rqst); if (IS_ERR(mid)) { mutex_unlock(&server->srv_mutex); - add_credits(server, 1, optype); - wake_up(&server->request_q); + add_credits_and_wake_if(server, credits, optype); return PTR_ERR(mid); } @@ -572,8 +584,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, return 0; cifs_delete_mid(mid); - add_credits(server, 1, optype); - wake_up(&server->request_q); + add_credits_and_wake_if(server, credits, optype); return rc; } diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 1da168c61d35..278f8fdeb9ef 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -13,7 +13,7 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/list.h> #include <linux/sched.h> diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2849f41e72a2..1326d38960db 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -13,7 +13,7 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/coda.h> diff --git a/fs/coda/dir.c b/fs/coda/dir.c index cd8a63238b11..9c3dedc000d1 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -19,8 +19,7 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/namei.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/file.c b/fs/coda/file.c index 9e83b7790212..d244d743a232 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -18,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/inode.c b/fs/coda/inode.c index fe3afb2de880..b945410bfcd5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -21,9 +21,7 @@ #include <linux/vfs.h> #include <linux/slab.h> #include <linux/pid_namespace.h> - -#include <asm/uaccess.h> - +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/vmalloc.h> diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 3f5de96bbb58..4326d172fc27 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -16,7 +16,7 @@ #include <linux/string.h> #include <linux/namei.h> #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 5c1e4242368b..822629126e89 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -40,7 +40,7 @@ #include <linux/pid_namespace.h> #include <asm/io.h> #include <asm/poll.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 21fcf8dcb9cd..5bb6e27298a4 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -27,7 +27,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/vfs.h> diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index e82289047272..afec6450450f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -59,7 +59,7 @@ #include <linux/gfp.h> #include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_sock.h> #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> diff --git a/fs/coredump.c b/fs/coredump.c index 0b2528fb640e..a93f7e6ea4cf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -306,7 +306,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(nr < 0)) return nr; - tsk->flags = PF_DUMPCORE; + tsk->flags |= PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index ddcfe590b8a8..355c522f3585 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -11,6 +11,8 @@ * The actual compression is based on zlib, see the other files. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -21,7 +23,7 @@ #include <linux/vfs.h> #include <linux/mutex.h> #include <uapi/linux/cramfs_fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -153,7 +155,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; -static struct super_block * buffer_dev[READ_BUFFERS]; +static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* @@ -205,6 +207,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; + if (page) { wait_on_page_locked(page); if (!PageUptodate(page)) { @@ -223,6 +226,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i data = read_buffers[buffer]; for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; + if (page) { memcpy(data, kmap(page), PAGE_CACHE_SIZE); kunmap(page); @@ -237,6 +241,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + kill_block_super(sb); kfree(sbi); } @@ -277,7 +282,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* check for wrong endianness */ if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) - printk(KERN_ERR "cramfs: wrong endianness\n"); + pr_err("wrong endianness\n"); return -EINVAL; } @@ -287,22 +292,22 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&read_mutex); if (super.magic != CRAMFS_MAGIC) { if (super.magic == CRAMFS_MAGIC_WEND && !silent) - printk(KERN_ERR "cramfs: wrong endianness\n"); + pr_err("wrong endianness\n"); else if (!silent) - printk(KERN_ERR "cramfs: wrong magic\n"); + pr_err("wrong magic\n"); return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { - printk(KERN_ERR "cramfs: unsupported filesystem features\n"); + pr_err("unsupported filesystem features\n"); return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super.root.mode)) { - printk(KERN_ERR "cramfs: root is not a directory\n"); + pr_err("root is not a directory\n"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ @@ -310,23 +315,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) root_offset = super.root.offset << 2; if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { - sbi->size=super.size; - sbi->blocks=super.fsid.blocks; - sbi->files=super.fsid.files; + sbi->size = super.size; + sbi->blocks = super.fsid.blocks; + sbi->files = super.fsid.files; } else { - sbi->size=1<<28; - sbi->blocks=0; - sbi->files=0; + sbi->size = 1<<28; + sbi->blocks = 0; + sbi->files = 0; } - sbi->magic=super.magic; - sbi->flags=super.flags; + sbi->magic = super.magic; + sbi->flags = super.flags; if (root_offset == 0) - printk(KERN_INFO "cramfs: empty filesystem"); + pr_info("empty filesystem"); else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { - printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); + pr_err("bad root offset %lu\n", root_offset); return -EINVAL; } @@ -425,7 +430,7 @@ static int cramfs_readdir(struct file *file, struct dir_context *ctx) /* * Lookup and fill in the inode data.. */ -static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; @@ -483,7 +488,7 @@ out: return NULL; } -static int cramfs_readpage(struct file *file, struct page * page) +static int cramfs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; u32 maxblock; @@ -511,7 +516,7 @@ static int cramfs_readpage(struct file *file, struct page * page) if (compr_len == 0) ; /* hole */ else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { - pr_err("cramfs: bad compressed blocksize %u\n", + pr_err("bad compressed blocksize %u\n", compr_len); goto err; } else { diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 1760c1b84d97..ec4f1d4fdad0 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -15,6 +15,8 @@ * then is used by multiple filesystems. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/vmalloc.h> @@ -37,7 +39,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) err = zlib_inflateReset(&stream); if (err != Z_OK) { - printk("zlib_inflateReset error %d\n", err); + pr_err("zlib_inflateReset error %d\n", err); zlib_inflateEnd(&stream); zlib_inflateInit(&stream); } @@ -48,8 +50,8 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) return stream.total_out; err: - printk("Error %d while decompressing!\n", err); - printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); + pr_err("Error %d while decompressing!\n", err); + pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); return -EIO; } @@ -57,7 +59,7 @@ int cramfs_uncompress_init(void) { if (!initialized++) { stream.workspace = vmalloc(zlib_inflate_workspacesize()); - if ( !stream.workspace ) { + if (!stream.workspace) { initialized = 0; return -ENOMEM; } diff --git a/fs/dcache.c b/fs/dcache.c index 06f65857a855..d30ce699ae4b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question - * @want_discon: flag, used by d_splice_alias, to request - * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. @@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent); * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one unless @want_discon is set, - * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. + * any other hashed alias over that one. */ -static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias, *discon_alias; @@ -756,7 +753,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else if (!want_discon) { + } else { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -768,12 +765,9 @@ again: alias = discon_alias; spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; } spin_unlock(&alias->d_lock); goto again; @@ -787,7 +781,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode, 0); + de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; @@ -1781,25 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode) } EXPORT_SYMBOL(d_find_any_alias); -/** - * d_obtain_alias - find or allocate a dentry for a given inode - * @inode: inode to allocate the dentry for - * - * Obtain a dentry for an inode resulting from NFS filehandle conversion or - * similar open by handle operations. The returned dentry may be anonymous, - * or may have a full name (if the inode was already in the cache). - * - * When called on a directory inode, we must ensure that the inode only ever - * has one dentry. If a dentry is found, that is returned instead of - * allocating a new one. - * - * On successful return, the reference to the inode has been transferred - * to the dentry. In case of an error the reference on the inode is released. - * To make it easier to use in export operations a %NULL or IS_ERR inode may - * be passed in and will be the error will be propagate to the return value, - * with a %NULL @inode replaced by ERR_PTR(-ESTALE). - */ -struct dentry *d_obtain_alias(struct inode *inode) +static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) { static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; @@ -1830,7 +1806,10 @@ struct dentry *d_obtain_alias(struct inode *inode) } /* attach a disconnected dentry */ - add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; + add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; spin_lock(&tmp->d_lock); tmp->d_inode = inode; @@ -1851,59 +1830,51 @@ struct dentry *d_obtain_alias(struct inode *inode) iput(inode); return res; } -EXPORT_SYMBOL(d_obtain_alias); /** - * d_splice_alias - splice a disconnected dentry into the tree if one exists - * @inode: the inode which may have a disconnected dentry - * @dentry: a negative dentry which we want to point to the inode. - * - * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and - * DCACHE_DISCONNECTED), then d_move that in place of the given dentry - * and return it, else simply d_add the inode to the dentry and return NULL. + * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode + * @inode: inode to allocate the dentry for * - * This is needed in the lookup routine of any filesystem that is exportable - * (via knfsd) so that we can build dcache paths to directories effectively. + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). * - * If a dentry was found and moved, then it is returned. Otherwise NULL - * is returned. This matches the expected return value of ->lookup. + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. * - * Cluster filesystems may call this function with a negative, hashed dentry. - * In that case, we know that the inode will be a regular file, and also this - * will only occur during atomic_open. So we need to check for the dentry - * being already hashed only in the final case. + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and the error will be propagated to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ -struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +struct dentry *d_obtain_alias(struct inode *inode) { - struct dentry *new = NULL; - - if (IS_ERR(inode)) - return ERR_CAST(inode); + return __d_obtain_alias(inode, 1); +} +EXPORT_SYMBOL(d_obtain_alias); - if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - new = __d_find_alias(inode, 1); - if (new) { - BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - spin_unlock(&inode->i_lock); - security_d_instantiate(new, inode); - d_move(new, dentry); - iput(inode); - } else { - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); - security_d_instantiate(dentry, inode); - d_rehash(dentry); - } - } else { - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) - d_rehash(dentry); - } - return new; +/** + * d_obtain_root - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain an IS_ROOT dentry for the root of a filesystem. + * + * We must ensure that directory inodes only ever have one dentry. If a + * dentry is found, that is returned instead of allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is + * released. A %NULL or IS_ERR inode may be passed in and will be the + * error will be propagate to the return value, with a %NULL @inode + * replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_root(struct inode *inode) +{ + return __d_obtain_alias(inode, 0); } -EXPORT_SYMBOL(d_splice_alias); +EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name @@ -2697,6 +2668,75 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) } /** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. + * + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. + * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *new = NULL; + + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (inode && S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + new = __d_find_any_alias(inode); + if (new) { + if (!IS_ROOT(new)) { + spin_unlock(&inode->i_lock); + dput(new); + return ERR_PTR(-EIO); + } + if (d_ancestor(new, dentry)) { + spin_unlock(&inode->i_lock); + dput(new); + return ERR_PTR(-EIO); + } + write_seqlock(&rename_lock); + __d_materialise_dentry(dentry, new); + write_sequnlock(&rename_lock); + __d_drop(new); + _d_rehash(new); + spin_unlock(&new->d_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); + iput(inode); + } else { + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(dentry, inode); + d_rehash(dentry); + } + } else { + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); + } + return new; +} +EXPORT_SYMBOL(d_splice_alias); + +/** * d_materialise_unique - introduce an inode into the tree * @dentry: candidate dentry * @inode: inode to bind to the dentry, to which aliases may be attached @@ -2724,7 +2764,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode, 0); + alias = __d_find_alias(inode); if (alias) { actual = alias; write_seqlock(&rename_lock); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 63146295153b..76c08c2beb2f 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -451,7 +451,7 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, { char buf[3]; u32 *val = file->private_data; - + if (*val) buf[0] = 'Y'; else diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8c41b52da358..1e3b99d3db0d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -66,7 +66,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev break; } } - return inode; + return inode; } /* SMP-safe */ @@ -317,7 +317,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, goto exit; /* If the parent is not specified, we create it in the root. - * We need the root dentry to do this, which is in the super + * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. */ @@ -330,7 +330,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, switch (mode & S_IFMT) { case S_IFDIR: error = debugfs_mkdir(parent->d_inode, dentry, mode); - + break; case S_IFLNK: error = debugfs_link(parent->d_inode, dentry, mode, @@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove); */ void debugfs_remove_recursive(struct dentry *dentry) { - struct dentry *child, *next, *parent; + struct dentry *child, *parent; if (IS_ERR_OR_NULL(dentry)) return; @@ -546,30 +546,49 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: mutex_lock(&parent->d_inode->i_mutex); - list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { + loop: + /* + * The parent->d_subdirs is protected by the d_lock. Outside that + * lock, the child can be unlinked and set to be freed which can + * use the d_u.d_child as the rcu head and corrupt this list. + */ + spin_lock(&parent->d_lock); + list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) { if (!debugfs_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { + spin_unlock(&parent->d_lock); mutex_unlock(&parent->d_inode->i_mutex); parent = child; goto down; } - up: + + spin_unlock(&parent->d_lock); + if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + + /* + * The parent->d_lock protects agaist child from unlinking + * from d_subdirs. When releasing the parent->d_lock we can + * no longer trust that the next pointer is valid. + * Restart the loop. We'll skip this one with the + * debugfs_positive() check. + */ + goto loop; } + spin_unlock(&parent->d_lock); mutex_unlock(&parent->d_inode->i_mutex); child = parent; parent = parent->d_parent; mutex_lock(&parent->d_inode->i_mutex); - if (child != dentry) { - next = list_next_entry(child, d_u.d_child); - goto up; - } + if (child != dentry) + /* go up */ + goto loop; if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); diff --git a/fs/direct-io.c b/fs/direct-io.c index 98040ba388ac..c3116404ab49 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -71,7 +71,6 @@ struct dio_submit { been performed at the start of a write */ int pages_in_io; /* approximate total IO pages */ - size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ @@ -159,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { ssize_t ret; - ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, + ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES, &sdio->from); if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { @@ -198,9 +197,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) * L1 cache. */ static inline struct page *dio_get_page(struct dio *dio, - struct dio_submit *sdio, size_t *from, size_t *to) + struct dio_submit *sdio) { - int n; if (dio_pages_present(sdio) == 0) { int ret; @@ -209,10 +207,7 @@ static inline struct page *dio_get_page(struct dio *dio, return ERR_PTR(ret); BUG_ON(dio_pages_present(sdio) == 0); } - n = sdio->head++; - *from = n ? 0 : sdio->from; - *to = (n == sdio->tail - 1) ? sdio->to : PAGE_SIZE; - return dio->pages[n]; + return dio->pages[sdio->head]; } /** @@ -911,11 +906,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, while (sdio->block_in_file < sdio->final_block_in_request) { struct page *page; size_t from, to; - page = dio_get_page(dio, sdio, &from, &to); + + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + from = sdio->head ? 0 : sdio->from; + to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE; + sdio->head++; while (from < to) { unsigned this_chunk_bytes; /* # of bytes mapped */ @@ -1104,7 +1103,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - loff_t end = offset + iov_iter_count(iter); + size_t count = iov_iter_count(iter); + loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; struct buffer_head map_bh = { 0, }; @@ -1287,10 +1287,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - ((rw == READ) || (dio->result == sdio.size))) + (rw == READ || dio->result == count)) retval = -EIOCBQUEUED; - - if (retval != -EIOCBQUEUED) + else dio_await_completion(dio); if (drop_refcount(dio) == 0) { diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8d77ba7b1756..1323c568e362 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -718,16 +718,11 @@ static const struct file_operations waiters_fops = { void dlm_delete_debug_file(struct dlm_ls *ls) { - if (ls->ls_debug_rsb_dentry) - debugfs_remove(ls->ls_debug_rsb_dentry); - if (ls->ls_debug_waiters_dentry) - debugfs_remove(ls->ls_debug_waiters_dentry); - if (ls->ls_debug_locks_dentry) - debugfs_remove(ls->ls_debug_locks_dentry); - if (ls->ls_debug_all_dentry) - debugfs_remove(ls->ls_debug_all_dentry); - if (ls->ls_debug_toss_dentry) - debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_rsb_dentry); + debugfs_remove(ls->ls_debug_waiters_dentry); + debugfs_remove(ls->ls_debug_locks_dentry); + debugfs_remove(ls->ls_debug_all_dentry); + debugfs_remove(ls->ls_debug_toss_dentry); } int dlm_create_debug_file(struct dlm_ls *ls) diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 356c044e2cd3..bbee8f063dfa 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -12,7 +12,8 @@ #include "efs.h" -static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { +static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) +{ struct buffer_head *bh; int slot, namelen; @@ -40,10 +41,10 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { pr_err("%s(): invalid directory block\n", __func__); brelse(bh); - return(0); + return 0; } - for(slot = 0; slot < dirblock->slots; slot++) { + for (slot = 0; slot < dirblock->slots; slot++) { dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); namelen = dirslot->namelen; @@ -52,12 +53,12 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) if ((namelen == len) && (!memcmp(name, nameptr, len))) { inodenum = be32_to_cpu(dirslot->inode); brelse(bh); - return(inodenum); + return inodenum; } } brelse(bh); } - return(0); + return 0; } struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/exec.c b/fs/exec.c index a3d33fe592d6..a2b42a98c743 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; - err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; @@ -1216,7 +1212,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against - * PTRACE_ATTACH + * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { @@ -1234,7 +1230,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ - if (current->no_new_privs) + if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; t = p; @@ -1272,7 +1268,7 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->egid = current_egid(); if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs && + !task_no_new_privs(current) && kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? */ diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 7f20f25c232c..84529b8a331b 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -116,7 +116,7 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, pages_in_unit - i); - __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); if (unlikely(!__a1pa)) { ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", num_a1pa); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 3750031cfa2f..b88edc05c230 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep; static struct inode *ext2_alloc_inode(struct super_block *sb) { struct ext2_inode_info *ei; - ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->i_block_alloc_info = NULL; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0762d143e252..581ef40fbe90 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -194,7 +194,16 @@ static void ext4_init_block_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -359,6 +368,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return; @@ -369,6 +379,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -376,6 +389,9 @@ static void ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -623,7 +639,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, if (!(*errp) && ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ef1bed66c14f..0bb3f9ea0832 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -571,6 +571,31 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) return 0; } +int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, + int buf_size) +{ + struct ext4_dir_entry_2 *de; + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size; + while ((char *) de < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -EIO; + + return 0; +} + const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7cc5a0e23688..5b19760b1de5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -591,7 +591,6 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 -#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -2029,6 +2028,8 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) return ext4_filetype_table[filetype]; } +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); @@ -2144,8 +2145,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); -extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2560,7 +2561,6 @@ extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ -extern int ext4_has_inline_data(struct inode *inode); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, @@ -2626,6 +2626,12 @@ extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4da228a0e6d0..76c2df382b7d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,6 +161,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ @@ -1808,8 +1810,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | - EXT4_FREE_BLOCKS_RESERVE); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* @@ -3253,7 +3254,7 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -5403,16 +5404,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) int ret; /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || - len & (EXT4_BLOCK_SIZE(sb) - 1)) + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) - return -EOPNOTSUPP; - trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3f5c188953a4..0b7e28e7eaa4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -966,10 +966,10 @@ retry: continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei) + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) continue; - write_lock(&ei->i_es_lock); shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8695f70af1ef..aca7b24a4432 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -200,10 +200,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; return 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0ee59a6644e2..5b87fc36aab8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, struct ext4_group_desc *gdp) { struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +196,12 @@ verify: ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } @@ -321,6 +338,12 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); } @@ -851,6 +874,13 @@ got: goto out; } + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -887,13 +917,6 @@ got: } } - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) { - ext4_std_error(sb, err); - goto out; - } - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 8a57e9fcd1b9..e75f840000a0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return 0; failed: for (; i >= 0; i--) { - if (i != indirect_blks && branch[i].bh) + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], @@ -1289,89 +1295,220 @@ do_indirects: } } -static int free_hole_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, __le32 *i_data, - int level, ext4_lblk_t first, - ext4_lblk_t count, int max) +/** + * ext4_ind_remove_space - remove space from the range + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @start: First block to remove + * @end: One block after the last block to remove (exclusive) + * + * Free the blocks in the defined range (end is exclusive endpoint of + * range). This is used by ext4_punch_hole(). + */ +int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end) { - struct buffer_head *bh = NULL; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ret = 0; - int i, inc; - ext4_lblk_t offset; - __le32 blk; - - inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); - for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { - if (offset >= count + first) - break; - if (*i_data == 0 || (offset + inc) <= first) - continue; - blk = *i_data; - if (level > 0) { - ext4_lblk_t first2; - bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), - "Read failure"); - return -EIO; + ext4_lblk_t offsets[4], offsets2[4]; + Indirect chain[4], chain2[4]; + Indirect *partial, *partial2; + ext4_lblk_t max_block; + __le32 nr = 0, nr2 = 0; + int n = 0, n2 = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + if (end >= max_block) + end = max_block; + if ((start >= end) || (start > max_block)) + return 0; + + n = ext4_block_to_path(inode, start, offsets, NULL); + n2 = ext4_block_to_path(inode, end, offsets2, NULL); + + BUG_ON(n > n2); + + if ((n == 1) && (n == n2)) { + /* We're punching only within direct block range */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + offsets2[0]); + return 0; + } else if (n2 > n) { + /* + * Start and end are on a different levels so we're going to + * free partial block at start, and partial block at end of + * the range. If there are some levels in between then + * do_indirects label will take care of that. + */ + + if (n == 1) { + /* + * Start is at the direct block level, free + * everything to the end of the level. + */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto end_range; + } + + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); } - first2 = (first > offset) ? first - offset : 0; - ret = free_hole_blocks(handle, inode, bh, - (__le32 *)bh->b_data, level - 1, - first2, count - offset, - inode->i_sb->s_blocksize >> 2); - if (ret) { - brelse(bh); - goto err; + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + +end_range: + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + if (nr2) { + if (partial2 == chain2) { + /* + * Remember, end is exclusive so here we're at + * the start of the next level we're not going + * to free. Everything was covered by the start + * of the range. + */ + return 0; + } else { + /* Shared branch grows from an indirect block */ + partial2--; } + } else { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; } - if (level == 0 || - (bh && all_zeroes((__le32 *)bh->b_data, - (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); - *i_data = 0; + + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + while (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; } - brelse(bh); - bh = NULL; + goto do_indirects; } -err: - return ret; -} - -int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) -{ - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int level, ret = 0; - int num = EXT4_NDIR_BLOCKS; - ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; - __le32 *i_data = EXT4_I(inode)->i_data; - - count = stop - first; - for (level = 0; level < 4; level++, max *= addr_per_block) { - if (first < max) { - ret = free_hole_blocks(handle, inode, NULL, i_data, - level, first, count, num); - if (ret) - goto err; - if (count > max - first) - count -= max - first; - else - break; - first = 0; - } else { - first -= max; + /* Punch happened within the same level (n == n2) */ + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + while ((partial > chain) || (partial2 > chain2)) { + /* We're at the same block, so we're almost finished */ + if ((partial->bh && partial2->bh) && + (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { + if ((partial > chain) && (partial2 > chain2)) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + partial2->p, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + } + return 0; } - i_data += num; - if (level == 0) { - num = 1; - max = 1; + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + if (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + if (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; } } -err: - return ret; +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + if (++n >= n2) + return 0; + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + return 0; } - diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 645205d8ada6..bea662bd0ca6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -120,12 +120,6 @@ int ext4_get_max_inline_size(struct inode *inode) return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } -int ext4_has_inline_data(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && - EXT4_I(inode)->i_inline_off; -} - /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before @@ -1178,6 +1172,18 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, if (error < 0) goto out; + /* + * Make sure the inline directory entries pass checks before we try to + * convert them, so that we avoid touching stuff that needs fsck. + */ + if (S_ISDIR(inode->i_mode)) { + error = ext4_check_all_de(inode, iloc->bh, + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (error) + goto out; + } + error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8a064734e6eb..367a60c07cf0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -325,18 +325,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) #endif /* - * Calculate the number of metadata blocks need to reserve - * to allocate a block located at @lblock - */ -static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_calc_metadata_amount(inode, lblock); - - return ext4_ind_calc_metadata_amount(inode, lblock); -} - -/* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ @@ -357,35 +345,10 @@ void ext4_da_update_reserve_space(struct inode *inode, used = ei->i_reserved_data_blocks; } - if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { - ext4_warning(inode->i_sb, "ino %lu, allocated %d " - "with only %d reserved metadata blocks " - "(releasing %d blocks with reserved %d data blocks)", - inode->i_ino, ei->i_allocated_meta_blocks, - ei->i_reserved_meta_blocks, used, - ei->i_reserved_data_blocks); - WARN_ON(1); - ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; - } - /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; - ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - used + ei->i_allocated_meta_blocks); - ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); /* Update quota subsystem for data blocks */ @@ -1222,49 +1185,6 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a metadata for a single block located at lblock - */ -static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; - ext4_lblk_t save_last_lblock; - int save_len; - - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ - spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - save_len = ei->i_da_metadata_calc_len; - save_last_lblock = ei->i_da_metadata_calc_last_lblock; - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); - - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. - */ - if (ext4_claim_free_clusters(sbi, md_needed, 0)) { - ei->i_da_metadata_calc_len = save_len; - ei->i_da_metadata_calc_last_lblock = save_last_lblock; - spin_unlock(&ei->i_block_reservation_lock); - return -ENOSPC; - } - ei->i_reserved_meta_blocks += md_needed; - spin_unlock(&ei->i_block_reservation_lock); - - return 0; /* success */ -} - -/* * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) @@ -1273,8 +1193,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; int ret; - ext4_lblk_t save_last_lblock; - int save_len; /* * We will charge metadata quota at writeout time; this saves @@ -1295,25 +1213,15 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * ext4_calc_metadata_amount() has side effects, which we have * to be prepared undo if we fail to claim space. */ - save_len = ei->i_da_metadata_calc_len; - save_last_lblock = ei->i_da_metadata_calc_last_lblock; - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); + md_needed = 0; + trace_ext4_da_reserve_space(inode, 0); - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. - */ - if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { - ei->i_da_metadata_calc_len = save_len; - ei->i_da_metadata_calc_last_lblock = save_last_lblock; + if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; - ei->i_reserved_meta_blocks += md_needed; spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1346,20 +1254,6 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } ei->i_reserved_data_blocks -= to_free; - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - * Note that in case of bigalloc, i_reserved_meta_blocks, - * i_reserved_data_blocks, etc. refer to number of clusters. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } - /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); @@ -1500,10 +1394,6 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); - ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - ei->i_reserved_meta_blocks); - ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", - ei->i_allocated_meta_blocks); return; } @@ -1620,13 +1510,6 @@ add_delayed: retval = ret; goto out_unlock; } - } else { - ret = ext4_da_reserve_metadata(inode, iblock); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -2843,8 +2726,7 @@ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); - if (!EXT4_I(inode)->i_reserved_data_blocks && - !EXT4_I(inode)->i_reserved_meta_blocks) + if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* @@ -3624,7 +3506,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else - ret = ext4_free_hole_blocks(handle, inode, first_block, + ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 59e31622cc6e..956027711faf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd; " - "block bitmap corrupt.", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1431,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1441,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &e4b->bd_info->bb_state); @@ -3067,8 +3075,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { - start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; - size = ac->ac_o_ex.fe_len << bsbits; + start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; + size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), + ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; @@ -3208,8 +3217,27 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; + struct ext4_buddy e4b; + int err; - if (pa && pa->pa_type == MB_INODE_PA) + if (pa == NULL) { + err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); + if (err) { + /* + * This should never happen since we pin the + * pages in the ext4_allocation_context so + * ext4_mb_load_buddy() should never fail. + */ + WARN(1, "mb_load_buddy failed (%d)", err); + return; + } + ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, + ac->ac_f_ex.fe_len); + ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + return; + } + if (pa->pa_type == MB_INODE_PA) pa->pa_free += ac->ac_b_ex.fe_len; } @@ -4619,7 +4647,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; @@ -4830,19 +4857,7 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { - percpu_counter_add(&sbi->s_dirtyclusters_counter, - count_clusters); - spin_lock(&ei->i_block_reservation_lock); - if (flags & EXT4_FREE_BLOCKS_METADATA) - ei->i_reserved_meta_blocks += count_clusters; - else - ei->i_reserved_data_blocks += count_clusters; - spin_unlock(&ei->i_block_reservation_lock); - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_reclaim_block(inode, - EXT4_C2B(sbi, count_clusters)); - } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index ec092437d3e0..d3567f27bae7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -39,6 +39,8 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); + /* Locking only for convinience since we are operating on temp inode */ + down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { @@ -61,7 +63,9 @@ static int finish_range(handle_t *handle, struct inode *inode, */ if (needed && ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS)) { + up_write((&EXT4_I(inode)->i_data_sem)); retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); if (retval) goto err_out; } else if (needed) { @@ -70,13 +74,16 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * IF not able to extend the journal restart the journal */ + up_write((&EXT4_I(inode)->i_data_sem)); retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); if (retval) goto err_out; } } retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: + up_write((&EXT4_I(inode)->i_data_sem)); if (path) { ext4_ext_drop_refs(path); kfree(path); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2484c7ec6a72..671a74b14fd7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1013,10 +1013,11 @@ data_copy: *err = -EBUSY; goto unlock_pages; } - + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3520ab8a6639..b147a67baa0d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = { .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, - .rename = ext4_rename, .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9b9aabfb4d2..32b43ad154b9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1525,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { sbi->s_min_batch_time = arg; @@ -2144,10 +2142,6 @@ static int ext4_check_descriptors(struct super_block *sb, } if (NULL != first_not_zeroed) *first_not_zeroed = grp; - - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, ext4_count_free_clusters(sb))); - sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -2809,10 +2803,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2826,7 +2821,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, @@ -3880,38 +3875,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - - /* - * set up enough so that it can read an inode, - * and create new inode for buddy allocator - */ - sbi->s_gdb_count = db_count; - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); - goto failed_mount2; - } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2a; + goto failed_mount2; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) - if (!ext4_fill_flex_info(sb)) { - ext4_msg(sb, KERN_ERR, - "unable to initialize " - "flex_bg meta info!"); - goto failed_mount2a; - } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3922,23 +3891,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - err = percpu_counter_init(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); - } - if (!err) { - err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); - } - if (err) { + if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -3946,6 +3899,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; + /* + * set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4034,18 +3995,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - /* - * The journal may have updated the bg summary counts, so we - * need to update the global counters. - */ - percpu_counter_set(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - percpu_counter_set(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - percpu_counter_set(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); - no_journal: if (ext4_mballoc_ready) { sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); @@ -4135,16 +4084,51 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount5; + goto failed_mount4a; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); goto failed_mount5; } + block = ext4_count_free_clusters(sb); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount6; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount6; + } + err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; @@ -4218,8 +4202,17 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_release_system_zone(sb); + ext4_mb_release(sb); + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4234,23 +4227,14 @@ failed_mount_wq: failed_mount3: ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); - if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); -failed_mount2a: - ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: - ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { @@ -4560,11 +4544,13 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) + ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeclusters_counter))); - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index dbe2141d10ad..83b9b5a8d112 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -203,12 +203,6 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; - if (acl) { - error = posix_acl_valid(acl); - if (error < 0) - return error; - } - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0b4710c1d370..6aeed5bada52 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -22,7 +22,7 @@ #include "segment.h" #include <trace/events/f2fs.h> -static struct kmem_cache *orphan_entry_slab; +static struct kmem_cache *ino_entry_slab; static struct kmem_cache *inode_entry_slab; /* @@ -282,72 +282,120 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct ino_entry *e; +retry: + spin_lock(&sbi->ino_lock[type]); + + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (!e) { + e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); + if (!e) { + spin_unlock(&sbi->ino_lock[type]); + goto retry; + } + if (radix_tree_insert(&sbi->ino_root[type], ino, e)) { + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + goto retry; + } + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &sbi->ino_list[type]); + } + spin_unlock(&sbi->ino_lock[type]); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct ino_entry *e; + + spin_lock(&sbi->ino_lock[type]); + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[type], ino); + if (type == ORPHAN_INO) + sbi->n_orphans--; + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&sbi->ino_lock[type]); +} + +void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, type); +} + +void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO or UPDATE_INO */ +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct ino_entry *e; + spin_lock(&sbi->ino_lock[mode]); + e = radix_tree_lookup(&sbi->ino_root[mode], ino); + spin_unlock(&sbi->ino_lock[mode]); + return e ? true : false; +} + +static void release_dirty_inode(struct f2fs_sb_info *sbi) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = APPEND_INO; i <= UPDATE_INO; i++) { + spin_lock(&sbi->ino_lock[i]); + list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[i], e->ino); + kmem_cache_free(ino_entry_slab, e); + } + spin_unlock(&sbi->ino_lock[i]); + } +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *new, *orphan; - - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, new); - return; - } - - if (orphan->ino > ino) - break; - } - - /* add new orphan entry into list which is sorted by inode number */ - list_add_tail(&new->list, &orphan->list); - spin_unlock(&sbi->orphan_inode_lock); + /* add new orphan ino entry into list */ + __add_ino_entry(sbi, ino, ORPHAN_INO); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *orphan; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - list_del(&orphan->list); - f2fs_bug_on(sbi->n_orphans == 0); - sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, orphan); - return; - } - } - spin_unlock(&sbi->orphan_inode_lock); + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -401,14 +449,14 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); struct page *page = NULL; - struct orphan_inode_entry *orphan = NULL; + struct ino_entry *orphan = NULL; for (index = 0; index < orphan_blocks; index++) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; + spin_lock(&sbi->ino_lock[ORPHAN_INO]); + head = &sbi->ino_list[ORPHAN_INO]; /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { @@ -448,7 +496,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) f2fs_put_page(page, 1); } - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -714,10 +762,10 @@ retry_flush_dents: * until finishing nat/sit flush. */ retry_flush_nodes: - mutex_lock(&sbi->node_write); + down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); sync_node_pages(sbi, 0, &wbc); goto retry_flush_nodes; } @@ -726,7 +774,7 @@ retry_flush_nodes: static void unblock_operations(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); f2fs_unlock_all(sbi); } @@ -748,6 +796,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); nid_t last_nid = 0; block_t start_blk; struct page *cp_page; @@ -761,7 +810,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - discard_next_dnode(sbi); + discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) @@ -885,8 +934,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { + if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { clear_prefree_segments(sbi); + release_dirty_inode(sbi); F2FS_RESET_SB_DIRT(sbi); } } @@ -932,31 +982,37 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } -void init_orphan_info(struct f2fs_sb_info *sbi) +void init_ino_entry_info(struct f2fs_sb_info *sbi) { - spin_lock_init(&sbi->orphan_inode_lock); - INIT_LIST_HEAD(&sbi->orphan_inode_list); - sbi->n_orphans = 0; + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC); + spin_lock_init(&sbi->ino_lock[i]); + INIT_LIST_HEAD(&sbi->ino_list[i]); + } + /* * considering 512 blocks in a segment 8 blocks are needed for cp * and log segment summaries. Remaining blocks are used to keep * orphan entries with the limitation one reserved segment * for cp pack we can have max 1020*504 orphan entries */ + sbi->n_orphans = 0; sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { - orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry)); - if (!orphan_entry_slab) + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; @@ -964,6 +1020,6 @@ int __init create_checkpoint_caches(void) void destroy_checkpoint_caches(void) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(inode_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0924521306b4..03313099c51c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -139,7 +139,10 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + if (test_opt(sbi, NOBARRIER)) + io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + else + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); up_write(&io->io_rwsem); @@ -608,8 +611,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; @@ -626,8 +629,10 @@ static int get_data_block(struct inode *inode, sector_t iblock, if (check_extent_cache(inode, pgofs, bh_result)) goto out; - if (create) + if (create) { + f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -637,7 +642,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; if (dn.data_blkaddr != NULL_ADDR) { @@ -671,7 +676,7 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -708,10 +713,23 @@ out: return err; } +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, get_data_block); + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -771,9 +789,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) !is_cold_data(page) && need_inplace_update(inode))) { rewrite_data_page(page, old_blkaddr, fio); + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); } else { write_data_page(page, &dn, &new_blkaddr, fio); update_extent_cache(new_blkaddr, &dn); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); } out_writepage: f2fs_put_dnode(&dn); @@ -901,6 +921,16 @@ skip_write: return 0; } +static void f2fs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + truncate_blocks(inode, inode->i_size); + } +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -918,11 +948,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, repeat: err = f2fs_convert_inline_data(inode, pos + len); if (err) - return err; + goto fail; page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto fail; + } /* to avoid latency during memory pressure */ unlock_page(page); @@ -936,10 +968,9 @@ repeat: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); - if (err) { f2fs_put_page(page, 0); - return err; + goto fail; } inline_data: lock_page(page); @@ -969,19 +1000,20 @@ inline_data: err = f2fs_read_inline_data(inode, page); if (err) { page_cache_release(page); - return err; + goto fail; } } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) - return err; + goto fail; } lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return -EIO; + err = -EIO; + goto fail; } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); @@ -992,6 +1024,9 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; +fail: + f2fs_write_failed(mapping, pos + len); + return err; } static int f2fs_write_end(struct file *file, @@ -1003,7 +1038,6 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - SetPageUptodate(page); set_page_dirty(page); if (pos + copied > i_size_read(inode)) { @@ -1037,7 +1071,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + int err; /* Let buffer I/O handle the inline data case. */ if (f2fs_has_inline_data(inode)) @@ -1049,8 +1086,15 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, /* clear fsync mark to recover these blocks */ fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); - return blockdev_direct_IO(rw, iocb, inode, iter, offset, - get_data_block); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + + err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); + if (err < 0 && (rw & WRITE)) + f2fs_write_failed(mapping, offset + count); + + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); + + return err; } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b52c12cf5873..a441ba33be11 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -167,7 +167,7 @@ get_cache: si->cache_mem += npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); + si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); } @@ -345,21 +345,14 @@ void __init f2fs_create_root_stats(void) f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - goto bail; + return; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); - if (!file) - goto free_debugfs_dir; - - return; - -free_debugfs_dir: - debugfs_remove(f2fs_debugfs_root); - -bail: - f2fs_debugfs_root = NULL; - return; + if (!file) { + debugfs_remove(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; + } } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 966acb039e3b..bcf893c3d903 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -77,8 +77,8 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static bool early_match_name(const char *name, size_t namelen, - f2fs_hash_t namehash, struct f2fs_dir_entry *de) +static bool early_match_name(size_t namelen, f2fs_hash_t namehash, + struct f2fs_dir_entry *de) { if (le16_to_cpu(de->name_len) != namelen) return false; @@ -90,7 +90,7 @@ static bool early_match_name(const char *name, size_t namelen, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, size_t namelen, int *max_slots, + struct qstr *name, int *max_slots, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; @@ -109,9 +109,10 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, continue; } de = &dentry_blk->dentry[bit_pos]; - if (early_match_name(name, namelen, namehash, de)) { + if (early_match_name(name->len, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], - name, namelen)) { + name->name, + name->len)) { *res_page = dentry_page; goto found; } @@ -120,6 +121,13 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, *max_slots = max_len; max_len = 0; } + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs are occurred. + */ + f2fs_bug_on(!de->name_len); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -132,10 +140,10 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, size_t namelen, + unsigned int level, struct qstr *name, f2fs_hash_t namehash, struct page **res_page) { - int s = GET_DENTRY_SLOTS(namelen); + int s = GET_DENTRY_SLOTS(name->len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; @@ -160,8 +168,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, continue; } - de = find_in_block(dentry_page, name, namelen, - &max_slots, namehash, res_page); + de = find_in_block(dentry_page, name, &max_slots, + namehash, res_page); if (de) break; @@ -187,8 +195,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct qstr *child, struct page **res_page) { - const char *name = child->name; - size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; f2fs_hash_t name_hash; @@ -200,12 +206,11 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, *res_page = NULL; - name_hash = f2fs_dentry_hash(name, namelen); + name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, name, - namelen, name_hash, res_page); + de = find_in_level(dir, level, child, name_hash, res_page); if (de) break; } @@ -298,14 +303,13 @@ static int make_empty_dir(struct inode *inode, struct page *dentry_page; struct f2fs_dentry_block *dentry_blk; struct f2fs_dir_entry *de; - void *kaddr; dentry_page = get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); de = &dentry_blk->dentry[0]; de->name_len = cpu_to_le16(1); @@ -323,7 +327,7 @@ static int make_empty_dir(struct inode *inode, test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); @@ -333,11 +337,12 @@ static int make_empty_dir(struct inode *inode, static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct page *page; int err; if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - page = new_inode_page(inode, name); + page = new_inode_page(inode); if (IS_ERR(page)) return page; @@ -362,7 +367,8 @@ static struct page *init_inode_metadata(struct inode *inode, set_cold_node(inode, page); } - init_dent_inode(name, page); + if (name) + init_dent_inode(name, page); /* * This file should be checkpointed during fsync. @@ -370,17 +376,23 @@ static struct page *init_inode_metadata(struct inode *inode, */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + remove_orphan_inode(sbi, inode->i_ino); inc_nlink(inode); } return page; put_error: f2fs_put_page(page, 1); +error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0); remove_dirty_dir_inode(inode); -error: remove_inode_page(inode); return ERR_PTR(err); } @@ -453,7 +465,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, int err = 0; int i; - dentry_hash = f2fs_dentry_hash(name->name, name->len); + dentry_hash = f2fs_dentry_hash(name); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -529,6 +541,27 @@ fail: return err; } +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + /* we don't need to mark_inode_dirty now */ + update_inode(inode, page); + f2fs_put_page(page, 1); + + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); +fail: + up_write(&F2FS_I(inode)->i_sem); + return err; +} + /* * It only removes the dentry from the dentry page,corresponding name * entry in name page does not need to be touched during deletion. @@ -541,14 +574,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - void *kaddr = page_address(page); int i; lock_page(page); f2fs_wait_on_page_writeback(page, DATA); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); @@ -603,7 +635,6 @@ bool f2fs_empty_dir(struct inode *dir) unsigned long nblock = dir_blocks(dir); for (bidx = 0; bidx < nblock; bidx++) { - void *kaddr; dentry_page = get_lock_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) @@ -612,8 +643,8 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -621,7 +652,7 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e51c732b0dd9..4dab5338a97a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -41,6 +41,7 @@ #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 #define F2FS_MOUNT_FLUSH_MERGE 0x00000200 +#define F2FS_MOUNT_NOBARRIER 0x00000400 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -99,8 +100,15 @@ enum { META_SSA }; -/* for the list of orphan inodes */ -struct orphan_inode_entry { +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { struct list_head list; /* list head */ nid_t ino; /* inode number */ }; @@ -256,6 +264,8 @@ struct f2fs_nm_info { unsigned int nat_cnt; /* the # of cached nat entries */ struct list_head nat_entries; /* cached nat entry list (clean) */ struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ + struct list_head nat_entry_set; /* nat entry set list */ + unsigned int dirty_nat_cnt; /* total num of nat entries in set */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -342,9 +352,6 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -445,14 +452,17 @@ struct f2fs_sb_info { struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct mutex node_write; /* locking node writes */ + struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ wait_queue_head_t cp_wait; - /* for orphan inode management */ - struct list_head orphan_inode_list; /* orphan inode list */ - spinlock_t orphan_inode_lock; /* for orphan inode list */ + /* for inode management */ + struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */ + spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */ + struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */ + + /* for orphan inode, use 0'th array */ unsigned int n_orphans; /* # of orphan inodes */ unsigned int max_orphans; /* max orphan inodes */ @@ -644,7 +654,8 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) */ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; @@ -770,7 +781,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; else - return ((unsigned char *)ckpt + F2FS_BLKSIZE); + return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; @@ -985,11 +996,15 @@ enum { FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used fo ipu for fdatasync */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) { - set_bit(flag, &fi->flags); + if (!test_bit(flag, &fi->flags)) + set_bit(flag, &fi->flags); } static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) @@ -999,7 +1014,8 @@ static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) { - clear_bit(flag, &fi->flags); + if (test_bit(flag, &fi->flags)) + clear_bit(flag, &fi->flags); } static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) @@ -1138,6 +1154,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +int f2fs_do_tmpfile(struct inode *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); @@ -1157,7 +1174,7 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, size_t); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c @@ -1175,7 +1192,7 @@ int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); void remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); @@ -1187,6 +1204,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +void recover_inline_xattr(struct inode *, struct page *); bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, @@ -1208,7 +1226,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); @@ -1242,6 +1260,9 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); @@ -1253,7 +1274,7 @@ void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); -void init_orphan_info(struct f2fs_sb_info *); +void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1297,7 +1318,6 @@ bool space_for_roll_forward(struct f2fs_sb_info *); struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; - struct mutex stat_lock; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; int hit_ext, total_ext; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c58e33075719..208f1a9bd569 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -127,12 +127,30 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return 0; trace_f2fs_sync_file_enter(inode); + + /* if fdatasync is triggered, let's do in-place-update */ + if (datasync) + set_inode_flag(fi, FI_NEED_IPU); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (datasync) + clear_inode_flag(fi, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } + /* + * if there is no written data, don't waste time to write recovery info. + */ + if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { + if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + exist_written_data(sbi, inode->i_ino, UPDATE_INO)) + goto flush_out; + goto out; + } + /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -188,6 +206,13 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; + + /* once recovery info is written, don't need to tack this */ + remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); + clear_inode_flag(fi, FI_APPEND_WRITE); +flush_out: + remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: @@ -206,8 +231,9 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, /* find first dirty page index */ pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, + PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -272,8 +298,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -380,13 +405,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) { - f2fs_put_page(page, 1); - return; - } + if (unlikely(!PageUptodate(page) || + page->mapping != inode->i_mapping)) + goto out; + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); + +out: f2fs_put_page(page, 1); } @@ -645,6 +672,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; + f2fs_balance_fs(sbi); + ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; @@ -659,16 +688,19 @@ static int expand_inode_data(struct inode *inode, loff_t offset, off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - f2fs_lock_op(sbi); + if (index == pg_end && !off_end) + goto noalloc; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); - f2fs_unlock_op(sbi); if (ret) break; - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) @@ -683,8 +715,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); - f2fs_write_inode(inode, NULL); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b90dbe55403a..d7947d90ccc3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -186,7 +186,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int hint = 0; unsigned int secno; /* @@ -194,11 +193,9 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) * selected by background GC before. * Those segments guarantee they have small valid blocks. */ -next: - secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); - if (secno < TOTAL_SECS(sbi)) { + for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { if (sec_usage_check(sbi, secno)) - goto next; + continue; clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 6eb8d269b53b..948d17bf7281 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -69,12 +69,14 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; const char *p; __u32 in[8], buf[4]; + const char *name = name_info->name; + size_t len = name_info->len; if ((len <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1bba5228c197..5beeccef9ae1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -172,6 +172,7 @@ int f2fs_write_inline_data(struct inode *inode, stat_inc_inline_inode(inode); } + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); sync_inode_page(&dn); f2fs_put_dnode(&dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index adc622c6bdce..2c39999f3868 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -78,6 +78,7 @@ static int do_read_inode(struct inode *inode) if (check_nid_range(sbi, inode->i_ino)) { f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", (unsigned long) inode->i_ino); + WARN_ON(1); return -EINVAL; } @@ -266,13 +267,14 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) - goto no_delete; + goto out_clear; f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); @@ -294,6 +296,13 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: - clear_inode(inode); invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); +out_clear: + clear_inode(inode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9138c32aa698..27b03776ffd2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/dcache.h> #include "f2fs.h" #include "node.h" @@ -22,14 +23,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); nid_t ino; struct inode *inode; bool nid_free = false; int err; - inode = new_inode(sb); + inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -102,8 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; nid_t ino = 0; int err; @@ -146,8 +145,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int err; f2fs_balance_fs(sbi); @@ -207,8 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode = dentry->d_inode; struct f2fs_dir_entry *de; struct page *page; @@ -242,8 +239,7 @@ fail: static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; size_t symlen = strlen(symname) + 1; int err; @@ -330,8 +326,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) static int f2fs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; int err = 0; @@ -369,8 +364,7 @@ out: static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct super_block *sb = old_dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; @@ -393,8 +387,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - f2fs_lock_op(sbi); - if (new_inode) { err = -ENOTEMPTY; @@ -407,6 +399,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -417,9 +411,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); @@ -438,9 +429,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_lock_op(sbi); + err = f2fs_add_link(new_dentry, old_inode); - if (err) + if (err) { + f2fs_unlock_op(sbi); goto out_dir; + } if (old_dir_entry) { inc_nlink(new_dir); @@ -448,6 +443,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); @@ -457,9 +456,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); @@ -474,13 +470,159 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: - f2fs_put_page(new_page, 1); + f2fs_unlock_op(sbi); + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } +out_old: + kunmap(old_page); + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err = -ENOENT; + + f2fs_balance_fs(sbi); + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) + goto out_old; + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) + goto out_new; + } + + if (S_ISDIR(new_inode->i_mode)) { + err = -EIO; + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) + goto out_old_dir; + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_lock_op(sbi); + + err = update_dent_inode(old_inode, &new_dentry->d_name); + if (err) + goto out_unlock; + + err = update_dent_inode(new_inode, &old_dentry->d_name); + if (err) + goto out_undo; + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + + update_inode_page(old_inode); + + old_dir->i_ctime = CURRENT_TIME; + if (old_nlink) { + down_write(&F2FS_I(old_dir)->i_sem); + if (old_nlink < 0) + drop_nlink(old_dir); + else + inc_nlink(old_dir); + up_write(&F2FS_I(old_dir)->i_sem); + } + mark_inode_dirty(old_dir); + update_inode_page(old_dir); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + down_write(&F2FS_I(new_inode)->i_sem); + file_lost_pino(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + update_inode_page(new_inode); + + new_dir->i_ctime = CURRENT_TIME; + if (new_nlink) { + down_write(&F2FS_I(new_dir)->i_sem); + if (new_nlink < 0) + drop_nlink(new_dir); + else + inc_nlink(new_dir); + up_write(&F2FS_I(new_dir)->i_sem); + } + mark_inode_dirty(new_dir); + update_inode_page(new_dir); + f2fs_unlock_op(sbi); + return 0; +out_undo: + /* Still we may fail to recover name info of f2fs_inode here */ + update_dent_inode(old_inode, &old_dentry->d_name); +out_unlock: + f2fs_unlock_op(sbi); +out_new_dir: + if (new_dir_entry) { + kunmap(new_dir_page); + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + kunmap(old_dir_page); + f2fs_put_page(old_dir_page, 0); + } +out_new: + kunmap(new_page); + f2fs_put_page(new_page, 0); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); @@ -488,6 +630,71 @@ out: return err; } +static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct inode *inode; + int err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + add_orphan_inode(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; + +release_out: + release_orphan_inode(sbi); +out: + f2fs_unlock_op(sbi); + clear_nlink(inode); + unlock_new_inode(inode); + make_bad_inode(inode); + iput(inode); + alloc_nid_failed(sbi, inode->i_ino); + return err; +} + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, @@ -498,6 +705,8 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .rename2 = f2fs_rename2, + .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9dfb9a042fd2..d3d90d284631 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -25,6 +25,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; bool available_free_memory(struct f2fs_sb_info *sbi, int type) { @@ -42,6 +43,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); } @@ -88,12 +91,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_meta_page(sbi, src_off); - - /* Dirty src_page means that it is already the new target NAT page. */ - if (PageDirty(src_page)) - return src_page; - dst_page = grab_meta_page(sbi, dst_off); + f2fs_bug_on(PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -843,7 +842,7 @@ void remove_inode_page(struct inode *inode) truncate_node(&dn); } -struct page *new_inode_page(struct inode *inode, const struct qstr *name) +struct page *new_inode_page(struct inode *inode) { struct dnode_of_data dn; @@ -1232,12 +1231,12 @@ static int f2fs_write_node_page(struct page *page, if (wbc->for_reclaim) goto redirty_out; - mutex_lock(&sbi->node_write); + down_read(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - mutex_unlock(&sbi->node_write); + up_read(&sbi->node_write); unlock_page(page); return 0; @@ -1550,7 +1549,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } -static void recover_inline_xattr(struct inode *inode, struct page *page) +void recover_inline_xattr(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); void *src_addr, *dst_addr; @@ -1589,8 +1588,6 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) nid_t new_xnid = nid_of_node(page); struct node_info ni; - recover_inline_xattr(inode, page); - if (!f2fs_has_xattr_block(ofs_of_node(page))) return false; @@ -1742,7 +1739,90 @@ skip: return err; } -static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) +static struct nat_entry_set *grab_nat_entry_set(void) +{ + struct nat_entry_set *nes = + f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + nes->entry_cnt = 0; + INIT_LIST_HEAD(&nes->set_list); + INIT_LIST_HEAD(&nes->entry_list); + return nes; +} + +static void release_nat_entry_set(struct nat_entry_set *nes, + struct f2fs_nm_info *nm_i) +{ + f2fs_bug_on(!list_empty(&nes->entry_list)); + + nm_i->dirty_nat_cnt -= nes->entry_cnt; + list_del(&nes->set_list); + kmem_cache_free(nat_entry_set_slab, nes); +} + +static void adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head) +{ + struct nat_entry_set *next = nes; + + if (list_is_last(&nes->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (nes->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&nes->set_list, &next->set_list); +} + +static void add_nat_entry(struct nat_entry *ne, struct list_head *head) +{ + struct nat_entry_set *nes; + nid_t start_nid = START_NID(ne->ni.nid); + + list_for_each_entry(nes, head, set_list) { + if (nes->start_nid == start_nid) { + list_move_tail(&ne->list, &nes->entry_list); + nes->entry_cnt++; + adjust_nat_entry_set(nes, head); + return; + } + } + + nes = grab_nat_entry_set(); + + nes->start_nid = start_nid; + list_move_tail(&ne->list, &nes->entry_list); + nes->entry_cnt++; + list_add(&nes->set_list, head); +} + +static void merge_nats_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct list_head *dirty_list = &nm_i->dirty_nat_entries; + struct list_head *set_list = &nm_i->nat_entry_set; + struct nat_entry *ne, *tmp; + + write_lock(&nm_i->nat_tree_lock); + list_for_each_entry_safe(ne, tmp, dirty_list, list) { + if (nat_get_blkaddr(ne) == NEW_ADDR) + continue; + add_nat_entry(ne, set_list); + nm_i->dirty_nat_cnt++; + } + write_unlock(&nm_i->nat_tree_lock); +} + +static bool __has_cursum_space(struct f2fs_summary_block *sum, int size) +{ + if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES) + return true; + else + return false; +} + +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1750,12 +1830,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) int i; mutex_lock(&curseg->curseg_mutex); - - if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) { - mutex_unlock(&curseg->curseg_mutex); - return false; - } - for (i = 0; i < nats_in_cursum(sum); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; @@ -1765,23 +1839,21 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) retry: write_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne) { - __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - continue; - } + if (ne) + goto found; + ne = grab_nat_entry(nm_i, nid); if (!ne) { write_unlock(&nm_i->nat_tree_lock); goto retry; } node_info_from_raw_nat(&ne->ni, &raw_ne); +found: __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); - return true; } /* @@ -1792,80 +1864,91 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct nat_entry *ne, *cur; - struct page *page = NULL; - struct f2fs_nat_block *nat_blk = NULL; - nid_t start_nid = 0, end_nid = 0; - bool flushed; + struct nat_entry_set *nes, *tmp; + struct list_head *head = &nm_i->nat_entry_set; + bool to_journal = true; - flushed = flush_nats_in_journal(sbi); - - if (!flushed) - mutex_lock(&curseg->curseg_mutex); - - /* 1) flush dirty nat caches */ - list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { - nid_t nid; - struct f2fs_nat_entry raw_ne; - int offset = -1; - - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + /* merge nat entries of dirty list to nat entry set temporarily */ + merge_nats_in_set(sbi); - nid = nat_get_nid(ne); + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) { + remove_nats_in_journal(sbi); - if (flushed) - goto to_nat_page; + /* + * merge nat entries of dirty list to nat entry set temporarily + */ + merge_nats_in_set(sbi); + } - /* if there is room for nat enries in curseg->sumpage */ - offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); - if (offset >= 0) { - raw_ne = nat_in_journal(sum, offset); - goto flush_now; - } -to_nat_page: - if (!page || (start_nid > nid || nid > end_nid)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; - } - start_nid = START_NID(nid); - end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1; + if (!nm_i->dirty_nat_cnt) + return; - /* - * get nat block with dirty flag, increased reference - * count, mapped and lock - */ + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + list_for_each_entry_safe(nes, tmp, head, set_list) { + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page; + nid_t start_nid = nes->start_nid; + + if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) + to_journal = false; + + if (to_journal) { + mutex_lock(&curseg->curseg_mutex); + } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); + f2fs_bug_on(!nat_blk); } - f2fs_bug_on(!nat_blk); - raw_ne = nat_blk->entries[nid - start_nid]; -flush_now: - raw_nat_from_node_info(&raw_ne, &ne->ni); - - if (offset < 0) { - nat_blk->entries[nid - start_nid] = raw_ne; - } else { - nat_in_journal(sum, offset) = raw_ne; - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); - if (nat_get_blkaddr(ne) == NULL_ADDR && + if (nat_get_blkaddr(ne) == NULL_ADDR && add_free_nid(sbi, nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - write_lock(&nm_i->nat_tree_lock); - __clear_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); + write_lock(&nm_i->nat_tree_lock); + __del_from_nat_cache(nm_i, ne); + write_unlock(&nm_i->nat_tree_lock); + } else { + write_lock(&nm_i->nat_tree_lock); + __clear_nat_cache_dirty(nm_i, ne); + write_unlock(&nm_i->nat_tree_lock); + } } + + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); + + release_nat_entry_set(nes, nm_i); } - if (!flushed) - mutex_unlock(&curseg->curseg_mutex); - f2fs_put_page(page, 1); + + f2fs_bug_on(!list_empty(head)); + f2fs_bug_on(nm_i->dirty_nat_cnt); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1894,6 +1977,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); INIT_LIST_HEAD(&nm_i->dirty_nat_entries); + INIT_LIST_HEAD(&nm_i->nat_entry_set); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); @@ -1974,19 +2058,30 @@ int __init create_node_manager_caches(void) nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) - return -ENOMEM; + goto fail; free_nid_slab = f2fs_kmem_cache_create("free_nid", sizeof(struct free_nid)); - if (!free_nid_slab) { - kmem_cache_destroy(nat_entry_slab); - return -ENOMEM; - } + if (!free_nid_slab) + goto destory_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destory_free_nid; return 0; + +destory_free_nid: + kmem_cache_destroy(free_nid_slab); +destory_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; } void destroy_node_manager_caches(void) { + kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7281112cd1c8..8a116a407599 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -89,6 +89,13 @@ enum mem_type { DIRTY_DENTS /* indicates dirty dentry pages */ }; +struct nat_entry_set { + struct list_head set_list; /* link with all nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t start_nid; /* start nid of nats in set */ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + /* * For free nid mangement */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a112368a4a86..fe1c6d921ba2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -300,6 +300,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; + recover_inline_xattr(inode, page); + if (recover_inline_data(inode, page)) goto out; @@ -434,7 +436,9 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + block_t blkaddr; int err; bool need_writecp = false; @@ -447,6 +451,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ sbi->por_doing = true; + + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -462,8 +469,21 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + sbi->por_doing = false; - if (!err && need_writecp) + if (err) { + discard_next_dnode(sbi, blkaddr); + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) + sync_meta_pages(sbi, META, LONG_MAX); + } else if (need_writecp) { write_checkpoint(sbi, false); + } return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f25f0e07e26f..0dfeebae2a50 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -239,6 +239,12 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; struct flush_cmd cmd; + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + + if (test_opt(sbi, NOBARRIER)) + return 0; + if (!test_opt(sbi, FLUSH_MERGE)) return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); @@ -272,27 +278,27 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); + SM_I(sbi)->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; return err; } - sbi->sm_info->cmd_control_info = fcc; return err; } void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = - sbi->sm_info->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; if (fcc && fcc->f2fs_issue_flush) kthread_stop(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -376,11 +382,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi) +void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - if (f2fs_issue_discard(sbi, blkaddr, 1)) { struct page *page = grab_meta_page(sbi, blkaddr); /* zero-filled page */ @@ -436,17 +439,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; + unsigned int segno; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); - while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) - break; + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) __set_test_and_free(sbi, segno); - } mutex_unlock(&dirty_i->seglist_lock); } @@ -973,14 +971,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; - unsigned int old_cursegno; curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - old_cursegno = curseg->segno; /* * __add_sum_entry should be resided under the curseg_mutex @@ -1001,7 +997,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); mutex_unlock(&sit_i->sentry_lock); @@ -1531,7 +1526,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) struct page *page = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start = 0, end = 0; - unsigned int segno = -1; + unsigned int segno; bool flushed; mutex_lock(&curseg->curseg_mutex); @@ -1543,7 +1538,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) */ flushed = flush_sits_in_journal(sbi); - while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { + for_each_set_bit(segno, bitmap, nsegs) { struct seg_entry *se = get_seg_entry(sbi, segno); int sit_offset, offset; @@ -1702,7 +1697,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); if (!array) return -ENOMEM; @@ -1885,8 +1880,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7091204680f4..55973f7b0330 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -347,8 +347,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), - start_segno); + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; @@ -486,6 +486,10 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode)) return false; + /* this is only set during fdatasync */ + if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + return true; + switch (SM_I(sbi)->ipu_policy) { case F2FS_IPU_FORCE: return true; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b2b18637cb9e..657582fc7601 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -52,6 +52,7 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_flush_merge, + Opt_nobarrier, Opt_err, }; @@ -69,6 +70,7 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_flush_merge, "flush_merge"}, + {Opt_nobarrier, "nobarrier"}, {Opt_err, NULL}, }; @@ -339,6 +341,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -544,6 +549,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_data"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -615,7 +622,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; /* @@ -642,8 +649,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { destroy_flush_cmd_control(sbi); - } else if (test_opt(sbi, FLUSH_MERGE) && - !sbi->sm_info->cmd_control_info) { + } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -689,9 +695,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (unlikely(ino < F2FS_ROOT_INO(sbi))) - return ERR_PTR(-ESTALE); - if (unlikely(ino >= NM_I(sbi)->max_nid)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -949,7 +953,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - mutex_init(&sbi->node_write); + init_rwsem(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); @@ -999,7 +1003,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - init_orphan_info(sbi); + init_ino_entry_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); @@ -1036,8 +1040,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); err = -EINVAL; - goto free_root_inode; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -1084,7 +1089,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) diff --git a/fs/fcntl.c b/fs/fcntl.c index 72c82f69b01b..22d1c3df61ac 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/shmem_fs.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -336,6 +337,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, arg); break; + case F_ADD_SEALS: + case F_GET_SEALS: + err = shmem_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be568b7311d6..ef9bef118342 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -342,7 +342,8 @@ static void __inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); + __wait_on_bit(wqh, &wq, bit_wait, + TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } diff --git a/fs/fs_pin.c b/fs/fs_pin.c new file mode 100644 index 000000000000..9368236ca100 --- /dev/null +++ b/fs/fs_pin.c @@ -0,0 +1,78 @@ +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/fs_pin.h> +#include "internal.h" +#include "mount.h" + +static void pin_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct fs_pin, rcu)); +} + +static DEFINE_SPINLOCK(pin_lock); + +void pin_put(struct fs_pin *p) +{ + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, pin_free_rcu); +} + +void pin_remove(struct fs_pin *pin) +{ + spin_lock(&pin_lock); + hlist_del(&pin->m_list); + hlist_del(&pin->s_list); + spin_unlock(&pin_lock); +} + +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + spin_lock(&pin_lock); + hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&pin_lock); +} + +void mnt_pin_kill(struct mount *m) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(m->mnt_pins.first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, m_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} + +void sb_pin_kill(struct super_block *sb) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(sb->s_pins.first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, s_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index aec01be91b0a..89acec742e0b 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -160,7 +160,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, _enter("%p", cookie); wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; @@ -255,7 +255,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) if (!fscache_defer_lookup) { _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; @@ -463,7 +463,6 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) _enter("%p", cookie); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, - fscache_wait_bit_interruptible, TASK_UNINTERRUPTIBLE); _leave(""); @@ -525,7 +524,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) } wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index bc6c08fcfddd..7872a62ef30c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_bit(void *); -extern int fscache_wait_bit_interruptible(void *); extern int fscache_wait_atomic_t(atomic_t *); /* diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 63f868e869b9..b39d487ccfb0 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -struct ctl_table fscache_sysctls[] = { +static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = { {} }; -struct ctl_table fscache_sysctls_root[] = { +static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, @@ -197,24 +197,6 @@ static void __exit fscache_exit(void) module_exit(fscache_exit); /* - * wait_on_bit() sleep function for uninterruptible waiting - */ -int fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -int fscache_wait_bit_interruptible(void *flags) -{ - schedule(); - return signal_pending(current); -} - -/* * wait_on_atomic_t() sleep function for uninterruptible waiting */ int fscache_wait_atomic_t(atomic_t *p) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ed70714503fa..85332b9d19d1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -298,7 +298,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) jif = jiffies; if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); _leave(" = -ERESTARTSYS"); @@ -342,7 +341,6 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, if (stat_op_waits) fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { ret = fscache_cancel_op(op, do_cancel); if (ret == 0) @@ -351,7 +349,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, /* it's been removed from the pending queue by another party, * so we should get to run shortly */ wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 098f97bdcf1b..ca887314aba9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -643,9 +643,8 @@ struct fuse_copy_state { unsigned long seglen; unsigned long addr; struct page *pg; - void *mapaddr; - void *buf; unsigned len; + unsigned offset; unsigned move_pages:1; }; @@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - if (!cs->write) { - kunmap_atomic(cs->mapaddr); - } else { - kunmap_atomic(cs->mapaddr); + if (cs->write) buf->len = PAGE_SIZE - cs->len; - } cs->currbuf = NULL; - cs->mapaddr = NULL; - } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr); + } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); - cs->mapaddr = NULL; } + cs->pg = NULL; } /* @@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) */ static int fuse_copy_fill(struct fuse_copy_state *cs) { - unsigned long offset; + struct page *page; int err; unlock_request(cs->fc, cs->req); @@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = kmap_atomic(buf->page); + cs->pg = buf->page; + cs->offset = buf->offset; cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; cs->nr_segs--; } else { - struct page *page; - if (cs->nr_segs == cs->pipe->buffers) return -EIO; @@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page); - cs->buf = cs->mapaddr; + cs->pg = page; + cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; @@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + err = get_user_pages_fast(cs->addr, 1, cs->write, &page); if (err < 0) return err; BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->pg = page; + cs->offset = cs->addr % PAGE_SIZE; + cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); cs->seglen -= cs->len; cs->addr += cs->len; } @@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + if (cs->write) - memcpy(cs->buf, *val, ncpy); + memcpy(buf, *val, ncpy); else - memcpy(*val, cs->buf, ncpy); + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; - cs->buf += ncpy; + cs->offset += ncpy; return ncpy; } @@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = kmap_atomic(buf->page); - cs->buf = cs->mapaddr + buf->offset; + cs->pg = buf->page; + cs->offset = buf->offset; err = lock_request(cs->fc, cs->req); if (err) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 42198359fa1b..de1d84af9f7c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (fuse_dentry_time(entry) < get_jiffies_64()) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { int err; struct fuse_entry_out outarg; struct fuse_req *req; @@ -814,13 +815,6 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) -{ - return fuse_rename_common(olddir, oldent, newdir, newent, 0, - FUSE_RENAME, sizeof(struct fuse_rename_in)); -} - static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) @@ -831,17 +825,24 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; - if (fc->no_rename2 || fc->minor < 23) - return -EINVAL; + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, - FUSE_RENAME2, sizeof(struct fuse_rename2_in)); - if (err == -ENOSYS) { - fc->no_rename2 = 1; - err = -EINVAL; + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); } - return err; + return err; } static int fuse_link(struct dentry *entry, struct inode *newdir, @@ -985,7 +986,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, int err; bool r; - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { r = true; err = fuse_do_getattr(inode, stat, file); } else { @@ -1171,7 +1172,7 @@ static int fuse_permission(struct inode *inode, int mask) ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); @@ -2017,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = { .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, - .rename = fuse_rename, .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6e16dad13e9b..912061ac4baf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, while (nbytes < *nbytesp && req->num_pages < req->max_pages) { unsigned npages; size_t start; - unsigned n = req->max_pages - req->num_pages; ssize_t ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], - n * PAGE_SIZE, &start); + req->max_pages - req->num_pages, + &start); if (ret < 0) return ret; @@ -1687,7 +1687,7 @@ static int fuse_writepage_locked(struct page *page) error = -EIO; req->ff = fuse_write_file_get(fc, fi); if (!req->ff) - goto err_free; + goto err_nofile; fuse_write_fill(req, req->ff, page_offset(page), 0); @@ -1715,6 +1715,8 @@ static int fuse_writepage_locked(struct page *page) return 0; +err_nofile: + __free_page(tmp_page); err_free: fuse_request_free(req); err: @@ -1955,8 +1957,8 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kzalloc(sizeof(struct page *) * - FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) goto out; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 754dcf23de8a..03246cd9d47a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -478,6 +478,17 @@ static const match_table_t tokens = { {OPT_ERR, NULL} }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) { char *p; @@ -488,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) while ((p = strsep(&opt, ",")) != NULL) { int token; int value; + unsigned uv; substring_t args[MAX_OPT_ARGS]; if (!*p) continue; @@ -511,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_USER_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), value); + d->user_id = make_kuid(current_user_ns(), uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), value); + d->group_id = make_kgid(current_user_ns(), uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -895,9 +907,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->writeback_cache = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; - else - fc->sb->s_time_gran = 1000000000; - } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -926,7 +935,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | - FUSE_WRITEBACK_CACHE; + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1006,7 +1015,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); - if (!parse_fuse_opt((char *) data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev)) goto err; if (is_bdev) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4fc3a3046174..26b3f952e6b1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -981,7 +981,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); @@ -991,7 +991,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) goto out; flock_lock_file_wait(file, &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c355f7320e44..7f513b1ceb2c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -731,14 +731,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, cachep = gfs2_glock_aspace_cachep; else cachep = gfs2_glock_cachep; - gl = kmem_cache_alloc(cachep, GFP_KERNEL); + gl = kmem_cache_alloc(cachep, GFP_NOFS); if (!gl) return -ENOMEM; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; @@ -856,27 +856,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) } /** - * gfs2_glock_holder_wait - * @word: unused - * - * This function and gfs2_glock_demote_wait both show up in the WCHAN - * field. Thus I've separated these otherwise identical functions in - * order to be more informative to the user. - */ - -static int gfs2_glock_holder_wait(void *word) -{ - schedule(); - return 0; -} - -static int gfs2_glock_demote_wait(void *word) -{ - schedule(); - return 0; -} - -/** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * @@ -888,7 +867,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) unsigned long time1 = jiffies; might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ /* Lengthen the minimum hold time. */ gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + @@ -1128,7 +1107,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** @@ -1404,12 +1383,16 @@ __acquires(&lru_lock) gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); if (!spin_trylock(&gl->gl_spin)) { +add_back_to_lru: list_add(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); continue; } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_spin); + goto add_back_to_lru; + } clear_bit(GLF_LRU, &gl->gl_flags); - spin_unlock(&lru_lock); gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1417,7 +1400,7 @@ __acquires(&lru_lock) if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + cond_resched_lock(&lru_lock); } } @@ -1442,7 +1425,7 @@ static long gfs2_scan_glock_lru(int nr) gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); /* Test for being demotable */ - if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fc1100781bbc..2ffc67dce87f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -234,8 +234,8 @@ static void inode_go_sync(struct gfs2_glock *gl) * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: - * - * Normally we invlidate everything, but if we are moving into + * + * Normally we invalidate everything, but if we are moving into * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we * can keep hold of the metadata, since it won't have changed. * diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 91f274de1246..641383a9c1bb 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -936,12 +936,6 @@ fail: return error; } -static int dlm_recovery_wait(void *word) -{ - schedule(); - return 0; -} - static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -976,7 +970,7 @@ restart: fs_info(sdp, "control_first_done wait gen %u\n", start_gen); wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, - dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } @@ -1036,8 +1030,8 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, new_size = old_size + RECOVER_SIZE_INC; - submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); - result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); if (!submit || !result) { kfree(submit); kfree(result); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index bc564c0d6d16..d3eae244076e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1024,20 +1024,13 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } -static int gfs2_journalid_wait(void *word) -{ - if (signal_pending(current)) - return -EINTR; - schedule(); - return 0; -} - static int wait_on_journal(struct gfs2_sbd *sdp) { if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) return 0; - return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE) + ? -EINTR : 0; } void gfs2_online_uevent(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 94555d4c5698..573bd3b758fa 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -591,12 +591,6 @@ done: wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -static int gfs2_recovery_wait(void *word) -{ - schedule(); - return 0; -} - int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; @@ -609,7 +603,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) BUG_ON(!rv); if (wait) - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, TASK_UNINTERRUPTIBLE); return wait ? jd->jd_recover_error : 0; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index db629d1bd1bd..f4cb9c0d6bbd 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -337,7 +337,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le /** * gfs2_free_extlen - Return extent length of free blocks - * @rbm: Starting position + * @rrbm: Starting position * @len: Max length to check * * Starting at the block specified by the rbm, see how many free blocks @@ -2522,7 +2522,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) /** * gfs2_rlist_free - free a resource group list - * @list: the list of resource groups + * @rlist: the list of resource groups * */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1319b5c4ec68..2607ff13d486 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -864,12 +864,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -894,7 +888,7 @@ restart: continue; spin_unlock(&sdp->sd_jindex_spin); wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } spin_unlock(&sdp->sd_jindex_spin); diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 9c88da0e855a..4fcd40d6f308 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major, extern int link_file(const char *from, const char *to); extern int hostfs_do_readlink(char *file, char *buf, int size); extern int rename_file(char *from, char *to); +extern int rename2_file(char *from, char *to, unsigned int flags); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index bb529f3b7f2b..fd62cae0fdcb 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { - char *from_name, *to_name; + char *old_name, *new_name; int err; - if ((from_name = dentry_name(from)) == NULL) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + old_name = dentry_name(old_dentry); + if (old_name == NULL) return -ENOMEM; - if ((to_name = dentry_name(to)) == NULL) { - __putname(from_name); + new_name = dentry_name(new_dentry); + if (new_name == NULL) { + __putname(old_name); return -ENOMEM; } - err = rename_file(from_name, to_name); - __putname(from_name); - __putname(to_name); + if (!flags) + err = rename_file(old_name, new_name); + else + err = rename2_file(old_name, new_name, flags); + + __putname(old_name); + __putname(new_name); return err; } @@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = { .mkdir = hostfs_mkdir, .rmdir = hostfs_rmdir, .mknod = hostfs_mknod, - .rename = hostfs_rename, + .rename2 = hostfs_rename2, .permission = hostfs_permission, .setattr = hostfs_setattr, }; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 67838f3aa20a..9765dab95cbd 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -14,6 +14,7 @@ #include <sys/time.h> #include <sys/types.h> #include <sys/vfs.h> +#include <sys/syscall.h> #include "hostfs.h" #include <utime.h> @@ -360,6 +361,33 @@ int rename_file(char *from, char *to) return 0; } +int rename2_file(char *from, char *to, unsigned int flags) +{ + int err; + +#ifndef SYS_renameat2 +# ifdef __x86_64__ +# define SYS_renameat2 316 +# endif +# ifdef __i386__ +# define SYS_renameat2 353 +# endif +#endif + +#ifdef SYS_renameat2 + err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags); + if (err < 0) { + if (errno != ENOSYS) + return -errno; + else + return -EINVAL; + } + return 0; +#else + return -EINVAL; +#endif +} + int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index f36fc010fccb..2923a7bd82ac 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -545,12 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) struct dnode *d1; struct quad_buffer_head qbh1; if (hpfs_sb(i->i_sb)->sb_chk) - if (up != i->i_ino) { - hpfs_error(i->i_sb, - "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", - dno, up, (unsigned long)i->i_ino); - return; - } + if (up != i->i_ino) { + hpfs_error(i->i_sb, + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + dno, up, + (unsigned long)i->i_ino); + return; + } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { d1->up = cpu_to_le32(up); d1->root_dnode = 1; @@ -1061,8 +1062,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, hpfs_brelse4(qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { - kfree(name2); - return NULL; + kfree(name2); + return NULL; } goto go_down; } diff --git a/fs/inode.c b/fs/inode.c index 6eecb7ff0b9a..26753ba7b6d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; @@ -1695,13 +1696,6 @@ int inode_needs_sync(struct inode *inode) } EXPORT_SYMBOL(inode_needs_sync); -int inode_wait(void *word) -{ - schedule(); - return 0; -} -EXPORT_SYMBOL(inode_wait); - /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its diff --git a/fs/internal.h b/fs/internal.h index 465742407466..e325b4f9c799 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, /* * read_write.c */ -extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* @@ -144,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, * pipe.c */ extern const struct file_operations pipefifo_fops; + +/* + * fs_pin.c + */ +extern void sb_pin_kill(struct super_block *sb); +extern void mnt_pin_kill(struct mount *m); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 592e5115a561..f311bf084015 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -158,8 +158,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, "zisofs: zisofs_inflate returned" " %d, inode = %lu," " page idx = %d, bh idx = %d," - " avail_in = %d," - " avail_out = %d\n", + " avail_in = %ld," + " avail_out = %ld\n", zerr, inode->i_ino, curpage, curbh, stream.avail_in, stream.avail_out); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 38cfcf5f6fce..5f09370c90a8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -763,12 +763,6 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } -static int sleep_on_shadow_bh(void *word) -{ - io_schedule(); - return 0; -} - /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -906,8 +900,8 @@ repeat: if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); - wait_on_bit(&bh->b_state, BH_Shadow, - sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Shadow, + TASK_UNINTERRUPTIBLE); goto repeat; } @@ -1588,9 +1582,12 @@ int jbd2_journal_stop(handle_t *handle) * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 009ec0b5993d..2f7a3c090489 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -202,8 +202,7 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) } else { acl = ERR_PTR(rc); } - if (value) - kfree(value); + kfree(value); if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); return acl; diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 0b9a1e44e833..5698dae5d92d 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in, while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); - def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); - jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in = min_t(unsigned long, + (*sourcelen-def_strm.total_in), def_strm.avail_out); + jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n", def_strm.avail_in, def_strm.avail_out); ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); - jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n", def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out); if (ret != Z_OK) { diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index ad0f2e2a1700..d72817ac51f6 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -756,8 +756,7 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) for (i=0; i < XATTRINDEX_HASHSIZE; i++) { list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { list_del(&xd->xindex); - if (xd->xname) - kfree(xd->xname); + kfree(xd->xname); jffs2_free_xattr_datum(xd); } } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e3d37f607f97..4429d6d9217f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -39,6 +39,19 @@ struct kernfs_open_node { struct list_head files; /* goes through kernfs_open_file.list */ }; +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -783,24 +796,25 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } -/** - * kernfs_notify - notify a kernfs file - * @kn: file to notify - * - * Notify @kn such that poll(2) on @kn wakes up. - */ -void kernfs_notify(struct kernfs_node *kn) +static void kernfs_notify_workfn(struct work_struct *work) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_node *kn; struct kernfs_open_node *on; struct kernfs_super_info *info; - unsigned long flags; - - if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); /* kick poll */ - spin_lock_irqsave(&kernfs_open_node_lock, flags); + spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) { @@ -808,12 +822,12 @@ void kernfs_notify(struct kernfs_node *kn) wake_up_interruptible(&on->poll); } - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + spin_unlock_irq(&kernfs_open_node_lock); /* kick fsnotify */ mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct inode *inode; struct dentry *dentry; @@ -833,6 +847,33 @@ void kernfs_notify(struct kernfs_node *kn) } mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); + unsigned long flags; + + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; + + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); + } + spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); @@ -855,7 +896,7 @@ const struct file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file - * @static_name: don't copy file name + * @name_is_static: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98a6cdd..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated to this kernfs_root, or + * -EINVAL if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1812f026960c..daa8e7514eae 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -306,11 +306,9 @@ static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) static void nsm_init_private(struct nsm_handle *nsm) { u64 *p = (u64 *)&nsm->sm_priv.data; - struct timespec ts; s64 ns; - ktime_get_ts(&ts); - ns = timespec_to_ns(&ts); + ns = ktime_get_ns(); put_unaligned(ns, p); put_unaligned((unsigned long)nsm, p + 1); } diff --git a/fs/locks.c b/fs/locks.c index 717fbc404e6b..a6f54802d277 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -325,7 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return -ENOMEM; fl->fl_file = filp; - fl->fl_owner = (fl_owner_t)filp; + fl->fl_owner = filp; fl->fl_pid = current->tgid; fl->fl_flags = FL_FLOCK; fl->fl_type = type; @@ -431,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) if (assign_type(fl, type) != 0) return -EINVAL; - fl->fl_owner = (fl_owner_t)current->files; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(posix_lock_file_wait); int locks_mandatory_locked(struct file *file) { struct inode *inode = file_inode(file); - fl_owner_t owner = current->files; struct file_lock *fl; /* @@ -1165,7 +1164,8 @@ int locks_mandatory_locked(struct file *file) for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; - if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file) + if (fl->fl_owner != current->files && + fl->fl_owner != file) break; } spin_unlock(&inode->i_lock); @@ -1205,7 +1205,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, for (;;) { if (filp) { - fl.fl_owner = (fl_owner_t)filp; + fl.fl_owner = filp; fl.fl_flags &= ~FL_SLEEP; error = __posix_lock_file(inode, &fl, NULL); if (!error) @@ -1948,7 +1948,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) cmd = F_GETLK; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2103,7 +2103,7 @@ again: cmd = F_SETLK; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2112,7 +2112,7 @@ again: cmd = F_SETLKW; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW: file_lock->fl_flags |= FL_SLEEP; @@ -2170,7 +2170,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) cmd = F_GETLK64; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2242,7 +2242,7 @@ again: cmd = F_SETLK64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2251,7 +2251,7 @@ again: cmd = F_SETLKW64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW64: file_lock->fl_flags |= FL_SLEEP; @@ -2324,11 +2324,11 @@ void locks_remove_file(struct file *filp) if (!inode->i_flock) return; - locks_remove_posix(filp, (fl_owner_t)filp); + locks_remove_posix(filp, filp); if (filp->f_op->flock) { struct file_lock fl = { - .fl_owner = (fl_owner_t)filp, + .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, .fl_flags = FL_FLOCK, diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 48140315f627..380d86e1ab45 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) /** * logfs_is_valid_block - check whether this block is still valid * - * @sb - superblock - * @ofs - block physical offset - * @ino - block inode number - * @bix - block index - * @level - block level + * @sb: superblock + * @ofs: block physical offset + * @ino: block inode number + * @bix: block index + * @gc_level: block level * * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will * become invalid once the journal is written. @@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block) * * @inode: parent inode (ifile or directory) * @buf: object to write (inode or dentry) - * @n: object size - * @_pos: object number (file position in blocks/objects) + * @count: object size + * @bix: block index * @flags: write flags - * @lock: 0 if write lock is already taken, 1 otherwise * @shadow_tree: shadow below this inode * * FIXME: All caller of this put a 200-300 byte variable on the stack, diff --git a/fs/mbcache.c b/fs/mbcache.c index bf166e388f0d..187477ded6b3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -73,6 +73,7 @@ #include <linux/mbcache.h> #include <linux/init.h> #include <linux/blockgroup_lock.h> +#include <linux/log2.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -93,7 +94,7 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) -#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 4bc50dac8e97..742942a983be 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -96,7 +96,7 @@ int minix_new_block(struct inode * inode) unsigned long minix_count_free_blocks(struct super_block *sb) { struct minix_sb_info *sbi = minix_sb(sb); - u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); + u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1; return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f007a3355570..3f57af196a7d 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -267,12 +267,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); if (sbi->s_imap_blocks < block) { printk("MINIX-fs: file system does not have enough " - "imap blocks allocated. Refusing to mount\n"); + "imap blocks allocated. Refusing to mount.\n"); goto out_no_bitmap; } block = minix_blocks_needed( - (sbi->s_nzones - (sbi->s_firstdatazone + 1)), + (sbi->s_nzones - sbi->s_firstdatazone + 1), s->s_blocksize); if (sbi->s_zmap_blocks < block) { printk("MINIX-fs: file system does not have enough " diff --git a/fs/mount.h b/fs/mount.h index d55297f2fa05..6740a6215529 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -55,7 +55,7 @@ struct mount { int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ - int mnt_pinned; + struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namei.c b/fs/namei.c index 985c6f368485..a996bb48dfab 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline bool managed_dentry_might_block(struct dentry *dentry) +static inline int managed_dentry_rcu(struct dentry *dentry) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && - dentry->d_op->d_manage(dentry, true) < 0); + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + dentry->d_op->d_manage(dentry, true) : 0; } /* @@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - if (unlikely(managed_dentry_might_block(path->dentry))) + switch (managed_dentry_rcu(path->dentry)) { + case -ECHILD: + default: return false; + case -EISDIR: + return true; + case 0: + break; + } if (!d_mountpoint(path->dentry)) - return true; + return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) @@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return read_seqretry(&mount_lock, nd->m_seq); + return read_seqretry(&mount_lock, nd->m_seq) && + !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } static int follow_dotdot_rcu(struct nameidata *nd) @@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd, } path->mnt = mnt; path->dentry = dentry; - if (unlikely(!__follow_mount_rcu(nd, path, inode))) - goto unlazy; - if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - goto unlazy; - return 0; + if (likely(__follow_mount_rcu(nd, path, inode))) + return 0; unlazy: if (unlazy_walk(nd, dentry)) return -ECHILD; @@ -2256,9 +2261,10 @@ done: goto out; } path->dentry = dentry; - path->mnt = mntget(nd->path.mnt); + path->mnt = nd->path.mnt; if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) return 1; + mntget(path->mnt); follow_mount(path); error = 0; out: @@ -4018,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: - * a) we can get into loop creation. Check is done in is_subdir(). + * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another @@ -4074,7 +4080,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op->rename) + if (!old_dir->i_op->rename && !old_dir->i_op->rename2) return -EPERM; if (flags && !old_dir->i_op->rename2) @@ -4133,10 +4139,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - if (!flags) { + if (!old_dir->i_op->rename2) { error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); } else { + WARN_ON(old_dir->i_op->rename != NULL); error = old_dir->i_op->rename2(old_dir, old_dentry, new_dir, new_dentry, flags); } diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd887..a01c7730e9af 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -16,7 +16,6 @@ #include <linux/namei.h> #include <linux/security.h> #include <linux/idr.h> -#include <linux/acct.h> /* acct_auto_close_mnt */ #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ @@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt, list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } +static void attach_shadowed(struct mount *mnt, + struct mount *parent, + struct mount *shadows) +{ + if (shadows) { + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); + list_add(&mnt->mnt_child, &shadows->mnt_child); + } else { + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } +} + /* * vfsmount lock must be held for write */ @@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); - if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); - else - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -890,8 +898,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ - if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + if (flag & CL_UNPRIVILEGED) { + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; + + if (mnt->mnt.mnt_flags & MNT_READONLY) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + if (mnt->mnt.mnt_flags & MNT_NODEV) + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; + + if (mnt->mnt.mnt_flags & MNT_NOSUID) + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; + + if (mnt->mnt.mnt_flags & MNT_NOEXEC) + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; + } /* Don't allow unprivileged users to reveal what is under a mount */ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) @@ -938,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, static void mntput_no_expire(struct mount *mnt) { -put_again: rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@ -951,14 +971,6 @@ put_again: unlock_mount_hash(); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - rcu_read_unlock(); - unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt); - goto put_again; - } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); @@ -981,6 +993,8 @@ put_again: * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); @@ -1008,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mnt_pin(struct vfsmount *mnt) -{ - lock_mount_hash(); - real_mount(mnt)->mnt_pinned++; - unlock_mount_hash(); -} -EXPORT_SYMBOL(mnt_pin); - -void mnt_unpin(struct vfsmount *m) +struct vfsmount *mnt_clone_internal(struct path *path) { - struct mount *mnt = real_mount(m); - lock_mount_hash(); - if (mnt->mnt_pinned) { - mnt_add_count(mnt, 1); - mnt->mnt_pinned--; - } - unlock_mount_hash(); + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; } -EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) { @@ -1492,6 +1496,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { + struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1513,7 +1518,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + mnt_set_mountpoint(parent, p->mnt_mp, q); + if (!list_empty(&parent->mnt_mounts)) { + t = list_last_entry(&parent->mnt_mounts, + struct mount, mnt_child); + if (t->mnt_mp != p->mnt_mp) + t = NULL; + } + attach_shadowed(q, parent, t); unlock_mount_hash(); } } @@ -1896,9 +1908,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; - if (mnt->mnt_flags & MNT_LOCK_READONLY) - return -EPERM; - if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1924,6 +1933,33 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + /* Don't allow changing of locked mnt flags. + * + * No locks need to be held here while testing the various + * MNT_LOCK flags because those flags can never be cleared + * once they are set. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { + return -EPERM; + } + err = security_sb_remount(sb, data); if (err) return err; @@ -1937,7 +1973,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { lock_mount_hash(); - mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); unlock_mount_hash(); @@ -2122,7 +2158,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, */ if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { flags |= MS_NODEV; - mnt_flags |= MNT_NODEV; + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } } @@ -2436,6 +2472,14 @@ long do_mount(const char *dev_name, const char *dir_name, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + } + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); @@ -2972,13 +3016,13 @@ static void *mntns_get(struct task_struct *task) struct mnt_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->mnt_ns; get_mnt_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1d09289c8f0e..180d1ec9c32e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1205,7 +1205,7 @@ static const struct file_operations nfs_server_list_fops = { .open = nfs_server_list_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, .owner = THIS_MODULE, }; @@ -1226,7 +1226,7 @@ static const struct file_operations nfs_volume_list_fops = { .open = nfs_volume_list_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, .owner = THIS_MODULE, }; @@ -1236,19 +1236,8 @@ static const struct file_operations nfs_volume_list_fops = { */ static int nfs_server_list_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; - struct net *net = pid_ns->child_reaper->nsproxy->net_ns; - - ret = seq_open(file, &nfs_server_list_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = net; - - return 0; + return seq_open_net(inode, file, &nfs_server_list_ops, + sizeof(struct seq_net_private)); } /* @@ -1256,7 +1245,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) { - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); @@ -1268,7 +1257,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } @@ -1278,7 +1267,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) */ static void nfs_server_list_stop(struct seq_file *p, void *v) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } @@ -1289,7 +1278,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v) static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { @@ -1321,19 +1310,8 @@ static int nfs_server_list_show(struct seq_file *m, void *v) */ static int nfs_volume_list_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; - struct net *net = pid_ns->child_reaper->nsproxy->net_ns; - - ret = seq_open(file, &nfs_volume_list_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = net; - - return 0; + return seq_open_net(inode, file, &nfs_server_list_ops, + sizeof(struct seq_net_private)); } /* @@ -1341,7 +1319,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) { - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); @@ -1353,7 +1331,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } @@ -1363,7 +1341,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) */ static void nfs_volume_list_stop(struct seq_file *p, void *v) { - struct nfs_net *nn = net_generic(p->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } @@ -1376,7 +1354,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) struct nfs_server *server; struct nfs_client *clp; char dev[8], fsid[17]; - struct nfs_net *nn = net_generic(m->private, nfs_net_id); + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { @@ -1407,6 +1385,45 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) return 0; } +int nfs_fs_proc_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct proc_dir_entry *p; + + nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); + if (!nn->proc_nfsfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_create("servers", S_IFREG|S_IRUGO, + nn->proc_nfsfs, &nfs_server_list_fops); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_create("volumes", S_IFREG|S_IRUGO, + nn->proc_nfsfs, &nfs_volume_list_fops); + if (!p) + goto error_2; + return 0; + +error_2: + remove_proc_entry("servers", nn->proc_nfsfs); +error_1: + remove_proc_entry("fs/nfsfs", NULL); +error_0: + return -ENOMEM; +} + +void nfs_fs_proc_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + remove_proc_entry("volumes", nn->proc_nfsfs); + remove_proc_entry("servers", nn->proc_nfsfs); + remove_proc_entry("fs/nfsfs", NULL); +} + /* * initialise the /proc/fs/nfsfs/ directory */ @@ -1419,14 +1436,12 @@ int __init nfs_fs_proc_init(void) goto error_0; /* a file of servers with which we're dealing */ - p = proc_create("servers", S_IFREG|S_IRUGO, - proc_fs_nfs, &nfs_server_list_fops); + p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); if (!p) goto error_1; /* a file of volumes that we have mounted */ - p = proc_create("volumes", S_IFREG|S_IRUGO, - proc_fs_nfs, &nfs_volume_list_fops); + p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); if (!p) goto error_2; return 0; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8f98138cbc43..f11b9eed0de1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -756,7 +756,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { - bool do_destroy = true; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); @@ -765,7 +764,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) case NFS_IOHDR_NEED_COMMIT: kref_get(&req->wb_kref); nfs_mark_request_commit(req, hdr->lseg, &cinfo); - do_destroy = false; } nfs_unlock_and_release_request(req); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4042ff58fe3f..524dd80d1898 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -361,8 +361,8 @@ start: * Prevent starvation issues if someone is doing a consistency * sync-to-disk */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 44bf0140a4c7..e2a0361e24c6 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -783,8 +783,8 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - nfs_wait_bit_killable, TASK_KILLABLE); + wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b94f80420a58..880618a8b048 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - ret = d_obtain_alias(inode); + ret = d_obtain_root(inode); if (IS_ERR(ret)) { dprintk("nfs_get_root: get root dentry failed\n"); goto out; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 567983d2c0eb..7dd55b745c4d 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -174,7 +174,9 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) static struct key_type key_type_id_resolver = { .name = "id_resolver", - .instantiate = user_instantiate, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, .match = user_match, .revoke = user_revoke, .destroy = user_destroy, @@ -282,6 +284,8 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, desc, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } + if (!IS_ERR(rkey)) + set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); kfree(desc); return rkey; @@ -394,7 +398,9 @@ static const struct rpc_pipe_ops idmap_upcall_ops = { static struct key_type key_type_id_resolver_legacy = { .name = "id_legacy", - .instantiate = user_instantiate, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, .match = user_match, .revoke = user_revoke, .destroy = user_destroy, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c496f8a74639..68921b01b792 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,7 +75,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(void *word) +int nfs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping) return ret; } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity |= flags; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); +} + /* * Invalidate the local caches */ @@ -162,17 +173,16 @@ static void nfs_zap_caches_locked(struct inode *inode) memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_fscache_invalidate(inode); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); } else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } @@ -187,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } @@ -209,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); @@ -369,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -415,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -550,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) spin_lock(&inode->i_lock); i_size_write(inode, offset); + /* Optimisation */ + if (offset == 0) + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -578,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL); spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { @@ -1061,8 +1074,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) * the bit lock here if it looks like we're going to be doing that. */ for (;;) { - ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) goto out; spin_lock(&inode->i_lock); @@ -1101,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && inode->i_version == fattr->pre_change_attr) { inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ @@ -1117,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) @@ -1128,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr ret |= NFS_INO_INVALID_ATTR; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); - return ret; } @@ -1189,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfsi->cache_validity |= invalid; + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1402,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); - } + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); @@ -1601,6 +1609,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1702,10 +1711,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid &= ~NFS_INO_INVALID_DATA; if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || (save_cache_validity & NFS_INO_REVAL_FORCED)) - nfsi->cache_validity |= invalid; - - if (invalid & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, invalid); return 0; out_err: @@ -1834,11 +1840,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { nfs_clients_init(net); - return 0; + return nfs_fs_proc_net_init(net); } static void nfs_net_exit(struct net *net) { + nfs_fs_proc_net_exit(net); nfs_cleanup_cb_ident_idr(net); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 82ddbf46660e..e2a45ae5014e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); +extern int nfs_fs_proc_net_init(struct net *net); +extern void nfs_fs_proc_net_exit(struct net *net); #else +static inline int nfs_fs_proc_net_init(struct net *net) +{ + return 0; +} +static inline void nfs_fs_proc_net_exit(struct net *net) +{ +} static inline int nfs_fs_proc_init(void) { return 0; @@ -244,6 +253,7 @@ void nfs_pgio_data_release(struct nfs_pgio_data *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); static inline void nfs_iocounter_init(struct nfs_io_counter *c) { @@ -347,7 +357,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(void *word); +extern int nfs_wait_bit_killable(struct wait_bit_key *key); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 8ee1fab83268..ef221fb8a183 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -29,6 +29,9 @@ struct nfs_net { #endif spinlock_t nfs_client_lock; struct timespec boot_time; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nfsfs; +#endif }; extern int nfs_net_id; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 871d6eda8dba..8f854dde4150 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -247,3 +247,46 @@ const struct xattr_handler *nfs3_xattr_handlers[] = { &posix_acl_default_xattr_handler, NULL, }; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) +{ + struct posix_acl *acl; + char *p = data + *result; + + acl = get_acl(inode, type); + if (!acl) + return 0; + + posix_acl_release(acl); + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e7daa42bbc86..f0afa291fd58 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -885,7 +885,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, @@ -899,7 +899,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index f63cb87cd730..ba2affa51941 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -230,7 +230,7 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3d5dbf80d46a..3d83cb1fdc70 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -139,16 +139,22 @@ static size_t nfs_parse_server_name(char *string, size_t len, * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * - * Return the pseudoflavor of the first security mechanism in - * "flavors" that is locally supported. Return RPC_AUTH_UNIX if - * no matching flavor is found in the array. The "flavors" array + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported. The "flavors" array * is searched in the order returned from the server, per RFC 3530 - * recommendation. + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + * */ -static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, + struct nfs_server *server, struct nfs4_secinfo_flavors *flavors) { - rpc_authflavor_t pseudoflavor; + rpc_authflavor_t pflavor; struct nfs4_secinfo4 *secinfo; unsigned int i; @@ -159,62 +165,73 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, case RPC_AUTH_NULL: case RPC_AUTH_UNIX: case RPC_AUTH_GSS: - pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, &secinfo->flavor_info); - /* make sure pseudoflavor matches sec= mount opt */ - if (pseudoflavor != RPC_AUTH_MAXFLAVOR && - nfs_auth_info_match(&server->auth_info, - pseudoflavor)) - return pseudoflavor; - break; + /* does the pseudoflavor match a sec= mount opt? */ + if (pflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, pflavor)) { + struct rpc_clnt *new; + struct rpc_cred *cred; + + /* Cloning creates an rpc_auth for the flavor */ + new = rpc_clone_client_set_auth(clnt, pflavor); + if (IS_ERR(new)) + continue; + /** + * Check that the user actually can use the + * flavor. This is mostly for RPC_AUTH_GSS + * where cr_init obtains a gss context + */ + cred = rpcauth_lookupcred(new->cl_auth, 0); + if (IS_ERR(cred)) { + rpc_shutdown_client(new); + continue; + } + put_rpccred(cred); + return new; + } } } - - /* if there were any sec= options then nothing matched */ - if (server->auth_info.flavor_len > 0) - return -EPERM; - - return RPC_AUTH_UNIX; + return ERR_PTR(-EPERM); } -static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, + struct qstr *name) { struct page *page; struct nfs4_secinfo_flavors *flavors; - rpc_authflavor_t flavor; + struct rpc_clnt *new; int err; page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + flavors = page_address(page); err = nfs4_proc_secinfo(inode, name, flavors); if (err < 0) { - flavor = err; + new = ERR_PTR(err); goto out; } - flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors); + new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); out: put_page(page); - return flavor; -} - -/* - * Please call rpc_shutdown_client() when you are done with this client. - */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, - struct qstr *name) -{ - rpc_authflavor_t flavor; - - flavor = nfs4_negotiate_security(inode, name); - if ((int)flavor < 0) - return ERR_PTR((int)flavor); - - return rpc_clone_client_set_auth(clnt, flavor); + return new; } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, @@ -397,11 +414,6 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else { - rpc_authflavor_t new = nfs4_negotiate_security(dir, name); - if ((int)new >= 0) - flavor = new; - } mnt = nfs_do_submount(dentry, fh, fattr, flavor); out: rpc_shutdown_client(client); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 285ad5334018..4bf3d97cc5a0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3247,7 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - client = nfs4_create_sec_client(client, dir, name); + client = nfs4_negotiate_security(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 848f6853c59e..42f121182167 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1251,8 +1251,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); atomic_inc(&clp->cl_count); - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); if (res) goto out; if (clp->cl_cons_state < 0) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b6ee3a6ee96d..0be5050638f7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,8 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static void nfs_free_request(struct nfs_page *); - static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; @@ -117,7 +115,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&c->flags); + ret = nfs_wait_bit_killable(&q.key); } while (atomic_read(&c->io_count) != 0); finish_wait(wq, &q.wait); return ret; @@ -138,12 +136,6 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; -} - /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -158,7 +150,6 @@ nfs_page_group_lock(struct nfs_page *req) WARN_ON_ONCE(head != head->wb_head); wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - nfs_wait_bit_uninterruptible, TASK_UNINTERRUPTIBLE); } @@ -239,20 +230,28 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) WARN_ON_ONCE(prev == req); if (!prev) { + /* a head request */ req->wb_head = req; req->wb_this_page = req; } else { + /* a subrequest */ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); req->wb_head = prev->wb_head; req->wb_this_page = prev->wb_this_page; prev->wb_this_page = req; + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + /* grab extra ref if head request has extra ref from * the write/commit path to handle handoff between write * and commit lists */ - if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); + } } } @@ -269,6 +268,10 @@ nfs_page_group_destroy(struct kref *kref) struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); struct nfs_page *tmp, *next; + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) return; @@ -394,7 +397,7 @@ static void nfs_clear_request(struct nfs_page *req) * * Note: Should never be called with the spinlock held! */ -static void nfs_free_request(struct nfs_page *req) +void nfs_free_request(struct nfs_page *req) { WARN_ON_ONCE(req->wb_this_page != req); @@ -425,9 +428,8 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - return wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_uninterruptible, - TASK_UNINTERRUPTIBLE); + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); } /* @@ -925,7 +927,6 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; - desc->pg_moreio = 0; if (desc->pg_recoalesce) return 0; /* retry add_request for this subreq */ @@ -972,6 +973,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) desc->pg_count = 0; desc->pg_base = 0; desc->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { struct nfs_page *req; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6fdcd233d6f7..a8914b335617 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1885,7 +1885,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { if (!sync) goto out; - status = wait_on_bit_lock(&nfsi->flags, + status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3ee5af4e738e..962c9ee758be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -46,6 +46,7 @@ static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -91,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */ static struct nfs_page * -nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; @@ -104,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) /* Linearly search the commit list for the correct req */ list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { if (freq->wb_page == page) { - req = freq; + req = freq->wb_head; break; } } } - if (req) + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } return req; } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -274,36 +290,246 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. + */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) +{ + struct nfs_page *tmp; + int ret; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + nfs_clear_request_commit(subreq); + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } + } +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; int ret; +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + spin_lock(&inode->i_lock); - for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); - if (req == NULL) - break; - if (nfs_lock_request(req)) - break; - /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_lock_request() will always - * succeed provided that someone hasn't already marked the - * request as dirty (in which case we don't care). - */ + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { spin_unlock(&inode->i_lock); - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - if (ret != 0) + return NULL; + } + + /* lock each request in the page group */ + nfs_page_group_lock(head); + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order. If not, it's a programming error. + */ + WARN_ON_ONCE(subreq->wb_offset != + (head->wb_offset + total_bytes)); + + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + return ERR_PTR(ret); - spin_lock(&inode->i_lock); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. + * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_list_remove_request(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. + */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clear_request_commit the head req and clean up + * requests on destroy list */ spin_unlock(&inode->i_lock); - return req; + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* clean up commit list state */ + nfs_clear_request_commit(head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; } /* @@ -316,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page, nonblock); + req = nfs_lock_and_join_requests(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -397,7 +623,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int err; /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, nfs_wait_bit_killable, TASK_KILLABLE); if (err) goto out_err; @@ -448,7 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; - set_bit(PG_INODE_REF, &req->wb_flags); + /* this a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -474,7 +702,9 @@ static void nfs_inode_remove_request(struct nfs_page *req) nfsi->npages--; spin_unlock(&inode->i_lock); } - nfs_release_request(req); + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); } static void @@ -638,7 +868,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; unsigned long bytes = 0; - bool do_destroy; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -668,7 +897,6 @@ remove_req: next: nfs_unlock_request(req); nfs_end_page_writeback(req); - do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); nfs_release_request(req); } out: @@ -769,7 +997,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; @@ -877,7 +1105,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) return 0; l_ctx = req->wb_lock_context; @@ -934,12 +1162,14 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) return false; smp_rmb(); if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return false; return PageUptodate(page) != 0; } @@ -1473,7 +1703,7 @@ int nfs_commit_inode(struct inode *inode, int how) return error; if (!may_wait) goto out_mark_dirty; - error = wait_on_bit(&NFS_I(inode)->flags, + error = wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_COMMIT, nfs_wait_bit_killable, TASK_KILLABLE); @@ -1567,27 +1797,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - for (;;) { - wait_on_page_writeback(page); - req = nfs_page_find_request(page); - if (req == NULL) - break; - if (nfs_lock_request(req)) { - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_and_release_request(req); - break; - } - ret = nfs_wait_on_request(req); - nfs_release_request(req); - if (ret < 0) - break; + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private pointer and + * release it */ + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_and_release_request(req); } + return ret; } diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index a986ceb6fd0d..4cd7c69a6cb9 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -47,7 +47,7 @@ struct svc_rqst; #define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ / sizeof(struct nfs4_ace)) -struct nfs4_acl *nfs4_acl_new(int); +int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 72f44823adbb..9d46a0bdd9f9 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -28,7 +28,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) validate_process_creds(); /* discard any old override before preparing the new set */ - revert_creds(get_cred(current->real_cred)); + revert_creds(get_cred(current_real_cred())); new = prepare_creds(); if (!new) return -ENOMEM; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 13b85f94d9e2..72ffd7cce3c3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -698,8 +698,8 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; - new->ex_path.dentry = dget(item->ex_path.dentry); - new->ex_path.mnt = mntget(item->ex_path.mnt); + new->ex_path = item->ex_path; + path_get(&item->ex_path); new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -1253,7 +1253,7 @@ static int e_show(struct seq_file *m, void *p) return 0; } - cache_get(&exp->h); + exp_get(exp); if (cache_check(cd, &exp->h, NULL)) return 0; exp_put(exp); diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index cfeea85c5bed..04dc8c167b0c 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -101,9 +101,10 @@ static inline void exp_put(struct svc_export *exp) cache_put(&exp->h, exp->cd); } -static inline void exp_get(struct svc_export *exp) +static inline struct svc_export *exp_get(struct svc_export *exp) { cache_get(&exp->h); + return exp; } struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 2ed05c3cd43d..c16bf5af6831 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -17,81 +17,13 @@ struct nfsd_fault_inject_op { char *file; - u64 (*forget)(struct nfs4_client *, u64); - u64 (*print)(struct nfs4_client *, u64); + u64 (*get)(void); + u64 (*set_val)(u64); + u64 (*set_clnt)(struct sockaddr_storage *, size_t); }; -static struct nfsd_fault_inject_op inject_ops[] = { - { - .file = "forget_clients", - .forget = nfsd_forget_client, - .print = nfsd_print_client, - }, - { - .file = "forget_locks", - .forget = nfsd_forget_client_locks, - .print = nfsd_print_client_locks, - }, - { - .file = "forget_openowners", - .forget = nfsd_forget_client_openowners, - .print = nfsd_print_client_openowners, - }, - { - .file = "forget_delegations", - .forget = nfsd_forget_client_delegations, - .print = nfsd_print_client_delegations, - }, - { - .file = "recall_delegations", - .forget = nfsd_recall_client_delegations, - .print = nfsd_print_client_delegations, - }, -}; - -static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); static struct dentry *debug_dir; -static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val) -{ - u64 count = 0; - - if (val == 0) - printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); - else - printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); - - nfs4_lock_state(); - count = nfsd_for_n_state(val, op->forget); - nfs4_unlock_state(); - printk(KERN_INFO "NFSD: %s: found %llu", op->file, count); -} - -static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op, - struct sockaddr_storage *addr, - size_t addr_size) -{ - char buf[INET6_ADDRSTRLEN]; - struct nfs4_client *clp; - u64 count; - - nfs4_lock_state(); - clp = nfsd_find_client(addr, addr_size); - if (clp) { - count = op->forget(clp, 0); - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count); - } - nfs4_unlock_state(); -} - -static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val) -{ - nfs4_lock_state(); - *val = nfsd_for_n_state(0, op->print); - nfs4_unlock_state(); -} - static ssize_t fault_inject_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { @@ -99,9 +31,10 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, char read_buf[25]; size_t size; loff_t pos = *ppos; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; if (!pos) - nfsd_inject_get(file_inode(file)->i_private, &val); + val = op->get(); size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); return simple_read_from_buffer(buf, len, ppos, read_buf, size); @@ -114,18 +47,36 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf, size_t size = min(sizeof(write_buf) - 1, len); struct net *net = current->nsproxy->net_ns; struct sockaddr_storage sa; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; u64 val; + char *nl; if (copy_from_user(write_buf, buf, size)) return -EFAULT; write_buf[size] = '\0'; + /* Deal with any embedded newlines in the string */ + nl = strchr(write_buf, '\n'); + if (nl) { + size = nl - write_buf; + *nl = '\0'; + } + size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); - if (size > 0) - nfsd_inject_set_client(file_inode(file)->i_private, &sa, size); - else { + if (size > 0) { + val = op->set_clnt(&sa, size); + if (val) + pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", + op->file, write_buf, val); + } else { val = simple_strtoll(write_buf, NULL, 0); - nfsd_inject_set(file_inode(file)->i_private, val); + if (val == 0) + pr_info("NFSD Fault Injection: %s (all)", op->file); + else + pr_info("NFSD Fault Injection: %s (n = %llu)", + op->file, val); + val = op->set_val(val); + pr_info("NFSD: %s: found %llu", op->file, val); } return len; /* on success, claim we got the whole input */ } @@ -141,6 +92,41 @@ void nfsd_fault_inject_cleanup(void) debugfs_remove_recursive(debug_dir); } +static struct nfsd_fault_inject_op inject_ops[] = { + { + .file = "forget_clients", + .get = nfsd_inject_print_clients, + .set_val = nfsd_inject_forget_clients, + .set_clnt = nfsd_inject_forget_client, + }, + { + .file = "forget_locks", + .get = nfsd_inject_print_locks, + .set_val = nfsd_inject_forget_locks, + .set_clnt = nfsd_inject_forget_client_locks, + }, + { + .file = "forget_openowners", + .get = nfsd_inject_print_openowners, + .set_val = nfsd_inject_forget_openowners, + .set_clnt = nfsd_inject_forget_client_openowners, + }, + { + .file = "forget_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_forget_delegations, + .set_clnt = nfsd_inject_forget_client_delegations, + }, + { + .file = "recall_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_recall_delegations, + .set_clnt = nfsd_inject_recall_client_delegations, + }, +}; + +#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op)) + int nfsd_fault_inject_init(void) { unsigned int i; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d32b3aa6600d..ea6749a32760 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -29,14 +29,19 @@ #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define LOCKOWNER_INO_HASH_BITS 8 -#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) - #define SESSION_HASH_SIZE 512 struct cld_net; struct nfsd4_client_tracking_ops; +/* + * Represents a nfsd "container". With respect to nfsv4 state tracking, the + * fields of interest are the *_id_hashtbls and the *_name_tree. These track + * the nfs4_client objects by either short or long form clientid. + * + * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean + * up expired clients and delegations within the container. + */ struct nfsd_net { struct cld_net *cld_net; @@ -66,8 +71,6 @@ struct nfsd_net { struct rb_root conf_name_tree; struct list_head *unconf_id_hashtbl; struct rb_root unconf_name_tree; - struct list_head *ownerstr_hashtbl; - struct list_head *lockowner_ino_hashtbl; struct list_head *sessionid_hashtbl; /* * client_lru holds client queue ordered by nfs4_client.cl_time @@ -97,10 +100,16 @@ struct nfsd_net { bool nfsd_net_up; bool lockd_up; + /* Time of server startup */ + struct timeval nfssvc_boot; + /* - * Time of server startup + * Max number of connections this nfsd container will allow. Defaults + * to '0' which is means that it bases this on the number of threads. */ - struct timeval nfssvc_boot; + unsigned int max_connections; + + u32 clientid_counter; struct svc_serv *nfsd_serv; }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 12b023a7ab7d..ac54ea60b3f6 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -54,14 +54,14 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); - goto fail; - } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } resp->acl_access = acl; } if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2a514e21dc74..34cbbab6abd7 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -47,14 +47,14 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); - goto fail; - } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } + if (IS_ERR(acl)) { + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; + } resp->acl_access = acl; } if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 401289913130..fa2525b2e9d7 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -157,11 +157,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - - resp->count = argp->count; - if (max_blocksize < resp->count) - resp->count = max_blocksize; - + resp->count = min(argp->count, max_blocksize); svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); @@ -286,8 +282,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp, fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, - argp->tname, argp->tlen, - &resp->fh, &argp->attrs); + argp->tname, &resp->fh); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index e6c01e80325e..39c5eb3ad33a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -120,10 +120,7 @@ decode_sattr3(__be32 *p, struct iattr *iap) iap->ia_valid |= ATTR_SIZE; p = xdr_decode_hyper(p, &newsize); - if (newsize <= NFS_OFFSET_MAX) - iap->ia_size = newsize; - else - iap->ia_size = NFS_OFFSET_MAX; + iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); } if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ iap->ia_valid |= ATTR_ATIME; @@ -338,10 +335,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, return 0; p = xdr_decode_hyper(p, &args->offset); - len = args->count = ntohl(*p++); - - if (len > max_blocksize) - len = max_blocksize; + args->count = ntohl(*p++); + len = min(args->count, max_blocksize); /* set up the kvec */ v=0; @@ -349,7 +344,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, struct page *p = *(rqstp->rq_next_page++); rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); len -= rqstp->rq_vec[v].iov_len; v++; } @@ -484,9 +479,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, } /* now copy next page if there is one */ if (len && !avail && rqstp->rq_arg.page_len) { - avail = rqstp->rq_arg.page_len; - if (avail > PAGE_SIZE) - avail = PAGE_SIZE; + avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE); old = page_address(rqstp->rq_arg.pages[0]); } while (len && avail && *old) { @@ -571,10 +564,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - - if (args->count > PAGE_SIZE) - args->count = PAGE_SIZE; - + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); @@ -595,10 +585,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); - len = (args->count > max_blocksize) ? max_blocksize : - args->count; - args->count = len; - + len = args->count = min(args->count, max_blocksize); while (len > 0) { struct page *p = *(rqstp->rq_next_page++); if (!args->buffer) @@ -913,8 +900,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, */ /* truncate filename if too long */ - if (namlen > NFS3_MAXNAMLEN) - namlen = NFS3_MAXNAMLEN; + namlen = min(namlen, NFS3_MAXNAMLEN); slen = XDR_QUADLEN(namlen); elen = slen + NFS3_ENTRY_BAGGAGE diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index d714156a19fd..59fd76651781 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -146,35 +146,43 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, int size = 0; pacl = get_acl(inode, ACL_TYPE_ACCESS); - if (!pacl) { + if (!pacl) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); - if (IS_ERR(pacl)) - return PTR_ERR(pacl); - } + + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + /* allocate for worst case: one (deny, allow) pair each: */ size += 2 * pacl->a_count; if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(dpacl)) { + error = PTR_ERR(dpacl); + goto rel_pacl; + } + if (dpacl) size += 2 * dpacl->a_count; } - *acl = nfs4_acl_new(size); + *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL); if (*acl == NULL) { error = -ENOMEM; goto out; } + (*acl)->naces = 0; _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); - out: - posix_acl_release(pacl); +out: posix_acl_release(dpacl); +rel_pacl: + posix_acl_release(pacl); return error; } @@ -872,16 +880,13 @@ ace2type(struct nfs4_ace *ace) return -1; } -struct nfs4_acl * -nfs4_acl_new(int n) +/* + * return the size of the struct nfs4_acl required to represent an acl + * with @entries entries. + */ +int nfs4_acl_bytes(int entries) { - struct nfs4_acl *acl; - - acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL); - if (acl == NULL) - return NULL; - acl->naces = 0; - return acl; + return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace); } static struct { @@ -935,5 +940,5 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who) return 0; } WARN_ON_ONCE(1); - return -1; + return nfserr_serverfault; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 2c73cae9899d..e0be57b0f79b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -337,7 +337,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ - encode_nfs_fh4(xdr, &dp->dl_fh); + encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle); hdr->nops++; } @@ -678,7 +678,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; args.client_name = clp->cl_cred.cr_principal; - args.prognumber = conn->cb_prog, + args.prognumber = conn->cb_prog; args.protocol = XPRT_TRANSPORT_TCP; args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; @@ -689,7 +689,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; - args.protocol = XPRT_TRANSPORT_BC_TCP; + args.protocol = conn->cb_xprt->xpt_class->xcl_ident | + XPRT_TRANSPORT_BC; args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ @@ -904,7 +905,7 @@ static void nfsd4_cb_recall_release(void *calldata) spin_lock(&clp->cl_lock); list_del(&cb->cb_per_client); spin_unlock(&clp->cl_lock); - nfs4_put_delegation(dp); + nfs4_put_stid(&dp->dl_stid); } } @@ -933,7 +934,7 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; - * instead, nfsd4_do_callback_rpc() will detect the killed + * instead, nfsd4_run_cb_null() will detect the killed * client, destroy the rpc client, and stop: */ do_probe_callback(clp); @@ -1011,9 +1012,9 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) run_nfsd4_cb(cb); } -static void nfsd4_do_callback_rpc(struct work_struct *w) +static void +nfsd4_run_callback_rpc(struct nfsd4_callback *cb) { - struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; @@ -1031,9 +1032,22 @@ static void nfsd4_do_callback_rpc(struct work_struct *w) cb->cb_ops, cb); } -void nfsd4_init_callback(struct nfsd4_callback *cb) +void +nfsd4_run_cb_null(struct work_struct *w) { - INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc); + struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, + cb_work); + nfsd4_run_callback_rpc(cb); +} + +void +nfsd4_run_cb_recall(struct work_struct *w) +{ + struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, + cb_work); + + nfsd4_prepare_cb_recall(cb->cb_op); + nfsd4_run_callback_rpc(cb); } void nfsd4_cb_recall(struct nfs4_delegation *dp) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6851b003f2a4..5e0dc528a0e8 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -177,7 +177,7 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src) fh_put(dst); dget(src->fh_dentry); if (src->fh_export) - cache_get(&src->fh_export->h); + exp_get(src->fh_export); *dst = *src; } @@ -385,8 +385,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nfsd4_has_session(cstate)) copy_clientid(&open->op_clientid, cstate->session); - nfs4_lock_state(); - /* check seqid for replay. set nfs4_owner */ resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open, nn); @@ -431,8 +429,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: status = nfs4_check_open_reclaim(&open->op_clientid, - cstate->minorversion, - nn); + cstate, nn); if (status) goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; @@ -461,19 +458,17 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ status = nfsd4_process_open2(rqstp, resfh, open); - WARN_ON(status && open->op_created); + WARN(status && open->op_created, + "nfsd4_process_open2 failed to open newly-created file! status=%u\n", + be32_to_cpu(status)); out: if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); kfree(resfh); } - nfsd4_cleanup_open_state(open, status); - if (open->op_openowner && !nfsd4_has_session(cstate)) - cstate->replay_owner = &open->op_openowner->oo_owner; + nfsd4_cleanup_open_state(cstate, open, status); nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } @@ -581,8 +576,12 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) __be32 verf[2]; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - verf[0] = (__be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__be32)nn->nfssvc_boot.tv_usec; + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec; memcpy(verifier->data, verf, sizeof(verifier->data)); } @@ -617,19 +616,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. - */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - create->cr_linkname, create->cr_linklen, - &resfh, &create->cr_iattr); + create->cr_data, &resfh); break; case NF4BLK: @@ -918,8 +907,8 @@ nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstat default: return nfserr_inval; } - exp_get(cstate->current_fh.fh_export); - sin->sin_exp = cstate->current_fh.fh_export; + + sin->sin_exp = exp_get(cstate->current_fh.fh_export); fh_put(&cstate->current_fh); return nfs_ok; } @@ -1298,7 +1287,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - rqstp->rq_usedeferral = 0; + rqstp->rq_usedeferral = false; /* * According to RFC3010, this takes precedence over all other errors. @@ -1400,10 +1389,7 @@ encode_op: args->ops, args->opcnt, resp->opcnt, op->opnum, be32_to_cpu(status)); - if (cstate->replay_owner) { - nfs4_unlock_state(); - cstate->replay_owner = NULL; - } + nfsd4_cstate_clear_replay(cstate); /* XXX Ugh, we need to get rid of this kind of special case: */ if (op->opnum == OP_READ && op->u.read.rd_filp) fput(op->u.read.rd_filp); @@ -1417,7 +1403,7 @@ encode_op: BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ - rqstp->rq_usedeferral = 1; + rqstp->rq_usedeferral = true; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } @@ -1529,21 +1515,17 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) u32 maxcount = 0, rlen = 0; maxcount = svc_max_payload(rqstp); - rlen = op->u.read.rd_length; - - if (rlen > maxcount) - rlen = maxcount; + rlen = min(op->u.read.rd_length, maxcount); return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - u32 maxcount = svc_max_payload(rqstp); - u32 rlen = op->u.readdir.rd_maxcount; + u32 maxcount = 0, rlen = 0; - if (rlen > maxcount) - rlen = maxcount; + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.readdir.rd_maxcount, maxcount); return (op_encode_hdr_size + op_encode_verifier_maxsz + XDR_QUADLEN(rlen)) * sizeof(__be32); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2204e1fe5725..2e80a59e7e91 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -70,13 +70,11 @@ static u64 current_sessionid = 1; #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) /* forward declarations */ -static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); +static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); +static void nfs4_free_ol_stateid(struct nfs4_stid *stid); /* Locking: */ -/* Currently used for almost all code touching nfsv4 state: */ -static DEFINE_MUTEX(client_mutex); - /* * Currently used for the del_recall_lru and file hash table. In an * effort to decrease the scope of the client_mutex, this spinlock may @@ -84,18 +82,18 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(state_lock); +/* + * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for + * the refcount on the open stateid to drop. + */ +static DECLARE_WAIT_QUEUE_HEAD(close_wq); + static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; -void -nfs4_lock_state(void) -{ - mutex_lock(&client_mutex); -} - static void free_session(struct nfsd4_session *); static bool is_session_dead(struct nfsd4_session *ses) @@ -103,12 +101,6 @@ static bool is_session_dead(struct nfsd4_session *ses) return ses->se_flags & NFS4_SESSION_DEAD; } -void nfsd4_put_session(struct nfsd4_session *ses) -{ - if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) - free_session(ses); -} - static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) { if (atomic_read(&ses->se_ref) > ref_held_by_me) @@ -117,46 +109,17 @@ static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_b return nfs_ok; } -static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) -{ - if (is_session_dead(ses)) - return nfserr_badsession; - atomic_inc(&ses->se_ref); - return nfs_ok; -} - -void -nfs4_unlock_state(void) -{ - mutex_unlock(&client_mutex); -} - static bool is_client_expired(struct nfs4_client *clp) { return clp->cl_time == 0; } -static __be32 mark_client_expired_locked(struct nfs4_client *clp) -{ - if (atomic_read(&clp->cl_refcount)) - return nfserr_jukebox; - clp->cl_time = 0; - return nfs_ok; -} - -static __be32 mark_client_expired(struct nfs4_client *clp) +static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - __be32 ret; - spin_lock(&nn->client_lock); - ret = mark_client_expired_locked(clp); - spin_unlock(&nn->client_lock); - return ret; -} + lockdep_assert_held(&nn->client_lock); -static __be32 get_client_locked(struct nfs4_client *clp) -{ if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_refcount); @@ -197,13 +160,17 @@ renew_client(struct nfs4_client *clp) static void put_client_renew_locked(struct nfs4_client *clp) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + if (!atomic_dec_and_test(&clp->cl_refcount)) return; if (!is_client_expired(clp)) renew_client_locked(clp); } -void put_client_renew(struct nfs4_client *clp) +static void put_client_renew(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -214,6 +181,79 @@ void put_client_renew(struct nfs4_client *clp) spin_unlock(&nn->client_lock); } +static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) +{ + __be32 status; + + if (is_session_dead(ses)) + return nfserr_badsession; + status = get_client_locked(ses->se_client); + if (status) + return status; + atomic_inc(&ses->se_ref); + return nfs_ok; +} + +static void nfsd4_put_session_locked(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) + free_session(ses); + put_client_renew_locked(clp); +} + +static void nfsd4_put_session(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + nfsd4_put_session_locked(ses); + spin_unlock(&nn->client_lock); +} + +static int +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len); +} + +static struct nfs4_openowner * +find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_stateowner *so; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { + if (!so->so_is_open_owner) + continue; + if (same_owner_str(so, &open->op_owner)) { + atomic_inc(&so->so_count); + return openowner(so); + } + } + return NULL; +} + +static struct nfs4_openowner * +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + spin_lock(&clp->cl_lock); + oo = find_openstateowner_str_locked(hashval, open, clp); + spin_unlock(&clp->cl_lock); + return oo; +} static inline u32 opaque_hashval(const void *ptr, int nbytes) @@ -236,10 +276,11 @@ static void nfsd4_free_file(struct nfs4_file *f) static inline void put_nfs4_file(struct nfs4_file *fi) { + might_lock(&state_lock); + if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del(&fi->fi_hash); spin_unlock(&state_lock); - iput(fi->fi_inode); nfsd4_free_file(fi); } } @@ -250,7 +291,80 @@ get_nfs4_file(struct nfs4_file *fi) atomic_inc(&fi->fi_ref); } -static int num_delegations; +static struct file * +__nfs4_get_fd(struct nfs4_file *f, int oflag) +{ + if (f->fi_fds[oflag]) + return get_file(f->fi_fds[oflag]); + return NULL; +} + +static struct file * +find_writeable_file_locked(struct nfs4_file *f) +{ + struct file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct file * +find_writeable_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = find_writeable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +static struct file *find_readable_file_locked(struct nfs4_file *f) +{ + struct file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_RDONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct file * +find_readable_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = find_readable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +static struct file * +find_any_file(struct nfs4_file *f) +{ + struct file *ret; + + spin_lock(&f->fi_lock); + ret = __nfs4_get_fd(f, O_RDWR); + if (!ret) { + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDONLY); + } + spin_unlock(&f->fi_lock); + return ret; +} + +static atomic_long_t num_delegations; unsigned long max_delegations; /* @@ -262,12 +376,11 @@ unsigned long max_delegations; #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); - ret += clientid; return ret & OWNER_HASH_MASK; } @@ -275,75 +388,124 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -static unsigned int file_hashval(struct inode *ino) +static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh) +{ + return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0); +} + +static unsigned int file_hashval(struct knfsd_fh *fh) +{ + return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); +} + +static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) { - /* XXX: why are we hashing on inode pointer, anyway? */ - return hash_ptr(ino, FILE_HASH_BITS); + return fh1->fh_size == fh2->fh_size && + !memcmp(fh1->fh_base.fh_pad, + fh2->fh_base.fh_pad, + fh1->fh_size); } static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; -static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) +static void +__nfs4_file_get_access(struct nfs4_file *fp, u32 access) { - WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); - atomic_inc(&fp->fi_access[oflag]); + lockdep_assert_held(&fp->fi_lock); + + if (access & NFS4_SHARE_ACCESS_WRITE) + atomic_inc(&fp->fi_access[O_WRONLY]); + if (access & NFS4_SHARE_ACCESS_READ) + atomic_inc(&fp->fi_access[O_RDONLY]); } -static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) +static __be32 +nfs4_file_get_access(struct nfs4_file *fp, u32 access) { - if (oflag == O_RDWR) { - __nfs4_file_get_access(fp, O_RDONLY); - __nfs4_file_get_access(fp, O_WRONLY); - } else - __nfs4_file_get_access(fp, oflag); + lockdep_assert_held(&fp->fi_lock); + + /* Does this access mode make sense? */ + if (access & ~NFS4_SHARE_ACCESS_BOTH) + return nfserr_inval; + + /* Does it conflict with a deny mode already set? */ + if ((access & fp->fi_share_deny) != 0) + return nfserr_share_denied; + + __nfs4_file_get_access(fp, access); + return nfs_ok; } -static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) +static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny) { - if (fp->fi_fds[oflag]) { - fput(fp->fi_fds[oflag]); - fp->fi_fds[oflag] = NULL; + /* Common case is that there is no deny mode. */ + if (deny) { + /* Does this deny mode make sense? */ + if (deny & ~NFS4_SHARE_DENY_BOTH) + return nfserr_inval; + + if ((deny & NFS4_SHARE_DENY_READ) && + atomic_read(&fp->fi_access[O_RDONLY])) + return nfserr_share_denied; + + if ((deny & NFS4_SHARE_DENY_WRITE) && + atomic_read(&fp->fi_access[O_WRONLY])) + return nfserr_share_denied; } + return nfs_ok; } static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { - if (atomic_dec_and_test(&fp->fi_access[oflag])) { - nfs4_file_put_fd(fp, oflag); + might_lock(&fp->fi_lock); + + if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { + struct file *f1 = NULL; + struct file *f2 = NULL; + + swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) - nfs4_file_put_fd(fp, O_RDWR); + swap(f2, fp->fi_fds[O_RDWR]); + spin_unlock(&fp->fi_lock); + if (f1) + fput(f1); + if (f2) + fput(f2); } } -static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) +static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) { - if (oflag == O_RDWR) { - __nfs4_file_put_access(fp, O_RDONLY); + WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH); + + if (access & NFS4_SHARE_ACCESS_WRITE) __nfs4_file_put_access(fp, O_WRONLY); - } else - __nfs4_file_put_access(fp, oflag); + if (access & NFS4_SHARE_ACCESS_READ) + __nfs4_file_put_access(fp, O_RDONLY); } -static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct -kmem_cache *slab) +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, + struct kmem_cache *slab) { - struct idr *stateids = &cl->cl_stateids; struct nfs4_stid *stid; int new_id; - stid = kmem_cache_alloc(slab, GFP_KERNEL); + stid = kmem_cache_zalloc(slab, GFP_KERNEL); if (!stid) return NULL; - new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&cl->cl_lock); + new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT); + spin_unlock(&cl->cl_lock); + idr_preload_end(); if (new_id < 0) goto out_free; stid->sc_client = cl; - stid->sc_type = 0; stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ - stid->sc_stateid.si_generation = 0; + atomic_set(&stid->sc_count, 1); /* * It shouldn't be a problem to reuse an opaque stateid value. @@ -360,9 +522,24 @@ out_free: return NULL; } -static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) +static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { - return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); + struct nfs4_stid *stid; + struct nfs4_ol_stateid *stp; + + stid = nfs4_alloc_stid(clp, stateid_slab); + if (!stid) + return NULL; + + stp = openlockstateid(stid); + stp->st_stid.sc_free = nfs4_free_ol_stateid; + return stp; +} + +static void nfs4_free_deleg(struct nfs4_stid *stid) +{ + kmem_cache_free(deleg_slab, stid); + atomic_long_dec(&num_delegations); } /* @@ -379,10 +556,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) * Each filter is 256 bits. We hash the filehandle to 32bit and use the * low 3 bytes as hash-table indices. * - * 'state_lock', which is always held when block_delegations() is called, + * 'blocked_delegations_lock', which is always taken in block_delegations(), * is used to manage concurrent access. Testing does not need the lock * except when swapping the two filters. */ +static DEFINE_SPINLOCK(blocked_delegations_lock); static struct bloom_pair { int entries, old_entries; time_t swap_time; @@ -398,7 +576,7 @@ static int delegation_blocked(struct knfsd_fh *fh) if (bd->entries == 0) return 0; if (seconds_since_boot() - bd->swap_time > 30) { - spin_lock(&state_lock); + spin_lock(&blocked_delegations_lock); if (seconds_since_boot() - bd->swap_time > 30) { bd->entries -= bd->old_entries; bd->old_entries = bd->entries; @@ -407,7 +585,7 @@ static int delegation_blocked(struct knfsd_fh *fh) bd->new = 1-bd->new; bd->swap_time = seconds_since_boot(); } - spin_unlock(&state_lock); + spin_unlock(&blocked_delegations_lock); } hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); if (test_bit(hash&255, bd->set[0]) && @@ -430,69 +608,73 @@ static void block_delegations(struct knfsd_fh *fh) hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + spin_lock(&blocked_delegations_lock); __set_bit(hash&255, bd->set[bd->new]); __set_bit((hash>>8)&255, bd->set[bd->new]); __set_bit((hash>>16)&255, bd->set[bd->new]); if (bd->entries == 0) bd->swap_time = seconds_since_boot(); bd->entries += 1; + spin_unlock(&blocked_delegations_lock); } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) +alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) { struct nfs4_delegation *dp; + long n; dprintk("NFSD alloc_init_deleg\n"); - if (num_delegations > max_delegations) - return NULL; + n = atomic_long_inc_return(&num_delegations); + if (n < 0 || n > max_delegations) + goto out_dec; if (delegation_blocked(¤t_fh->fh_handle)) - return NULL; + goto out_dec; dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) - return dp; + goto out_dec; + + dp->dl_stid.sc_free = nfs4_free_deleg; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid * 0 anyway just for consistency and use 1: */ dp->dl_stid.sc_stateid.si_generation = 1; - num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); - dp->dl_file = NULL; dp->dl_type = NFS4_OPEN_DELEGATE_READ; - fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); - dp->dl_time = 0; - atomic_set(&dp->dl_count, 1); - nfsd4_init_callback(&dp->dl_recall); + INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall); return dp; +out_dec: + atomic_long_dec(&num_delegations); + return NULL; } -static void remove_stid(struct nfs4_stid *s) +void +nfs4_put_stid(struct nfs4_stid *s) { - struct idr *stateids = &s->sc_client->cl_stateids; + struct nfs4_file *fp = s->sc_file; + struct nfs4_client *clp = s->sc_client; - idr_remove(stateids, s->sc_stateid.si_opaque.so_id); -} + might_lock(&clp->cl_lock); -static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s) -{ - kmem_cache_free(slab, s); -} - -void -nfs4_put_delegation(struct nfs4_delegation *dp) -{ - if (atomic_dec_and_test(&dp->dl_count)) { - nfs4_free_stid(deleg_slab, &dp->dl_stid); - num_delegations--; + if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) { + wake_up_all(&close_wq); + return; } + idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + spin_unlock(&clp->cl_lock); + s->sc_free(s); + if (fp) + put_nfs4_file(fp); } static void nfs4_put_deleg_lease(struct nfs4_file *fp) { + lockdep_assert_held(&state_lock); + if (!fp->fi_lease) return; if (atomic_dec_and_test(&fp->fi_delegees)) { @@ -512,54 +694,54 @@ static void hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + atomic_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); } -/* Called under the state lock. */ static void -unhash_delegation(struct nfs4_delegation *dp) +unhash_delegation_locked(struct nfs4_delegation *dp) { - spin_lock(&state_lock); - list_del_init(&dp->dl_perclnt); - list_del_init(&dp->dl_perfile); - list_del_init(&dp->dl_recall_lru); - spin_unlock(&state_lock); - if (dp->dl_file) { - nfs4_put_deleg_lease(dp->dl_file); - put_nfs4_file(dp->dl_file); - dp->dl_file = NULL; - } -} - + struct nfs4_file *fp = dp->dl_stid.sc_file; + lockdep_assert_held(&state_lock); -static void destroy_revoked_delegation(struct nfs4_delegation *dp) -{ + dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + /* Ensure that deleg break won't try to requeue it */ + ++dp->dl_time; + spin_lock(&fp->fi_lock); + list_del_init(&dp->dl_perclnt); list_del_init(&dp->dl_recall_lru); - remove_stid(&dp->dl_stid); - nfs4_put_delegation(dp); + list_del_init(&dp->dl_perfile); + spin_unlock(&fp->fi_lock); + if (fp) + nfs4_put_deleg_lease(fp); } static void destroy_delegation(struct nfs4_delegation *dp) { - unhash_delegation(dp); - remove_stid(&dp->dl_stid); - nfs4_put_delegation(dp); + spin_lock(&state_lock); + unhash_delegation_locked(dp); + spin_unlock(&state_lock); + nfs4_put_stid(&dp->dl_stid); } static void revoke_delegation(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_stid.sc_client; + WARN_ON(!list_empty(&dp->dl_recall_lru)); + if (clp->cl_minorversion == 0) - destroy_delegation(dp); + nfs4_put_stid(&dp->dl_stid); else { - unhash_delegation(dp); dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + spin_lock(&clp->cl_lock); list_add(&dp->dl_recall_lru, &clp->cl_revoked); + spin_unlock(&clp->cl_lock); } } @@ -607,57 +789,62 @@ bmap_to_share_mode(unsigned long bmap) { return access; } -static bool -test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - unsigned int access, deny; - - access = bmap_to_share_mode(stp->st_access_bmap); - deny = bmap_to_share_mode(stp->st_deny_bmap); - if ((access & open->op_share_deny) || (deny & open->op_share_access)) - return false; - return true; -} - /* set share access for a given stateid */ static inline void set_access(u32 access, struct nfs4_ol_stateid *stp) { - __set_bit(access, &stp->st_access_bmap); + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap |= mask; } /* clear share access for a given stateid */ static inline void clear_access(u32 access, struct nfs4_ol_stateid *stp) { - __clear_bit(access, &stp->st_access_bmap); + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap &= ~mask; } /* test whether a given stateid has access */ static inline bool test_access(u32 access, struct nfs4_ol_stateid *stp) { - return test_bit(access, &stp->st_access_bmap); + unsigned char mask = 1 << access; + + return (bool)(stp->st_access_bmap & mask); } /* set share deny for a given stateid */ static inline void -set_deny(u32 access, struct nfs4_ol_stateid *stp) +set_deny(u32 deny, struct nfs4_ol_stateid *stp) { - __set_bit(access, &stp->st_deny_bmap); + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap |= mask; } /* clear share deny for a given stateid */ static inline void -clear_deny(u32 access, struct nfs4_ol_stateid *stp) +clear_deny(u32 deny, struct nfs4_ol_stateid *stp) { - __clear_bit(access, &stp->st_deny_bmap); + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap &= ~mask; } /* test whether a given stateid is denying specific access */ static inline bool -test_deny(u32 access, struct nfs4_ol_stateid *stp) +test_deny(u32 deny, struct nfs4_ol_stateid *stp) { - return test_bit(access, &stp->st_deny_bmap); + unsigned char mask = 1 << deny; + + return (bool)(stp->st_deny_bmap & mask); } static int nfs4_access_to_omode(u32 access) @@ -674,138 +861,283 @@ static int nfs4_access_to_omode(u32 access) return O_RDONLY; } +/* + * A stateid that had a deny mode associated with it is being released + * or downgraded. Recalculate the deny mode on the file. + */ +static void +recalculate_deny_mode(struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *stp; + + spin_lock(&fp->fi_lock); + fp->fi_share_deny = 0; + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) + fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); + spin_unlock(&fp->fi_lock); +} + +static void +reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + int i; + bool change = false; + + for (i = 1; i < 4; i++) { + if ((i & deny) != i) { + change = true; + clear_deny(i, stp); + } + } + + /* Recalculate per-file deny mode if there was a change */ + if (change) + recalculate_deny_mode(stp->st_stid.sc_file); +} + /* release all access and file references for a given stateid */ static void release_all_access(struct nfs4_ol_stateid *stp) { int i; + struct nfs4_file *fp = stp->st_stid.sc_file; + + if (fp && stp->st_deny_bmap != 0) + recalculate_deny_mode(fp); for (i = 1; i < 4; i++) { if (test_access(i, stp)) - nfs4_file_put_access(stp->st_file, - nfs4_access_to_omode(i)); + nfs4_file_put_access(stp->st_stid.sc_file, i); clear_access(i, stp); } } -static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) +static void nfs4_put_stateowner(struct nfs4_stateowner *sop) { + struct nfs4_client *clp = sop->so_client; + + might_lock(&clp->cl_lock); + + if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock)) + return; + sop->so_ops->so_unhash(sop); + spin_unlock(&clp->cl_lock); + kfree(sop->so_owner.data); + sop->so_ops->so_free(sop); +} + +static void unhash_ol_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_file *fp = stp->st_stid.sc_file; + + lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); + + spin_lock(&fp->fi_lock); list_del(&stp->st_perfile); + spin_unlock(&fp->fi_lock); list_del(&stp->st_perstateowner); } -static void close_generic_stateid(struct nfs4_ol_stateid *stp) +static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { + struct nfs4_ol_stateid *stp = openlockstateid(stid); + release_all_access(stp); - put_nfs4_file(stp->st_file); - stp->st_file = NULL; + if (stp->st_stateowner) + nfs4_put_stateowner(stp->st_stateowner); + kmem_cache_free(stateid_slab, stid); } -static void free_generic_stateid(struct nfs4_ol_stateid *stp) +static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { - remove_stid(&stp->st_stid); - nfs4_free_stid(stateid_slab, &stp->st_stid); + struct nfs4_ol_stateid *stp = openlockstateid(stid); + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + struct file *file; + + file = find_any_file(stp->st_stid.sc_file); + if (file) + filp_close(file, (fl_owner_t)lo); + nfs4_free_ol_stateid(stid); } -static void release_lock_stateid(struct nfs4_ol_stateid *stp) +/* + * Put the persistent reference to an already unhashed generic stateid, while + * holding the cl_lock. If it's the last reference, then put it onto the + * reaplist for later destruction. + */ +static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, + struct list_head *reaplist) { - struct file *file; + struct nfs4_stid *s = &stp->st_stid; + struct nfs4_client *clp = s->sc_client; + + lockdep_assert_held(&clp->cl_lock); - unhash_generic_stateid(stp); + WARN_ON_ONCE(!list_empty(&stp->st_locks)); + + if (!atomic_dec_and_test(&s->sc_count)) { + wake_up_all(&close_wq); + return; + } + + idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + list_add(&stp->st_locks, reaplist); +} + +static void unhash_lock_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + + lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); + + list_del_init(&stp->st_locks); + unhash_ol_stateid(stp); unhash_stid(&stp->st_stid); - file = find_any_file(stp->st_file); - if (file) - locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); - close_generic_stateid(stp); - free_generic_stateid(stp); } -static void unhash_lockowner(struct nfs4_lockowner *lo) +static void release_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_ol_stateid *stp; + struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); - list_del(&lo->lo_owner.so_strhash); - list_del(&lo->lo_perstateid); - list_del(&lo->lo_owner_ino_hash); - while (!list_empty(&lo->lo_owner.so_stateids)) { - stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - release_lock_stateid(stp); - } + spin_lock(&oo->oo_owner.so_client->cl_lock); + unhash_lock_stateid(stp); + spin_unlock(&oo->oo_owner.so_client->cl_lock); + nfs4_put_stid(&stp->st_stid); } -static void nfs4_free_lockowner(struct nfs4_lockowner *lo) +static void unhash_lockowner_locked(struct nfs4_lockowner *lo) { - kfree(lo->lo_owner.so_owner.data); - kmem_cache_free(lockowner_slab, lo); + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_del_init(&lo->lo_owner.so_strhash); +} + +/* + * Free a list of generic stateids that were collected earlier after being + * fully unhashed. + */ +static void +free_ol_stateid_reaplist(struct list_head *reaplist) +{ + struct nfs4_ol_stateid *stp; + struct nfs4_file *fp; + + might_sleep(); + + while (!list_empty(reaplist)) { + stp = list_first_entry(reaplist, struct nfs4_ol_stateid, + st_locks); + list_del(&stp->st_locks); + fp = stp->st_stid.sc_file; + stp->st_stid.sc_free(&stp->st_stid); + if (fp) + put_nfs4_file(fp); + } } static void release_lockowner(struct nfs4_lockowner *lo) { - unhash_lockowner(lo); - nfs4_free_lockowner(lo); + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *stp; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + + spin_lock(&clp->cl_lock); + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + unhash_lock_stateid(stp); + put_ol_stateid_locked(stp, &reaplist); + } + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + nfs4_put_stateowner(&lo->lo_owner); } -static void -release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) +static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, + struct list_head *reaplist) { - struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *stp; - while (!list_empty(&open_stp->st_lockowners)) { - lo = list_entry(open_stp->st_lockowners.next, - struct nfs4_lockowner, lo_perstateid); - release_lockowner(lo); + while (!list_empty(&open_stp->st_locks)) { + stp = list_entry(open_stp->st_locks.next, + struct nfs4_ol_stateid, st_locks); + unhash_lock_stateid(stp); + put_ol_stateid_locked(stp, reaplist); } } -static void unhash_open_stateid(struct nfs4_ol_stateid *stp) +static void unhash_open_stateid(struct nfs4_ol_stateid *stp, + struct list_head *reaplist) { - unhash_generic_stateid(stp); - release_stateid_lockowners(stp); - close_generic_stateid(stp); + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + + unhash_ol_stateid(stp); + release_open_stateid_locks(stp, reaplist); } static void release_open_stateid(struct nfs4_ol_stateid *stp) { - unhash_open_stateid(stp); - free_generic_stateid(stp); + LIST_HEAD(reaplist); + + spin_lock(&stp->st_stid.sc_client->cl_lock); + unhash_open_stateid(stp, &reaplist); + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&stp->st_stid.sc_client->cl_lock); + free_ol_stateid_reaplist(&reaplist); } -static void unhash_openowner(struct nfs4_openowner *oo) +static void unhash_openowner_locked(struct nfs4_openowner *oo) { - struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = oo->oo_owner.so_client; - list_del(&oo->oo_owner.so_strhash); - list_del(&oo->oo_perclient); - while (!list_empty(&oo->oo_owner.so_stateids)) { - stp = list_first_entry(&oo->oo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - release_open_stateid(stp); - } + lockdep_assert_held(&clp->cl_lock); + + list_del_init(&oo->oo_owner.so_strhash); + list_del_init(&oo->oo_perclient); } static void release_last_closed_stateid(struct nfs4_openowner *oo) { - struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; + struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net, + nfsd_net_id); + struct nfs4_ol_stateid *s; + spin_lock(&nn->client_lock); + s = oo->oo_last_closed_stid; if (s) { - free_generic_stateid(s); + list_del_init(&oo->oo_close_lru); oo->oo_last_closed_stid = NULL; } -} - -static void nfs4_free_openowner(struct nfs4_openowner *oo) -{ - kfree(oo->oo_owner.so_owner.data); - kmem_cache_free(openowner_slab, oo); + spin_unlock(&nn->client_lock); + if (s) + nfs4_put_stid(&s->st_stid); } static void release_openowner(struct nfs4_openowner *oo) { - unhash_openowner(oo); - list_del(&oo->oo_close_lru); + struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = oo->oo_owner.so_client; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + + spin_lock(&clp->cl_lock); + unhash_openowner_locked(oo); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + unhash_open_stateid(stp, &reaplist); + put_ol_stateid_locked(stp, &reaplist); + } + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); release_last_closed_stateid(oo); - nfs4_free_openowner(oo); + nfs4_put_stateowner(&oo->oo_owner); } static inline int @@ -842,7 +1174,7 @@ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) return; if (!seqid_mutating_err(ntohl(nfserr))) { - cstate->replay_owner = NULL; + nfsd4_cstate_clear_replay(cstate); return; } if (!so) @@ -1030,10 +1362,8 @@ static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, str if (ret) /* oops; xprt is already down: */ nfsd4_conn_lost(&conn->cn_xpt_user); - if (conn->cn_flags & NFS4_CDFC4_BACK) { - /* callback channel may be back up */ - nfsd4_probe_callback(ses->se_client); - } + /* We may have gained or lost a callback channel: */ + nfsd4_probe_callback_sync(ses->se_client); } static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) @@ -1073,9 +1403,6 @@ static void __free_session(struct nfsd4_session *ses) static void free_session(struct nfsd4_session *ses) { - struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); - - lockdep_assert_held(&nn->client_lock); nfsd4_del_conns(ses); nfsd4_put_drc_mem(&ses->se_fchannel); __free_session(ses); @@ -1097,12 +1424,10 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru new->se_cb_sec = cses->cb_sec; atomic_set(&new->se_ref, 0); idx = hash_sessionid(&new->se_sessionid); - spin_lock(&nn->client_lock); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); - spin_unlock(&nn->client_lock); if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); @@ -1120,12 +1445,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru /* caller must hold client_lock */ static struct nfsd4_session * -find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) +__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) { struct nfsd4_session *elem; int idx; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + lockdep_assert_held(&nn->client_lock); + dump_sessionid(__func__, sessionid); idx = hash_sessionid(sessionid); /* Search in the appropriate list */ @@ -1140,10 +1467,33 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) return NULL; } +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net, + __be32 *ret) +{ + struct nfsd4_session *session; + __be32 status = nfserr_badsession; + + session = __find_in_sessionid_hashtbl(sessionid, net); + if (!session) + goto out; + status = nfsd4_get_session_locked(session); + if (status) + session = NULL; +out: + *ret = status; + return session; +} + /* caller must hold client_lock */ static void unhash_session(struct nfsd4_session *ses) { + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + list_del(&ses->se_hash); spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); @@ -1169,15 +1519,20 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) static struct nfs4_client *alloc_client(struct xdr_netobj name) { struct nfs4_client *clp; + int i; clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); if (clp == NULL) return NULL; clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); - if (clp->cl_name.data == NULL) { - kfree(clp); - return NULL; - } + if (clp->cl_name.data == NULL) + goto err_no_name; + clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * + OWNER_HASH_SIZE, GFP_KERNEL); + if (!clp->cl_ownerstr_hashtbl) + goto err_no_hashtbl; + for (i = 0; i < OWNER_HASH_SIZE; i++) + INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); clp->cl_name.len = name.len; INIT_LIST_HEAD(&clp->cl_sessions); idr_init(&clp->cl_stateids); @@ -1192,14 +1547,16 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; +err_no_hashtbl: + kfree(clp->cl_name.data); +err_no_name: + kfree(clp); + return NULL; } static void free_client(struct nfs4_client *clp) { - struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); - - lockdep_assert_held(&nn->client_lock); while (!list_empty(&clp->cl_sessions)) { struct nfsd4_session *ses; ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, @@ -1210,18 +1567,32 @@ free_client(struct nfs4_client *clp) } rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); + kfree(clp->cl_ownerstr_hashtbl); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); kfree(clp); } /* must be called under the client_lock */ -static inline void +static void unhash_client_locked(struct nfs4_client *clp) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfsd4_session *ses; - list_del(&clp->cl_lru); + lockdep_assert_held(&nn->client_lock); + + /* Mark the client as expired! */ + clp->cl_time = 0; + /* Make it invisible */ + if (!list_empty(&clp->cl_idhash)) { + list_del_init(&clp->cl_idhash); + if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + rb_erase(&clp->cl_namenode, &nn->conf_name_tree); + else + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); + } + list_del_init(&clp->cl_lru); spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) list_del_init(&ses->se_hash); @@ -1229,53 +1600,71 @@ unhash_client_locked(struct nfs4_client *clp) } static void -destroy_client(struct nfs4_client *clp) +unhash_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + unhash_client_locked(clp); + spin_unlock(&nn->client_lock); +} + +static __be32 mark_client_expired_locked(struct nfs4_client *clp) +{ + if (atomic_read(&clp->cl_refcount)) + return nfserr_jukebox; + unhash_client_locked(clp); + return nfs_ok; +} + +static void +__destroy_client(struct nfs4_client *clp) { struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - list_del_init(&dp->dl_perclnt); - list_move(&dp->dl_recall_lru, &reaplist); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - destroy_delegation(dp); + list_del_init(&dp->dl_recall_lru); + nfs4_put_stid(&dp->dl_stid); } - list_splice_init(&clp->cl_revoked, &reaplist); - while (!list_empty(&reaplist)) { + while (!list_empty(&clp->cl_revoked)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - destroy_revoked_delegation(dp); + list_del_init(&dp->dl_recall_lru); + nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + atomic_inc(&oo->oo_owner.so_count); release_openowner(oo); } nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); - list_del(&clp->cl_idhash); - if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) - rb_erase(&clp->cl_namenode, &nn->conf_name_tree); - else - rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); - spin_lock(&nn->client_lock); - unhash_client_locked(clp); - WARN_ON_ONCE(atomic_read(&clp->cl_refcount)); free_client(clp); - spin_unlock(&nn->client_lock); +} + +static void +destroy_client(struct nfs4_client *clp) +{ + unhash_client(clp); + __destroy_client(clp); } static void expire_client(struct nfs4_client *clp) { + unhash_client(clp); nfsd4_client_record_remove(clp); - destroy_client(clp); + __destroy_client(clp); } static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) @@ -1408,25 +1797,28 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); } -static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) +static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) { - static u32 current_clientid = 1; + __be32 verf[2]; - clp->cl_clientid.cl_boot = nn->boot_time; - clp->cl_clientid.cl_id = current_clientid++; + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy + */ + verf[0] = (__force __be32)get_seconds(); + verf[1] = (__force __be32)nn->clientid_counter; + memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } -static void gen_confirm(struct nfs4_client *clp) +static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) { - __be32 verf[2]; - static u32 i; - - verf[0] = (__be32)get_seconds(); - verf[1] = (__be32)i++; - memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); + clp->cl_clientid.cl_boot = nn->boot_time; + clp->cl_clientid.cl_id = nn->clientid_counter++; + gen_confirm(clp, nn); } -static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) +static struct nfs4_stid * +find_stateid_locked(struct nfs4_client *cl, stateid_t *t) { struct nfs4_stid *ret; @@ -1436,16 +1828,21 @@ static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) return ret; } -static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +static struct nfs4_stid * +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) { struct nfs4_stid *s; - s = find_stateid(cl, t); - if (!s) - return NULL; - if (typemask & s->sc_type) - return s; - return NULL; + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, t); + if (s != NULL) { + if (typemask & s->sc_type) + atomic_inc(&s->sc_count); + else + s = NULL; + } + spin_unlock(&cl->cl_lock); + return s; } static struct nfs4_client *create_client(struct xdr_netobj name, @@ -1455,7 +1852,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, struct sockaddr *sa = svc_addr(rqstp); int ret; struct net *net = SVC_NET(rqstp); - struct nfsd_net *nn = net_generic(net, nfsd_net_id); clp = alloc_client(name); if (clp == NULL) @@ -1463,17 +1859,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name, ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { - spin_lock(&nn->client_lock); free_client(clp); - spin_unlock(&nn->client_lock); return NULL; } - nfsd4_init_callback(&clp->cl_cb_null); + INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); - gen_confirm(clp); clp->cl_cb_session = NULL; clp->net = net; return clp; @@ -1525,11 +1918,13 @@ add_to_unconfirmed(struct nfs4_client *clp) unsigned int idhashval; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + lockdep_assert_held(&nn->client_lock); + clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); add_clp_to_name_tree(clp, &nn->unconf_name_tree); idhashval = clientid_hashval(clp->cl_clientid.cl_id); list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); - renew_client(clp); + renew_client_locked(clp); } static void @@ -1538,12 +1933,14 @@ move_to_confirmed(struct nfs4_client *clp) unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + lockdep_assert_held(&nn->client_lock); + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); - renew_client(clp); + renew_client_locked(clp); } static struct nfs4_client * @@ -1556,7 +1953,7 @@ find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; - renew_client(clp); + renew_client_locked(clp); return clp; } } @@ -1568,6 +1965,7 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct list_head *tbl = nn->conf_id_hashtbl; + lockdep_assert_held(&nn->client_lock); return find_client_in_id_table(tbl, clid, sessions); } @@ -1576,6 +1974,7 @@ find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { struct list_head *tbl = nn->unconf_id_hashtbl; + lockdep_assert_held(&nn->client_lock); return find_client_in_id_table(tbl, clid, sessions); } @@ -1587,12 +1986,14 @@ static bool clp_used_exchangeid(struct nfs4_client *clp) static struct nfs4_client * find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { + lockdep_assert_held(&nn->client_lock); return find_clp_in_name_tree(name, &nn->conf_name_tree); } static struct nfs4_client * find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) { + lockdep_assert_held(&nn->client_lock); return find_clp_in_name_tree(name, &nn->unconf_name_tree); } @@ -1642,7 +2043,7 @@ out_err: /* * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. */ -void +static void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { struct xdr_buf *buf = resp->xdr.buf; @@ -1758,7 +2159,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_exchange_id *exid) { - struct nfs4_client *unconf, *conf, *new; + struct nfs4_client *conf, *new; + struct nfs4_client *unconf = NULL; __be32 status; char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; @@ -1787,8 +2189,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, return nfserr_encr_alg_unsupp; } + new = create_client(exid->clname, rqstp, &verf); + if (new == NULL) + return nfserr_jukebox; + /* Cases below refer to rfc 5661 section 18.35.4: */ - nfs4_lock_state(); + spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&exid->clname, nn); if (conf) { bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); @@ -1813,7 +2219,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, } /* case 6 */ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; - new = conf; goto out_copy; } if (!creds_match) { /* case 3 */ @@ -1821,15 +2226,14 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, status = nfserr_clid_inuse; goto out; } - expire_client(conf); goto out_new; } if (verfs_match) { /* case 2 */ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; - new = conf; goto out_copy; } /* case 5, client reboot */ + conf = NULL; goto out_new; } @@ -1840,33 +2244,38 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ - expire_client(unconf); + unhash_client_locked(unconf); /* case 1 (normal case) */ out_new: - new = create_client(exid->clname, rqstp, &verf); - if (new == NULL) { - status = nfserr_jukebox; - goto out; + if (conf) { + status = mark_client_expired_locked(conf); + if (status) + goto out; } new->cl_minorversion = cstate->minorversion; new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); gen_clid(new, nn); add_to_unconfirmed(new); + swap(new, conf); out_copy: - exid->clientid.cl_boot = new->cl_clientid.cl_boot; - exid->clientid.cl_id = new->cl_clientid.cl_id; + exid->clientid.cl_boot = conf->cl_clientid.cl_boot; + exid->clientid.cl_id = conf->cl_clientid.cl_id; - exid->seqid = new->cl_cs_slot.sl_seqid + 1; - nfsd4_set_ex_flags(new, exid); + exid->seqid = conf->cl_cs_slot.sl_seqid + 1; + nfsd4_set_ex_flags(conf, exid); dprintk("nfsd4_exchange_id seqid %d flags %x\n", - new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); + conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); status = nfs_ok; out: - nfs4_unlock_state(); + spin_unlock(&nn->client_lock); + if (new) + expire_client(new); + if (unconf) + expire_client(unconf); return status; } @@ -2010,6 +2419,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, { struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; + struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; struct nfsd4_clid_slot *cs_slot = NULL; @@ -2035,7 +2445,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (!conn) goto out_free_session; - nfs4_lock_state(); + spin_lock(&nn->client_lock); unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); WARN_ON_ONCE(conf && unconf); @@ -2054,7 +2464,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } } else if (unconf) { - struct nfs4_client *old; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { status = nfserr_clid_inuse; @@ -2072,10 +2481,11 @@ nfsd4_create_session(struct svc_rqst *rqstp, } old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { - status = mark_client_expired(old); - if (status) + status = mark_client_expired_locked(old); + if (status) { + old = NULL; goto out_free_conn; - expire_client(old); + } } move_to_confirmed(unconf); conf = unconf; @@ -2091,20 +2501,27 @@ nfsd4_create_session(struct svc_rqst *rqstp, cr_ses->flags &= ~SESSION4_RDMA; init_session(rqstp, new, conf, cr_ses); - nfsd4_init_conn(rqstp, conn, new); + nfsd4_get_session_locked(new); memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; - /* cache solo and embedded create sessions under the state lock */ + /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); - nfs4_unlock_state(); + spin_unlock(&nn->client_lock); + /* init connection and backchannel */ + nfsd4_init_conn(rqstp, conn, new); + nfsd4_put_session(new); + if (old) + expire_client(old); return status; out_free_conn: - nfs4_unlock_state(); + spin_unlock(&nn->client_lock); free_conn(conn); + if (old) + expire_client(old); out_free_session: __free_session(new); out_release_drc_mem: @@ -2152,17 +2569,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, __be32 status; struct nfsd4_conn *conn; struct nfsd4_session *session; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; - nfs4_lock_state(); spin_lock(&nn->client_lock); - session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); + session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status); spin_unlock(&nn->client_lock); - status = nfserr_badsession; if (!session) - goto out; + goto out_no_session; status = nfserr_wrong_cred; if (!mach_creds_match(session->se_client, rqstp)) goto out; @@ -2176,7 +2592,8 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, nfsd4_init_conn(rqstp, conn, session); status = nfs_ok; out: - nfs4_unlock_state(); + nfsd4_put_session(session); +out_no_session: return status; } @@ -2195,9 +2612,9 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_session *ses; __be32 status; int ref_held_by_me = 0; - struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); + struct net *net = SVC_NET(r); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfs4_lock_state(); status = nfserr_not_only_op; if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { if (!nfsd4_last_compound_op(r)) @@ -2206,14 +2623,12 @@ nfsd4_destroy_session(struct svc_rqst *r, } dump_sessionid(__func__, &sessionid->sessionid); spin_lock(&nn->client_lock); - ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); - status = nfserr_badsession; + ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status); if (!ses) goto out_client_lock; status = nfserr_wrong_cred; if (!mach_creds_match(ses->se_client, r)) - goto out_client_lock; - nfsd4_get_session_locked(ses); + goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) goto out_put_session; @@ -2225,11 +2640,10 @@ nfsd4_destroy_session(struct svc_rqst *r, spin_lock(&nn->client_lock); status = nfs_ok; out_put_session: - nfsd4_put_session(ses); + nfsd4_put_session_locked(ses); out_client_lock: spin_unlock(&nn->client_lock); out: - nfs4_unlock_state(); return status; } @@ -2300,7 +2714,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_conn *conn; __be32 status; int buflen; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (resp->opcnt != 1) return nfserr_sequence_pos; @@ -2314,17 +2729,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, return nfserr_jukebox; spin_lock(&nn->client_lock); - status = nfserr_badsession; - session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp)); + session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status); if (!session) goto out_no_session; clp = session->se_client; - status = get_client_locked(clp); - if (status) - goto out_no_session; - status = nfsd4_get_session_locked(session); - if (status) - goto out_put_client; status = nfserr_too_many_ops; if (nfsd4_session_too_many_ops(rqstp, session)) @@ -2354,6 +2762,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, goto out_put_session; cstate->slot = slot; cstate->session = session; + cstate->clp = clp; /* Return the cached reply status and set cstate->status * for nfsd4_proc_compound processing */ status = nfsd4_replay_cache_entry(resp, seq); @@ -2388,6 +2797,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, cstate->slot = slot; cstate->session = session; + cstate->clp = clp; out: switch (clp->cl_cb_state) { @@ -2408,31 +2818,48 @@ out_no_session: spin_unlock(&nn->client_lock); return status; out_put_session: - nfsd4_put_session(session); -out_put_client: - put_client_renew_locked(clp); + nfsd4_put_session_locked(session); goto out_no_session; } +void +nfsd4_sequence_done(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compound_state *cs = &resp->cstate; + + if (nfsd4_has_session(cs)) { + if (cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; + } + /* Drop session reference that was taken in nfsd4_sequence() */ + nfsd4_put_session(cs->session); + } else if (cs->clp) + put_client_renew(cs->clp); +} + __be32 nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) { - struct nfs4_client *conf, *unconf, *clp; + struct nfs4_client *conf, *unconf; + struct nfs4_client *clp = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - nfs4_lock_state(); + spin_lock(&nn->client_lock); unconf = find_unconfirmed_client(&dc->clientid, true, nn); conf = find_confirmed_client(&dc->clientid, true, nn); WARN_ON_ONCE(conf && unconf); if (conf) { - clp = conf; - if (client_has_state(conf)) { status = nfserr_clientid_busy; goto out; } + status = mark_client_expired_locked(conf); + if (status) + goto out; + clp = conf; } else if (unconf) clp = unconf; else { @@ -2440,12 +2867,15 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta goto out; } if (!mach_creds_match(clp, rqstp)) { + clp = NULL; status = nfserr_wrong_cred; goto out; } - expire_client(clp); + unhash_client_locked(clp); out: - nfs4_unlock_state(); + spin_unlock(&nn->client_lock); + if (clp) + expire_client(clp); return status; } @@ -2464,7 +2894,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta return nfs_ok; } - nfs4_lock_state(); status = nfserr_complete_already; if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->session->se_client->cl_flags)) @@ -2484,7 +2913,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta status = nfs_ok; nfsd4_client_record_create(cstate->session->se_client); out: - nfs4_unlock_state(); return status; } @@ -2494,12 +2922,16 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; - struct nfs4_client *conf, *unconf, *new; + struct nfs4_client *conf, *new; + struct nfs4_client *unconf = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + new = create_client(clname, rqstp, &clverifier); + if (new == NULL) + return nfserr_jukebox; /* Cases below refer to rfc 3530 section 14.2.33: */ - nfs4_lock_state(); + spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); if (conf) { /* case 0: */ @@ -2517,11 +2949,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) - expire_client(unconf); - status = nfserr_jukebox; - new = create_client(clname, rqstp, &clverifier); - if (new == NULL) - goto out; + unhash_client_locked(unconf); if (conf && same_verf(&conf->cl_verifier, &clverifier)) /* case 1: probable callback update */ copy_clid(new, conf); @@ -2533,9 +2961,14 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); + new = NULL; status = nfs_ok; out: - nfs4_unlock_state(); + spin_unlock(&nn->client_lock); + if (new) + free_client(new); + if (unconf) + expire_client(unconf); return status; } @@ -2546,6 +2979,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm) { struct nfs4_client *conf, *unconf; + struct nfs4_client *old = NULL; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; __be32 status; @@ -2553,8 +2987,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; - nfs4_lock_state(); + spin_lock(&nn->client_lock); conf = find_confirmed_client(clid, false, nn); unconf = find_unconfirmed_client(clid, false, nn); /* @@ -2578,22 +3012,30 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } status = nfs_ok; if (conf) { /* case 1: callback update */ + old = unconf; + unhash_client_locked(old); nfsd4_change_callback(conf, &unconf->cl_cb_conn); - nfsd4_probe_callback(conf); - expire_client(unconf); } else { /* case 3: normal case; new or rebooted client */ - conf = find_confirmed_client_by_name(&unconf->cl_name, nn); - if (conf) { - status = mark_client_expired(conf); - if (status) + old = find_confirmed_client_by_name(&unconf->cl_name, nn); + if (old) { + status = mark_client_expired_locked(old); + if (status) { + old = NULL; goto out; - expire_client(conf); + } } move_to_confirmed(unconf); - nfsd4_probe_callback(unconf); + conf = unconf; } + get_client_locked(conf); + spin_unlock(&nn->client_lock); + nfsd4_probe_callback(conf); + spin_lock(&nn->client_lock); + put_client_renew_locked(conf); out: - nfs4_unlock_state(); + spin_unlock(&nn->client_lock); + if (old) + expire_client(old); return status; } @@ -2603,21 +3045,23 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) +static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) { - unsigned int hashval = file_hashval(ino); + unsigned int hashval = file_hashval(fh); + + lockdep_assert_held(&state_lock); atomic_set(&fp->fi_ref, 1); + spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); - fp->fi_inode = igrab(ino); + fh_copy_shallow(&fp->fi_fhandle, fh); fp->fi_had_conflict = false; fp->fi_lease = NULL; + fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&state_lock); hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&state_lock); } void @@ -2673,6 +3117,28 @@ static void init_nfs4_replay(struct nfs4_replay *rp) rp->rp_status = nfserr_serverfault; rp->rp_buflen = 0; rp->rp_buf = rp->rp_ibuf; + mutex_init(&rp->rp_mutex); +} + +static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, + struct nfs4_stateowner *so) +{ + if (!nfsd4_has_session(cstate)) { + mutex_lock(&so->so_replay.rp_mutex); + cstate->replay_owner = so; + atomic_inc(&so->so_count); + } +} + +void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (so != NULL) { + cstate->replay_owner = NULL; + mutex_unlock(&so->so_replay.rp_mutex); + nfs4_put_stateowner(so); + } } static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) @@ -2693,111 +3159,172 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj INIT_LIST_HEAD(&sop->so_stateids); sop->so_client = clp; init_nfs4_replay(&sop->so_replay); + atomic_set(&sop->so_count, 1); return sop; } static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + lockdep_assert_held(&clp->cl_lock); - list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } +static void nfs4_unhash_openowner(struct nfs4_stateowner *so) +{ + unhash_openowner_locked(openowner(so)); +} + +static void nfs4_free_openowner(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo = openowner(so); + + kmem_cache_free(openowner_slab, oo); +} + +static const struct nfs4_stateowner_operations openowner_ops = { + .so_unhash = nfs4_unhash_openowner, + .so_free = nfs4_free_openowner, +}; + static struct nfs4_openowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { - struct nfs4_openowner *oo; +alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, + struct nfsd4_compound_state *cstate) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_openowner *oo, *ret; oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); if (!oo) return NULL; + oo->oo_owner.so_ops = &openowner_ops; oo->oo_owner.so_is_open_owner = 1; oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_flags = NFS4_OO_NEW; + oo->oo_flags = 0; + if (nfsd4_has_session(cstate)) + oo->oo_flags |= NFS4_OO_CONFIRMED; oo->oo_time = 0; oo->oo_last_closed_stid = NULL; INIT_LIST_HEAD(&oo->oo_close_lru); - hash_openowner(oo, clp, strhashval); + spin_lock(&clp->cl_lock); + ret = find_openstateowner_str_locked(strhashval, open, clp); + if (ret == NULL) { + hash_openowner(oo, clp, strhashval); + ret = oo; + } else + nfs4_free_openowner(&oo->oo_owner); + spin_unlock(&clp->cl_lock); return oo; } static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; + atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; - INIT_LIST_HEAD(&stp->st_lockowners); - list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); - list_add(&stp->st_perfile, &fp->fi_stateids); + INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = &oo->oo_owner; + atomic_inc(&stp->st_stateowner->so_count); get_nfs4_file(fp); - stp->st_file = fp; + stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - set_access(open->op_share_access, stp); - set_deny(open->op_share_deny, stp); stp->st_openstp = NULL; + spin_lock(&oo->oo_owner.so_client->cl_lock); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); + spin_lock(&fp->fi_lock); + list_add(&stp->st_perfile, &fp->fi_stateids); + spin_unlock(&fp->fi_lock); + spin_unlock(&oo->oo_owner.so_client->cl_lock); } +/* + * In the 4.0 case we need to keep the owners around a little while to handle + * CLOSE replay. We still do need to release any file access that is held by + * them before returning however. + */ static void -move_to_close_lru(struct nfs4_openowner *oo, struct net *net) +move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) { - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfs4_ol_stateid *last; + struct nfs4_openowner *oo = openowner(s->st_stateowner); + struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net, + nfsd_net_id); dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); + /* + * We know that we hold one reference via nfsd4_close, and another + * "persistent" reference for the client. If the refcount is higher + * than 2, then there are still calls in progress that are using this + * stateid. We can't put the sc_file reference until they are finished. + * Wait for the refcount to drop to 2. Since it has been unhashed, + * there should be no danger of the refcount going back up again at + * this point. + */ + wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2); + + release_all_access(s); + if (s->st_stid.sc_file) { + put_nfs4_file(s->st_stid.sc_file); + s->st_stid.sc_file = NULL; + } + + spin_lock(&nn->client_lock); + last = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = s; list_move_tail(&oo->oo_close_lru, &nn->close_lru); oo->oo_time = get_seconds(); + spin_unlock(&nn->client_lock); + if (last) + nfs4_put_stid(&last->st_stid); } -static int -same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, - clientid_t *clid) +/* search file_hashtbl[] for file */ +static struct nfs4_file * +find_file_locked(struct knfsd_fh *fh) { - return (sop->so_owner.len == owner->len) && - 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && - (sop->so_client->cl_clientid.cl_id == clid->cl_id); -} + unsigned int hashval = file_hashval(fh); + struct nfs4_file *fp; -static struct nfs4_openowner * -find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, - bool sessions, struct nfsd_net *nn) -{ - struct nfs4_stateowner *so; - struct nfs4_openowner *oo; - struct nfs4_client *clp; + lockdep_assert_held(&state_lock); - list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) { - if (!so->so_is_open_owner) - continue; - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { - oo = openowner(so); - clp = oo->oo_owner.so_client; - if ((bool)clp->cl_minorversion != sessions) - return NULL; - renew_client(oo->oo_owner.so_client); - return oo; + hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + if (nfsd_fh_match(&fp->fi_fhandle, fh)) { + get_nfs4_file(fp); + return fp; } } return NULL; } -/* search file_hashtbl[] for file */ static struct nfs4_file * -find_file(struct inode *ino) +find_file(struct knfsd_fh *fh) { - unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; spin_lock(&state_lock); - hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { - if (fp->fi_inode == ino) { - get_nfs4_file(fp); - spin_unlock(&state_lock); - return fp; - } + fp = find_file_locked(fh); + spin_unlock(&state_lock); + return fp; +} + +static struct nfs4_file * +find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) +{ + struct nfs4_file *fp; + + spin_lock(&state_lock); + fp = find_file_locked(fh); + if (fp == NULL) { + nfsd4_init_file(new, fh); + fp = new; } spin_unlock(&state_lock); - return NULL; + + return fp; } /* @@ -2807,47 +3334,53 @@ find_file(struct inode *ino) static __be32 nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { - struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_ol_stateid *stp; - __be32 ret; + __be32 ret = nfs_ok; - fp = find_file(ino); + fp = find_file(¤t_fh->fh_handle); if (!fp) - return nfs_ok; - ret = nfserr_locked; - /* Search for conflicting share reservations */ - list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { - if (test_deny(deny_type, stp) || - test_deny(NFS4_SHARE_DENY_BOTH, stp)) - goto out; - } - ret = nfs_ok; -out: + return ret; + /* Check for conflicting share reservations */ + spin_lock(&fp->fi_lock); + if (fp->fi_share_deny & deny_type) + ret = nfserr_locked; + spin_unlock(&fp->fi_lock); put_nfs4_file(fp); return ret; } -static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp) { - struct nfs4_client *clp = dp->dl_stid.sc_client; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); - lockdep_assert_held(&state_lock); - /* We're assuming the state code never drops its reference + block_delegations(&dp->dl_stid.sc_file->fi_fhandle); + + /* + * We can't do this in nfsd_break_deleg_cb because it is + * already holding inode->i_lock. + * + * If the dl_time != 0, then we know that it has already been + * queued for a lease break. Don't queue it again. + */ + spin_lock(&state_lock); + if (dp->dl_time == 0) { + dp->dl_time = get_seconds(); + list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); + } + spin_unlock(&state_lock); +} + +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +{ + /* + * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel * lock) we know the server hasn't removed the lease yet, we know - * it's safe to take a reference: */ - atomic_inc(&dp->dl_count); - - list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); - - /* Only place dl_time is set; protected by i_lock: */ - dp->dl_time = get_seconds(); - - block_delegations(&dp->dl_fh); - + * it's safe to take a reference. + */ + atomic_inc(&dp->dl_stid.sc_count); nfsd4_cb_recall(dp); } @@ -2872,11 +3405,20 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&state_lock); + spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - nfsd_break_one_deleg(dp); - spin_unlock(&state_lock); + /* + * If there are no delegations on the list, then we can't count on this + * lease ever being cleaned up. Set the fl_break_time to jiffies so that + * time_out_leases will do it ASAP. The fact that fi_had_conflict is now + * true should keep any new delegations from being hashed. + */ + if (list_empty(&fp->fi_delegations)) + fl->fl_break_time = jiffies; + else + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); + spin_unlock(&fp->fi_lock); } static @@ -2904,6 +3446,42 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } +static __be32 lookup_clientid(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ + struct nfs4_client *found; + + if (cstate->clp) { + found = cstate->clp; + if (!same_clid(&found->cl_clientid, clid)) + return nfserr_stale_clientid; + return nfs_ok; + } + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + + /* + * For v4.1+ we get the client in the SEQUENCE op. If we don't have one + * cached already then we know this is for is for v4.0 and "sessions" + * will be false. + */ + WARN_ON_ONCE(cstate->session); + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, false, nn); + if (!found) { + spin_unlock(&nn->client_lock); + return nfserr_expired; + } + atomic_inc(&found->cl_refcount); + spin_unlock(&nn->client_lock); + + /* Cache the nfs4_client in cstate! */ + cstate->clp = found; + return nfs_ok; +} + __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct nfsd_net *nn) @@ -2924,19 +3502,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); - oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn); + status = lookup_clientid(clientid, cstate, nn); + if (status) + return status; + clp = cstate->clp; + + strhashval = ownerstr_hashval(&open->op_owner); + oo = find_openstateowner_str(strhashval, open, clp); open->op_openowner = oo; if (!oo) { - clp = find_confirmed_client(clientid, cstate->minorversion, - nn); - if (clp == NULL) - return nfserr_expired; goto new_owner; } if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ - clp = oo->oo_owner.so_client; release_openowner(oo); open->op_openowner = NULL; goto new_owner; @@ -2944,15 +3522,14 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; - clp = oo->oo_owner.so_client; goto alloc_stateid; new_owner: - oo = alloc_init_open_stateowner(strhashval, clp, open); + oo = alloc_init_open_stateowner(strhashval, open, cstate); if (oo == NULL) return nfserr_jukebox; open->op_openowner = oo; alloc_stateid: - open->op_stp = nfs4_alloc_stateid(clp); + open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; return nfs_ok; @@ -2994,14 +3571,18 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, { int flags; __be32 status = nfserr_bad_stateid; + struct nfs4_delegation *deleg; - *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); - if (*dp == NULL) + deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); + if (deleg == NULL) goto out; flags = share_access_to_flags(open->op_share_access); - status = nfs4_check_delegmode(*dp, flags); - if (status) - *dp = NULL; + status = nfs4_check_delegmode(deleg, flags); + if (status) { + nfs4_put_stid(&deleg->dl_stid); + goto out; + } + *dp = deleg; out: if (!nfsd4_is_deleg_cur(open)) return nfs_ok; @@ -3011,24 +3592,25 @@ out: return nfs_ok; } -static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) +static struct nfs4_ol_stateid * +nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_ol_stateid *local; + struct nfs4_ol_stateid *local, *ret = NULL; struct nfs4_openowner *oo = open->op_openowner; + spin_lock(&fp->fi_lock); list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - /* remember if we have seen this open owner */ - if (local->st_stateowner == &oo->oo_owner) - *stpp = local; - /* check for conflicting share reservations */ - if (!test_share(local, open)) - return nfserr_share_denied; + if (local->st_stateowner == &oo->oo_owner) { + ret = local; + atomic_inc(&ret->st_stid.sc_count); + break; + } } - return nfs_ok; + spin_unlock(&fp->fi_lock); + return ret; } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -3042,24 +3624,6 @@ static inline int nfs4_access_to_access(u32 nfs4_access) return flags; } -static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, - struct svc_fh *cur_fh, struct nfsd4_open *open) -{ - __be32 status; - int oflag = nfs4_access_to_omode(open->op_share_access); - int access = nfs4_access_to_access(open->op_share_access); - - if (!fp->fi_fds[oflag]) { - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, - &fp->fi_fds[oflag]); - if (status) - return status; - } - nfs4_file_get_access(fp, oflag); - - return nfs_ok; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) @@ -3075,34 +3639,99 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); } -static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access; - bool new_access; + struct file *filp = NULL; __be32 status; + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + unsigned char old_access_bmap, old_deny_bmap; - new_access = !test_access(op_share_access, stp); - if (new_access) { - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); - if (status) - return status; + spin_lock(&fp->fi_lock); + + /* + * Are we trying to set a deny mode that would conflict with + * current access? + */ + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; } - status = nfsd4_truncate(rqstp, cur_fh, open); - if (status) { - if (new_access) { - int oflag = nfs4_access_to_omode(op_share_access); - nfs4_file_put_access(fp, oflag); - } - return status; + + /* set access to the file */ + status = nfs4_file_get_access(fp, open->op_share_access); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; } - /* remember the open */ - set_access(op_share_access, stp); + + /* Set access bits in stateid */ + old_access_bmap = stp->st_access_bmap; + set_access(open->op_share_access, stp); + + /* Set new deny mask */ + old_deny_bmap = stp->st_deny_bmap; set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); - return nfs_ok; + if (!fp->fi_fds[oflag]) { + spin_unlock(&fp->fi_lock); + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + if (status) + goto out_put_access; + spin_lock(&fp->fi_lock); + if (!fp->fi_fds[oflag]) { + fp->fi_fds[oflag] = filp; + filp = NULL; + } + } + spin_unlock(&fp->fi_lock); + if (filp) + fput(filp); + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status) + goto out_put_access; +out: + return status; +out_put_access: + stp->st_access_bmap = old_access_bmap; + nfs4_file_put_access(fp, open->op_share_access); + reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp); + goto out; } +static __be32 +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +{ + __be32 status; + unsigned char old_deny_bmap; + + if (!test_access(open->op_share_access, stp)) + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + + /* test and set deny mode */ + spin_lock(&fp->fi_lock); + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status == nfs_ok) { + old_deny_bmap = stp->st_deny_bmap; + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } + spin_unlock(&fp->fi_lock); + + if (status != nfs_ok) + return status; + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status != nfs_ok) + reset_union_bmap_deny(old_deny_bmap, stp); + return status; +} static void nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) @@ -3123,7 +3752,7 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) { struct file_lock *fl; @@ -3135,53 +3764,101 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f fl->fl_flags = FL_DELEG; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)(dp->dl_file); + fl->fl_owner = (fl_owner_t)fp; fl->fl_pid = current->tgid; return fl; } static int nfs4_setlease(struct nfs4_delegation *dp) { - struct nfs4_file *fp = dp->dl_file; + struct nfs4_file *fp = dp->dl_stid.sc_file; struct file_lock *fl; - int status; + struct file *filp; + int status = 0; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ); if (!fl) return -ENOMEM; - fl->fl_file = find_readable_file(fp); - status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); - if (status) - goto out_free; + filp = find_readable_file(fp); + if (!filp) { + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return -EBADF; + } + fl->fl_file = filp; + status = vfs_setlease(filp, fl->fl_type, &fl); + if (status) { + locks_free_lock(fl); + goto out_fput; + } + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + /* Did the lease get broken before we took the lock? */ + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + /* Race breaker */ + if (fp->fi_lease) { + status = 0; + atomic_inc(&fp->fi_delegees); + hash_delegation_locked(dp, fp); + goto out_unlock; + } fp->fi_lease = fl; - fp->fi_deleg_file = get_file(fl->fl_file); + fp->fi_deleg_file = filp; atomic_set(&fp->fi_delegees, 1); - spin_lock(&state_lock); hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); return 0; -out_free: - locks_free_lock(fl); +out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); +out_fput: + fput(filp); return status; } -static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) +static struct nfs4_delegation * +nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, + struct nfs4_file *fp) { + int status; + struct nfs4_delegation *dp; + if (fp->fi_had_conflict) - return -EAGAIN; + return ERR_PTR(-EAGAIN); + + dp = alloc_init_deleg(clp, fh); + if (!dp) + return ERR_PTR(-ENOMEM); + get_nfs4_file(fp); - dp->dl_file = fp; - if (!fp->fi_lease) - return nfs4_setlease(dp); spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + dp->dl_stid.sc_file = fp; + if (!fp->fi_lease) { + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + status = nfs4_setlease(dp); + goto out; + } atomic_inc(&fp->fi_delegees); if (fp->fi_had_conflict) { - spin_unlock(&state_lock); - return -EAGAIN; + status = -EAGAIN; + goto out_unlock; } hash_delegation_locked(dp, fp); + status = 0; +out_unlock: + spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - return 0; +out: + if (status) { + nfs4_put_stid(&dp->dl_stid); + return ERR_PTR(status); + } + return dp; } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -3212,11 +3889,12 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) * proper support for them. */ static void -nfs4_open_delegation(struct net *net, struct svc_fh *fh, - struct nfsd4_open *open, struct nfs4_ol_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, + struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; - struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); + struct nfs4_openowner *oo = openowner(stp->st_stateowner); + struct nfs4_client *clp = stp->st_stid.sc_client; int cb_up; int status = 0; @@ -3235,7 +3913,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, * Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (locks_in_grace(net)) + if (locks_in_grace(clp->net)) goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; @@ -3254,21 +3932,17 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, default: goto out_no_deleg; } - dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh); - if (dp == NULL) + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file); + if (IS_ERR(dp)) goto out_no_deleg; - status = nfs4_set_delegation(dp, stp->st_file); - if (status) - goto out_free; memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", STATEID_VAL(&dp->dl_stid.sc_stateid)); open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + nfs4_put_stid(&dp->dl_stid); return; -out_free: - destroy_delegation(dp); out_no_deleg: open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && @@ -3301,16 +3975,12 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, */ } -/* - * called with nfs4_lock_state() held. - */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; - struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -3320,21 +3990,18 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ - fp = find_file(ino); - if (fp) { - if ((status = nfs4_check_open(fp, open, &stp))) - goto out; + fp = find_or_add_file(open->op_file, ¤t_fh->fh_handle); + if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; + stp = nfsd4_find_existing_open(fp, open); } else { + open->op_file = NULL; status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; status = nfserr_jukebox; - fp = open->op_file; - open->op_file = NULL; - nfsd4_init_file(fp, ino); } /* @@ -3347,22 +4014,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (status) goto out; } else { - status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); - if (status) - goto out; - status = nfsd4_truncate(rqstp, current_fh, open); - if (status) - goto out; stp = open->op_stp; open->op_stp = NULL; init_open_stateid(stp, fp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + if (status) { + release_open_stateid(stp); + goto out; + } } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) { - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; @@ -3374,7 +4038,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ - nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp); + nfs4_open_delegation(current_fh, open, stp); nodeleg: status = nfs_ok; @@ -3397,41 +4061,27 @@ out: if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) + nfs4_put_stid(&dp->dl_stid); + if (stp) + nfs4_put_stid(&stp->st_stid); return status; } -void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, __be32 status) { if (open->op_openowner) { - struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_stateowner *so = &open->op_openowner->oo_owner; - if (!list_empty(&oo->oo_owner.so_stateids)) - list_del_init(&oo->oo_close_lru); - if (oo->oo_flags & NFS4_OO_NEW) { - if (status) { - release_openowner(oo); - open->op_openowner = NULL; - } else - oo->oo_flags &= ~NFS4_OO_NEW; - } + nfsd4_cstate_assign_replay(cstate, so); + nfs4_put_stateowner(so); } if (open->op_file) nfsd4_free_file(open->op_file); if (open->op_stp) - free_generic_stateid(open->op_stp); -} - -static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp) -{ - struct nfs4_client *found; - - if (STALE_CLIENTID(clid, nn)) - return nfserr_stale_clientid; - found = find_confirmed_client(clid, session, nn); - if (clp) - *clp = found; - return found ? nfs_ok : nfserr_expired; + nfs4_put_stid(&open->op_stp->st_stid); } __be32 @@ -3442,19 +4092,18 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - nfs4_lock_state(); dprintk("process_renew(%08x/%08x): starting\n", clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate->minorversion, nn, &clp); + status = lookup_clientid(clid, cstate, nn); if (status) goto out; + clp = cstate->clp; status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) goto out; status = nfs_ok; out: - nfs4_unlock_state(); return status; } @@ -3483,12 +4132,11 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_client *clp; struct nfs4_openowner *oo; struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; time_t t, new_timeo = nn->nfsd4_lease; - nfs4_lock_state(); - dprintk("NFSD: laundromat service - starting\n"); nfsd4_end_grace(nn); INIT_LIST_HEAD(&reaplist); @@ -3505,13 +4153,14 @@ nfs4_laundromat(struct nfsd_net *nn) clp->cl_clientid.cl_id); continue; } - list_move(&clp->cl_lru, &reaplist); + list_add(&clp->cl_lru, &reaplist); } spin_unlock(&nn->client_lock); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); dprintk("NFSD: purging unused client (clientid %08x)\n", clp->cl_clientid.cl_id); + list_del_init(&clp->cl_lru); expire_client(clp); } spin_lock(&state_lock); @@ -3524,24 +4173,37 @@ nfs4_laundromat(struct nfsd_net *nn) new_timeo = min(new_timeo, t); break; } - list_move(&dp->dl_recall_lru, &reaplist); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); - list_for_each_safe(pos, next, &reaplist) { - dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + while (!list_empty(&reaplist)) { + dp = list_first_entry(&reaplist, struct nfs4_delegation, + dl_recall_lru); + list_del_init(&dp->dl_recall_lru); revoke_delegation(dp); } - list_for_each_safe(pos, next, &nn->close_lru) { - oo = container_of(pos, struct nfs4_openowner, oo_close_lru); - if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + + spin_lock(&nn->client_lock); + while (!list_empty(&nn->close_lru)) { + oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, + oo_close_lru); + if (time_after((unsigned long)oo->oo_time, + (unsigned long)cutoff)) { t = oo->oo_time - cutoff; new_timeo = min(new_timeo, t); break; } - release_openowner(oo); + list_del_init(&oo->oo_close_lru); + stp = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = NULL; + spin_unlock(&nn->client_lock); + nfs4_put_stid(&stp->st_stid); + spin_lock(&nn->client_lock); } + spin_unlock(&nn->client_lock); + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); - nfs4_unlock_state(); return new_timeo; } @@ -3564,7 +4226,7 @@ laundromat_main(struct work_struct *laundry) static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -3666,10 +4328,10 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; struct nfs4_ol_stateid *ols; - __be32 status; + __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return nfserr_bad_stateid; + return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { char addr_str[INET6_ADDRSTRLEN]; @@ -3677,53 +4339,62 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) sizeof(addr_str)); pr_warn_ratelimited("NFSD: client %s testing state ID " "with incorrect client ID\n", addr_str); - return nfserr_bad_stateid; + return status; } - s = find_stateid(cl, stateid); + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); if (!s) - return nfserr_bad_stateid; + goto out_unlock; status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) - return status; + goto out_unlock; switch (s->sc_type) { case NFS4_DELEG_STID: - return nfs_ok; + status = nfs_ok; + break; case NFS4_REVOKED_DELEG_STID: - return nfserr_deleg_revoked; + status = nfserr_deleg_revoked; + break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: ols = openlockstateid(s); if (ols->st_stateowner->so_is_open_owner && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) - return nfserr_bad_stateid; - return nfs_ok; + status = nfserr_bad_stateid; + else + status = nfs_ok; + break; default: printk("unknown stateid type %x\n", s->sc_type); + /* Fallthrough */ case NFS4_CLOSED_STID: - return nfserr_bad_stateid; + case NFS4_CLOSED_DELEG_STID: + status = nfserr_bad_stateid; } +out_unlock: + spin_unlock(&cl->cl_lock); + return status; } -static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, bool sessions, - struct nfsd_net *nn) +static __be32 +nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn) { - struct nfs4_client *cl; __be32 status; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; - status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, - nn, &cl); + status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { - if (sessions) + if (cstate->session) return nfserr_bad_stateid; return nfserr_stale_stateid; } if (status) return status; - *s = find_stateid_by_type(cl, stateid, typemask); + *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; return nfs_ok; @@ -3754,12 +4425,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(net, current_fh, stateid, flags); - nfs4_lock_state(); - - status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, cstate->minorversion, nn); + status = nfsd4_lookup_stateid(cstate, stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, + &s, nn); if (status) - goto out; + return status; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3770,12 +4440,13 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { - file = dp->dl_file->fi_deleg_file; + file = dp->dl_stid.sc_file->fi_deleg_file; if (!file) { WARN_ON_ONCE(1); status = nfserr_serverfault; goto out; } + get_file(file); } break; case NFS4_OPEN_STID: @@ -3791,10 +4462,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { + struct nfs4_file *fp = stp->st_stid.sc_file; + if (flags & RD_STATE) - file = find_readable_file(stp->st_file); + file = find_readable_file(fp); else - file = find_writeable_file(stp->st_file); + file = find_writeable_file(fp); } break; default: @@ -3803,28 +4476,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, } status = nfs_ok; if (file) - *filpp = get_file(file); + *filpp = file; out: - nfs4_unlock_state(); + nfs4_put_stid(s); return status; } -static __be32 -nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) -{ - struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - - if (check_for_locks(stp->st_file, lo)) - return nfserr_locks_held; - /* - * Currently there's a 1-1 lock stateid<->lockowner - * correspondance, and we have to delete the lockowner when we - * delete the lock stateid: - */ - release_lockowner(lo); - return nfs_ok; -} - /* * Test if the stateid is valid */ @@ -3835,11 +4492,9 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid_id *stateid; struct nfs4_client *cl = cstate->session->se_client; - nfs4_lock_state(); list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); - nfs4_unlock_state(); return nfs_ok; } @@ -3851,37 +4506,50 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; - nfs4_lock_state(); - s = find_stateid(cl, stateid); + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); if (!s) - goto out; + goto out_unlock; switch (s->sc_type) { case NFS4_DELEG_STID: ret = nfserr_locks_held; - goto out; + break; case NFS4_OPEN_STID: - case NFS4_LOCK_STID: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) - goto out; - if (s->sc_type == NFS4_LOCK_STID) - ret = nfsd4_free_lock_stateid(openlockstateid(s)); - else - ret = nfserr_locks_held; + break; + ret = nfserr_locks_held; break; + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + break; + stp = openlockstateid(s); + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + break; + unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; case NFS4_REVOKED_DELEG_STID: dp = delegstateid(s); - destroy_revoked_delegation(dp); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); ret = nfs_ok; - break; - default: - ret = nfserr_bad_stateid; + goto out; + /* Default falls through and returns nfserr_bad_stateid */ } +out_unlock: + spin_unlock(&cl->cl_lock); out: - nfs4_unlock_state(); return ret; } @@ -3926,20 +4594,24 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, { __be32 status; struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - status = nfsd4_lookup_stateid(stateid, typemask, &s, - cstate->minorversion, nn); + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); if (status) return status; - *stpp = openlockstateid(s); - if (!nfsd4_has_session(cstate)) - cstate->replay_owner = (*stpp)->st_stateowner; + stp = openlockstateid(s); + nfsd4_cstate_assign_replay(cstate, stp->st_stateowner); - return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); + status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); + if (!status) + *stpp = stp; + else + nfs4_put_stid(&stp->st_stid); + return status; } static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, @@ -3947,14 +4619,18 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs { __be32 status; struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, stpp, nn); + NFS4_OPEN_STID, &stp, nn); if (status) return status; - oo = openowner((*stpp)->st_stateowner); - if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) + oo = openowner(stp->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; + } + *stpp = stp; return nfs_ok; } @@ -3974,8 +4650,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - nfs4_lock_state(); - status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, NFS4_OPEN_STID, &stp, nn); @@ -3984,7 +4658,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) - goto out; + goto put_stateid; oo->oo_flags |= NFS4_OO_CONFIRMED; update_stateid(&stp->st_stid.sc_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -3993,10 +4667,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } @@ -4004,7 +4678,7 @@ static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 a { if (!test_access(access, stp)) return; - nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + nfs4_file_put_access(stp->st_stid.sc_file, access); clear_access(access, stp); } @@ -4026,16 +4700,6 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac } } -static void -reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp) -{ - int i; - for (i = 0; i < 4; i++) { - if ((i & deny) != i) - clear_deny(i, stp); - } -} - __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -4053,21 +4717,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, od->od_deleg_want); - nfs4_lock_state(); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, &od->od_stateid, &stp, nn); if (status) goto out; status = nfserr_inval; if (!test_access(od->od_share_access, stp)) { - dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n", + dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n", stp->st_access_bmap, od->od_share_access); - goto out; + goto put_stateid; } if (!test_deny(od->od_share_deny, stp)) { - dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", + dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n", stp->st_deny_bmap, od->od_share_deny); - goto out; + goto put_stateid; } nfs4_stateid_downgrade(stp, od->od_share_access); @@ -4076,17 +4739,31 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, update_stateid(&stp->st_stid.sc_stateid); memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { - unhash_open_stateid(s); + struct nfs4_client *clp = s->st_stid.sc_client; + LIST_HEAD(reaplist); + s->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&clp->cl_lock); + unhash_open_stateid(s, &reaplist); + + if (clp->cl_minorversion) { + put_ol_stateid_locked(s, &reaplist); + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + } else { + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + move_to_close_lru(s, clp->net); + } } /* @@ -4097,7 +4774,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; - struct nfs4_openowner *oo; struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -4105,7 +4781,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); - nfs4_lock_state(); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, NFS4_OPEN_STID|NFS4_CLOSED_STID, @@ -4113,31 +4788,14 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; - oo = openowner(stp->st_stateowner); update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); nfsd4_close_open_stateid(stp); - if (cstate->minorversion) - free_generic_stateid(stp); - else - oo->oo_last_closed_stid = stp; - - if (list_empty(&oo->oo_owner.so_stateids)) { - if (cstate->minorversion) - release_openowner(oo); - else { - /* - * In the 4.0 case we need to keep the owners around a - * little while to handle CLOSE replay. - */ - move_to_close_lru(oo, SVC_NET(rqstp)); - } - } + /* put reference from nfs4_preprocess_seqid_op */ + nfs4_put_stid(&stp->st_stid); out: - if (!cstate->replay_owner) - nfs4_unlock_state(); return status; } @@ -4154,28 +4812,24 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - nfs4_lock_state(); - status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, - cstate->minorversion, nn); + status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); if (status) goto out; dp = delegstateid(s); status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) - goto out; + goto put_stateid; destroy_delegation(dp); +put_stateid: + nfs4_put_stid(&dp->dl_stid); out: - nfs4_unlock_state(); - return status; } #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) -#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) - static inline u64 end_offset(u64 start, u64 len) { @@ -4196,13 +4850,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) -{ - return (file_hashval(inode) + cl_id - + opaque_hashval(ownername->data, ownername->len)) - & LOCKOWNER_INO_HASH_MASK; -} - /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that * we can't properly handle lock requests that go beyond the (2^63 - 1)-th @@ -4255,47 +4902,56 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } -static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) +static struct nfs4_lockowner * +find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, + struct nfs4_client *clp) { - struct nfs4_ol_stateid *lst; + unsigned int strhashval = ownerstr_hashval(owner); + struct nfs4_stateowner *so; - if (!same_owner_str(&lo->lo_owner, owner, clid)) - return false; - if (list_empty(&lo->lo_owner.so_stateids)) { - WARN_ON_ONCE(1); - return false; + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval], + so_strhash) { + if (so->so_is_open_owner) + continue; + if (!same_owner_str(so, owner)) + continue; + atomic_inc(&so->so_count); + return lockowner(so); } - lst = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - return lst->st_file->fi_inode == inode; + return NULL; } static struct nfs4_lockowner * -find_lockowner_str(struct inode *inode, clientid_t *clid, - struct xdr_netobj *owner, struct nfsd_net *nn) +find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner, + struct nfs4_client *clp) { - unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); struct nfs4_lockowner *lo; - list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { - if (same_lockowner_ino(lo, inode, clid, owner)) - return lo; - } - return NULL; + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clid, owner, clp); + spin_unlock(&clp->cl_lock); + return lo; } -static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) +static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop) { - struct inode *inode = open_stp->st_file->fi_inode; - unsigned int inohash = lockowner_ino_hashval(inode, - clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + unhash_lockowner_locked(lockowner(sop)); +} + +static void nfs4_free_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_lockowner *lo = lockowner(sop); - list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); - list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]); - list_add(&lo->lo_perstateid, &open_stp->st_lockowners); + kmem_cache_free(lockowner_slab, lo); } +static const struct nfs4_stateowner_operations lockowner_ops = { + .so_unhash = nfs4_unhash_lockowner, + .so_free = nfs4_free_lockowner, +}; + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -4303,42 +4959,107 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s * * strhashval = ownerstr_hashval */ - static struct nfs4_lockowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_lockowner *lo; +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, + struct nfs4_ol_stateid *open_stp, + struct nfsd4_lock *lock) +{ + struct nfs4_lockowner *lo, *ret; lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; - /* It is the openowner seqid that will be incremented in encode in the - * case of new lockowners; so increment the lock seqid manually: */ - lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; - hash_lockowner(lo, strhashval, clp, open_stp); + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; + lo->lo_owner.so_ops = &lockowner_ops; + spin_lock(&clp->cl_lock); + ret = find_lockowner_str_locked(&clp->cl_clientid, + &lock->lk_new_owner, clp); + if (ret == NULL) { + list_add(&lo->lo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); + ret = lo; + } else + nfs4_free_lockowner(&lo->lo_owner); + spin_unlock(&clp->cl_lock); return lo; } -static struct nfs4_ol_stateid * -alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) +static void +init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, + struct nfs4_file *fp, struct inode *inode, + struct nfs4_ol_stateid *open_stp) { - struct nfs4_ol_stateid *stp; struct nfs4_client *clp = lo->lo_owner.so_client; - stp = nfs4_alloc_stateid(clp); - if (stp == NULL) - return NULL; + lockdep_assert_held(&clp->cl_lock); + + atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; - list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); stp->st_stateowner = &lo->lo_owner; + atomic_inc(&lo->lo_owner.so_count); get_nfs4_file(fp); - stp->st_file = fp; + stp->st_stid.sc_file = fp; + stp->st_stid.sc_free = nfs4_free_lock_stateid; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - return stp; + list_add(&stp->st_locks, &open_stp->st_locks); + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + spin_lock(&fp->fi_lock); + list_add(&stp->st_perfile, &fp->fi_stateids); + spin_unlock(&fp->fi_lock); +} + +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_file == fp) { + atomic_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * +find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, + struct inode *inode, struct nfs4_ol_stateid *ost, + bool *new) +{ + struct nfs4_stid *ns = NULL; + struct nfs4_ol_stateid *lst; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *clp = oo->oo_owner.so_client; + + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, fi); + if (lst == NULL) { + spin_unlock(&clp->cl_lock); + ns = nfs4_alloc_stid(clp, stateid_slab); + if (ns == NULL) + return NULL; + + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, fi); + if (likely(!lst)) { + lst = openlockstateid(ns); + init_lock_stateid(lst, lo, fi, inode, ost); + ns = NULL; + *new = true; + } + } + spin_unlock(&clp->cl_lock); + if (ns) + nfs4_put_stid(ns); + return lst; } static int @@ -4350,46 +5071,53 @@ check_lock_length(u64 offset, u64 length) static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { - struct nfs4_file *fp = lock_stp->st_file; - int oflag = nfs4_access_to_omode(access); + struct nfs4_file *fp = lock_stp->st_stid.sc_file; + + lockdep_assert_held(&fp->fi_lock); if (test_access(access, lock_stp)) return; - nfs4_file_get_access(fp, oflag); + __nfs4_file_get_access(fp, access); set_access(access, lock_stp); } -static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) +static __be32 +lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, + struct nfs4_ol_stateid *ost, + struct nfsd4_lock *lock, + struct nfs4_ol_stateid **lst, bool *new) { - struct nfs4_file *fi = ost->st_file; + __be32 status; + struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; + struct inode *inode = cstate->current_fh.fh_dentry->d_inode; struct nfs4_lockowner *lo; unsigned int strhashval; - struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id); - - lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, - &lock->v.new.owner, nn); - if (lo) { - if (!cstate->minorversion) - return nfserr_bad_seqid; - /* XXX: a lockowner always has exactly one stateid: */ - *lst = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - return nfs_ok; + + lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl); + if (!lo) { + strhashval = ownerstr_hashval(&lock->v.new.owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + } else { + /* with an existing lockowner, seqids must be the same */ + status = nfserr_bad_seqid; + if (!cstate->minorversion && + lock->lk_new_lock_seqid != lo->lo_owner.so_seqid) + goto out; } - strhashval = ownerstr_hashval(cl->cl_clientid.cl_id, - &lock->v.new.owner); - lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); - if (lo == NULL) - return nfserr_jukebox; - *lst = alloc_init_lock_stateid(lo, fi, ost); + + *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (*lst == NULL) { - release_lockowner(lo); - return nfserr_jukebox; + status = nfserr_jukebox; + goto out; } - *new = true; - return nfs_ok; + status = nfs_ok; +out: + nfs4_put_stateowner(&lo->lo_owner); + return status; } /* @@ -4401,14 +5129,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_openowner *open_sop = NULL; struct nfs4_lockowner *lock_sop = NULL; - struct nfs4_ol_stateid *lock_stp; + struct nfs4_ol_stateid *lock_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; + struct nfs4_file *fp; struct file *filp = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; __be32 status = 0; - bool new_state = false; int lkflg; int err; + bool new = false; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -4425,11 +5155,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - nfs4_lock_state(); - if (lock->lk_is_new) { - struct nfs4_ol_stateid *open_stp = NULL; - if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->v.new.clientid, @@ -4453,12 +5179,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock->v.new.clientid)) goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, - &lock_stp, &new_state); - } else + &lock_stp, &new); + } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, NFS4_LOCK_STID, &lock_stp, nn); + } if (status) goto out; lock_sop = lockowner(lock_stp->st_stateowner); @@ -4482,20 +5209,25 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + fp = lock_stp->st_stid.sc_file; locks_init_lock(file_lock); switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: - filp = find_readable_file(lock_stp->st_file); + spin_lock(&fp->fi_lock); + filp = find_readable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); + spin_unlock(&fp->fi_lock); file_lock->fl_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - filp = find_writeable_file(lock_stp->st_file); + spin_lock(&fp->fi_lock); + filp = find_writeable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); + spin_unlock(&fp->fi_lock); file_lock->fl_type = F_WRLCK; break; default: @@ -4544,11 +5276,27 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: - if (status && new_state) - release_lockowner(lock_sop); + if (filp) + fput(filp); + if (lock_stp) { + /* Bump seqid manually if the 4.0 replay owner is openowner */ + if (cstate->replay_owner && + cstate->replay_owner != &lock_sop->lo_owner && + seqid_mutating_err(ntohl(status))) + lock_sop->lo_owner.so_seqid++; + + /* + * If this is a new, never-before-used stateid, and we are + * returning an error, then just go ahead and release it. + */ + if (status && new) + release_lock_stateid(lock_stp); + + nfs4_put_stid(&lock_stp->st_stid); + } + if (open_stp) + nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); if (file_lock) locks_free_lock(file_lock); if (conflock) @@ -4580,9 +5328,8 @@ __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lockt *lockt) { - struct inode *inode; struct file_lock *file_lock = NULL; - struct nfs4_lockowner *lo; + struct nfs4_lockowner *lo = NULL; __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -4592,10 +5339,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - nfs4_lock_state(); - if (!nfsd4_has_session(cstate)) { - status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL); + status = lookup_clientid(&lockt->lt_clientid, cstate, nn); if (status) goto out; } @@ -4603,7 +5348,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - inode = cstate->current_fh.fh_dentry->d_inode; file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); @@ -4626,7 +5370,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn); + lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner, + cstate->clp); if (lo) file_lock->fl_owner = (fl_owner_t)lo; file_lock->fl_pid = current->tgid; @@ -4646,7 +5391,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } out: - nfs4_unlock_state(); + if (lo) + nfs4_put_stateowner(&lo->lo_owner); if (file_lock) locks_free_lock(file_lock); return status; @@ -4670,23 +5416,21 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(locku->lu_offset, locku->lu_length)) return nfserr_inval; - nfs4_lock_state(); - status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, NFS4_LOCK_STID, &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_file); + filp = find_any_file(stp->st_stid.sc_file); if (!filp) { status = nfserr_lock_range; - goto out; + goto put_stateid; } file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto out; + goto fput; } locks_init_lock(file_lock); file_lock->fl_type = F_UNLCK; @@ -4708,41 +5452,51 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } update_stateid(&stp->st_stid.sc_stateid); memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - +fput: + fput(filp); +put_stateid: + nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); - if (!cstate->replay_owner) - nfs4_unlock_state(); if (file_lock) locks_free_lock(file_lock); return status; out_nfserr: status = nfserrno(err); - goto out; + goto fput; } /* * returns - * 1: locks held by lockowner - * 0: no locks held by lockowner + * true: locks held by lockowner + * false: no locks held by lockowner */ -static int -check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) +static bool +check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; - struct inode *inode = filp->fi_inode; - int status = 0; + int status = false; + struct file *filp = find_any_file(fp); + struct inode *inode; + + if (!filp) { + /* Any valid lock stateid should have some sort of access */ + WARN_ON_ONCE(1); + return status; + } + + inode = file_inode(filp); spin_lock(&inode->i_lock); for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { if ((*flpp)->fl_owner == (fl_owner_t)lowner) { - status = 1; - goto out; + status = true; + break; } } -out: spin_unlock(&inode->i_lock); + fput(filp); return status; } @@ -4753,53 +5507,46 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; - struct nfs4_lockowner *lo; + struct nfs4_lockowner *lo = NULL; struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; - struct list_head matches; - unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); + unsigned int hashval = ownerstr_hashval(owner); __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_client *clp; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - nfs4_lock_state(); - - status = lookup_clientid(clid, cstate->minorversion, nn, NULL); + status = lookup_clientid(clid, cstate, nn); if (status) - goto out; + return status; - status = nfserr_locks_held; - INIT_LIST_HEAD(&matches); + clp = cstate->clp; + /* Find the matching lock stateowner */ + spin_lock(&clp->cl_lock); + list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { - list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) { - if (sop->so_is_open_owner) + if (sop->so_is_open_owner || !same_owner_str(sop, owner)) continue; - if (!same_owner_str(sop, owner, clid)) - continue; - list_for_each_entry(stp, &sop->so_stateids, - st_perstateowner) { - lo = lockowner(sop); - if (check_for_locks(stp->st_file, lo)) - goto out; - list_add(&lo->lo_list, &matches); + + /* see if there are still any locks associated with it */ + lo = lockowner(sop); + list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + status = nfserr_locks_held; + spin_unlock(&clp->cl_lock); + return status; + } } + + atomic_inc(&sop->so_count); + break; } - /* Clients probably won't expect us to return with some (but not all) - * of the lockowner state released; so don't release any until all - * have been checked. */ - status = nfs_ok; - while (!list_empty(&matches)) { - lo = list_entry(matches.next, struct nfs4_lockowner, - lo_list); - /* unhash_stateowner deletes so_perclient only - * for openowners. */ - list_del(&lo->lo_list); + spin_unlock(&clp->cl_lock); + if (lo) release_lockowner(lo); - } -out: - nfs4_unlock_state(); return status; } @@ -4887,34 +5634,123 @@ nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn) * Called from OPEN. Look for clientid in reclaim list. */ __be32 -nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) +nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) { - struct nfs4_client *clp; + __be32 status; /* find clientid in conf_id_hashtbl */ - clp = find_confirmed_client(clid, sessions, nn); - if (clp == NULL) + status = lookup_clientid(clid, cstate, nn); + if (status) return nfserr_reclaim_bad; - return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok; + if (nfsd4_client_record_check(cstate->clp)) + return nfserr_reclaim_bad; + + return nfs_ok; } #ifdef CONFIG_NFSD_FAULT_INJECTION +static inline void +put_client(struct nfs4_client *clp) +{ + atomic_dec(&clp->cl_refcount); +} -u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) +static struct nfs4_client * +nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) { - if (mark_client_expired(clp)) - return 0; - expire_client(clp); - return 1; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return NULL; + + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + if (memcmp(&clp->cl_addr, addr, addr_size) == 0) + return clp; + } + return NULL; } -u64 nfsd_print_client(struct nfs4_client *clp, u64 num) +u64 +nfsd_inject_print_clients(void) { + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); char buf[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - printk(KERN_INFO "NFS Client: %s\n", buf); - return 1; + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); + pr_info("NFS Client: %s\n", buf); + ++count; + } + spin_unlock(&nn->client_lock); + + return count; +} + +u64 +nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) { + if (mark_client_expired_locked(clp) == nfs_ok) + ++count; + else + clp = NULL; + } + spin_unlock(&nn->client_lock); + + if (clp) + expire_client(clp); + + return count; +} + +u64 +nfsd_inject_forget_clients(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { + if (mark_client_expired_locked(clp) == nfs_ok) { + list_add(&clp->cl_lru, &reaplist); + if (max != 0 && ++count >= max) + break; + } + } + spin_unlock(&nn->client_lock); + + list_for_each_entry_safe(clp, next, &reaplist, cl_lru) + expire_client(clp); + + return count; } static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, @@ -4925,158 +5761,484 @@ static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); } -static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *)) +static void +nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, + struct list_head *collect) +{ + struct nfs4_client *clp = lst->st_stid.sc_client; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!collect) + return; + + lockdep_assert_held(&nn->client_lock); + atomic_inc(&clp->cl_refcount); + list_add(&lst->st_locks, collect); +} + +static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, + struct list_head *collect, + void (*func)(struct nfs4_ol_stateid *)) { struct nfs4_openowner *oop; - struct nfs4_lockowner *lop, *lo_next; struct nfs4_ol_stateid *stp, *st_next; + struct nfs4_ol_stateid *lst, *lst_next; u64 count = 0; + spin_lock(&clp->cl_lock); list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { - list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) { - list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) { - if (func) - func(lop); - if (++count == max) - return count; + list_for_each_entry_safe(stp, st_next, + &oop->oo_owner.so_stateids, st_perstateowner) { + list_for_each_entry_safe(lst, lst_next, + &stp->st_locks, st_locks) { + if (func) { + func(lst); + nfsd_inject_add_lock_to_list(lst, + collect); + } + ++count; + /* + * Despite the fact that these functions deal + * with 64-bit integers for "count", we must + * ensure that it doesn't blow up the + * clp->cl_refcount. Throw a warning if we + * start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + goto out; } } } +out: + spin_unlock(&clp->cl_lock); return count; } -u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) +static u64 +nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect, + u64 max) { - return nfsd_foreach_client_lock(clp, max, release_lockowner); + return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid); } -u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max) +static u64 +nfsd_print_client_locks(struct nfs4_client *clp) { - u64 count = nfsd_foreach_client_lock(clp, max, NULL); + u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL); nfsd_print_count(clp, count, "locked files"); return count; } -static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *)) +u64 +nfsd_inject_print_locks(void) +{ + struct nfs4_client *clp; + u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_locks(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_reap_locks(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_ol_stateid *stp, *next; + + list_for_each_entry_safe(stp, next, reaplist, st_locks) { + list_del_init(&stp->st_locks); + clp = stp->st_stid.sc_client; + nfs4_put_stid(&stp->st_stid); + put_client(clp); + } +} + +u64 +nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size) +{ + unsigned int count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_collect_client_locks(clp, &reaplist, 0); + spin_unlock(&nn->client_lock); + nfsd_reap_locks(&reaplist); + return count; +} + +u64 +nfsd_inject_forget_locks(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_collect_client_locks(clp, &reaplist, max - count); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_reap_locks(&reaplist); + return count; +} + +static u64 +nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max, + struct list_head *collect, + void (*func)(struct nfs4_openowner *)) { struct nfs4_openowner *oop, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); u64 count = 0; + lockdep_assert_held(&nn->client_lock); + + spin_lock(&clp->cl_lock); list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { - if (func) + if (func) { func(oop); - if (++count == max) + if (collect) { + atomic_inc(&clp->cl_refcount); + list_add(&oop->oo_perclient, collect); + } + } + ++count; + /* + * Despite the fact that these functions deal with + * 64-bit integers for "count", we must ensure that + * it doesn't blow up the clp->cl_refcount. Throw a + * warning if we start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) break; } + spin_unlock(&clp->cl_lock); return count; } -u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) +static u64 +nfsd_print_client_openowners(struct nfs4_client *clp) { - return nfsd_foreach_client_open(clp, max, release_openowner); + u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL); + + nfsd_print_count(clp, count, "openowners"); + return count; } -u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max) +static u64 +nfsd_collect_client_openowners(struct nfs4_client *clp, + struct list_head *collect, u64 max) { - u64 count = nfsd_foreach_client_open(clp, max, NULL); - nfsd_print_count(clp, count, "open files"); - return count; + return nfsd_foreach_client_openowner(clp, max, collect, + unhash_openowner_locked); } -static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, - struct list_head *victims) +u64 +nfsd_inject_print_openowners(void) { - struct nfs4_delegation *dp, *next; + struct nfs4_client *clp; u64 count = 0; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + + if (!nfsd_netns_ready(nn)) + return 0; + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_openowners(clp); + spin_unlock(&nn->client_lock); - lockdep_assert_held(&state_lock); - list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { - if (victims) - list_move(&dp->dl_recall_lru, victims); - if (++count == max) - break; - } return count; } -u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) +static void +nfsd_reap_openowners(struct list_head *reaplist) { - struct nfs4_delegation *dp, *next; - LIST_HEAD(victims); - u64 count; + struct nfs4_client *clp; + struct nfs4_openowner *oop, *next; - spin_lock(&state_lock); - count = nfsd_find_all_delegations(clp, max, &victims); - spin_unlock(&state_lock); + list_for_each_entry_safe(oop, next, reaplist, oo_perclient) { + list_del_init(&oop->oo_perclient); + clp = oop->oo_owner.so_client; + release_openowner(oop); + put_client(clp); + } +} - list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) - revoke_delegation(dp); +u64 +nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr, + size_t addr_size) +{ + unsigned int count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_collect_client_openowners(clp, &reaplist, 0); + spin_unlock(&nn->client_lock); + nfsd_reap_openowners(&reaplist); return count; } -u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) +u64 +nfsd_inject_forget_openowners(u64 max) { - struct nfs4_delegation *dp, *next; - LIST_HEAD(victims); - u64 count; + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); - spin_lock(&state_lock); - count = nfsd_find_all_delegations(clp, max, &victims); - list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) - nfsd_break_one_deleg(dp); - spin_unlock(&state_lock); + if (!nfsd_netns_ready(nn)) + return count; + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + count += nfsd_collect_client_openowners(clp, &reaplist, + max - count); + if (max != 0 && count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_reap_openowners(&reaplist); return count; } -u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) +static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, + struct list_head *victims) { + struct nfs4_delegation *dp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); u64 count = 0; + lockdep_assert_held(&nn->client_lock); + spin_lock(&state_lock); - count = nfsd_find_all_delegations(clp, max, NULL); + list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { + if (victims) { + /* + * It's not safe to mess with delegations that have a + * non-zero dl_time. They might have already been broken + * and could be processed by the laundromat outside of + * the state_lock. Just leave them be. + */ + if (dp->dl_time != 0) + continue; + + atomic_inc(&clp->cl_refcount); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, victims); + } + ++count; + /* + * Despite the fact that these functions deal with + * 64-bit integers for "count", we must ensure that + * it doesn't blow up the clp->cl_refcount. Throw a + * warning if we start to approach INT_MAX here. + */ + WARN_ON_ONCE(count == (INT_MAX / 2)); + if (count == max) + break; + } spin_unlock(&state_lock); + return count; +} + +static u64 +nfsd_print_client_delegations(struct nfs4_client *clp) +{ + u64 count = nfsd_find_all_delegations(clp, 0, NULL); nfsd_print_count(clp, count, "delegations"); return count; } -u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) +u64 +nfsd_inject_print_delegations(void) { - struct nfs4_client *clp, *next; + struct nfs4_client *clp; u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); if (!nfsd_netns_ready(nn)) return 0; - list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - count += func(clp, max - count); - if ((max != 0) && (count >= max)) - break; + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) + count += nfsd_print_client_delegations(clp); + spin_unlock(&nn->client_lock); + + return count; +} + +static void +nfsd_forget_delegations(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_delegation *dp, *next; + + list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { + list_del_init(&dp->dl_recall_lru); + clp = dp->dl_stid.sc_client; + revoke_delegation(dp); + put_client(clp); } +} +u64 +nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr, + size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_find_all_delegations(clp, 0, &reaplist); + spin_unlock(&nn->client_lock); + + nfsd_forget_delegations(&reaplist); return count; } -struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) +u64 +nfsd_inject_forget_delegations(u64 max) { + u64 count = 0; struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); if (!nfsd_netns_ready(nn)) - return NULL; + return count; + spin_lock(&nn->client_lock); list_for_each_entry(clp, &nn->client_lru, cl_lru) { - if (memcmp(&clp->cl_addr, addr, addr_size) == 0) - return clp; + count += nfsd_find_all_delegations(clp, max - count, &reaplist); + if (max != 0 && count >= max) + break; } - return NULL; + spin_unlock(&nn->client_lock); + nfsd_forget_delegations(&reaplist); + return count; } +static void +nfsd_recall_delegations(struct list_head *reaplist) +{ + struct nfs4_client *clp; + struct nfs4_delegation *dp, *next; + + list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { + list_del_init(&dp->dl_recall_lru); + clp = dp->dl_stid.sc_client; + /* + * We skipped all entries that had a zero dl_time before, + * so we can now reset the dl_time back to 0. If a delegation + * break comes in now, then it won't make any difference since + * we're recalling it either way. + */ + spin_lock(&state_lock); + dp->dl_time = 0; + spin_unlock(&state_lock); + nfsd_break_one_deleg(dp); + put_client(clp); + } +} + +u64 +nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr, + size_t addr_size) +{ + u64 count = 0; + struct nfs4_client *clp; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + clp = nfsd_find_client(addr, addr_size); + if (clp) + count = nfsd_find_all_delegations(clp, 0, &reaplist); + spin_unlock(&nn->client_lock); + + nfsd_recall_delegations(&reaplist); + return count; +} + +u64 +nfsd_inject_recall_delegations(u64 max) +{ + u64 count = 0; + struct nfs4_client *clp, *next; + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + LIST_HEAD(reaplist); + + if (!nfsd_netns_ready(nn)) + return count; + + spin_lock(&nn->client_lock); + list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { + count += nfsd_find_all_delegations(clp, max - count, &reaplist); + if (max != 0 && ++count >= max) + break; + } + spin_unlock(&nn->client_lock); + nfsd_recall_delegations(&reaplist); + return count; +} #endif /* CONFIG_NFSD_FAULT_INJECTION */ /* @@ -5113,14 +6275,6 @@ static int nfs4_state_create_net(struct net *net) CLIENT_HASH_SIZE, GFP_KERNEL); if (!nn->unconf_id_hashtbl) goto err_unconf_id; - nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * - OWNER_HASH_SIZE, GFP_KERNEL); - if (!nn->ownerstr_hashtbl) - goto err_ownerstr; - nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) * - LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL); - if (!nn->lockowner_ino_hashtbl) - goto err_lockowner_ino; nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * SESSION_HASH_SIZE, GFP_KERNEL); if (!nn->sessionid_hashtbl) @@ -5130,10 +6284,6 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) - INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]); - for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) - INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]); for (i = 0; i < SESSION_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; @@ -5149,10 +6299,6 @@ static int nfs4_state_create_net(struct net *net) return 0; err_sessionid: - kfree(nn->lockowner_ino_hashtbl); -err_lockowner_ino: - kfree(nn->ownerstr_hashtbl); -err_ownerstr: kfree(nn->unconf_id_hashtbl); err_unconf_id: kfree(nn->conf_id_hashtbl); @@ -5182,8 +6328,6 @@ nfs4_state_destroy_net(struct net *net) } kfree(nn->sessionid_hashtbl); - kfree(nn->lockowner_ino_hashtbl); - kfree(nn->ownerstr_hashtbl); kfree(nn->unconf_id_hashtbl); kfree(nn->conf_id_hashtbl); put_net(net); @@ -5247,22 +6391,22 @@ nfs4_state_shutdown_net(struct net *net) cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); - nfs4_lock_state(); INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_move(&dp->dl_recall_lru, &reaplist); + unhash_delegation_locked(dp); + list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - destroy_delegation(dp); + list_del_init(&dp->dl_recall_lru); + nfs4_put_stid(&dp->dl_stid); } nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - nfs4_unlock_state(); } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 83baf2bfe9e9..f9821ce6658a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -181,28 +181,43 @@ static int zero_clientid(clientid_t *clid) } /** - * defer_free - mark an allocation as deferred freed - * @argp: NFSv4 compound argument structure to be freed with - * @release: release callback to free @p, typically kfree() - * @p: pointer to be freed + * svcxdr_tmpalloc - allocate memory to be freed after compound processing + * @argp: NFSv4 compound argument structure + * @p: pointer to be freed (with kfree()) * * Marks @p to be freed when processing the compound operation * described in @argp finishes. */ -static int -defer_free(struct nfsd4_compoundargs *argp, - void (*release)(const void *), void *p) +static void * +svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) { - struct tmpbuf *tb; + struct svcxdr_tmpbuf *tb; - tb = kmalloc(sizeof(*tb), GFP_KERNEL); + tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL); if (!tb) - return -ENOMEM; - tb->buf = p; - tb->release = release; + return NULL; tb->next = argp->to_free; argp->to_free = tb; - return 0; + return tb->buf; +} + +/* + * For xdr strings that need to be passed to other kernel api's + * as null-terminated strings. + * + * Note null-terminating in place usually isn't safe since the + * buffer might end on a page boundary. + */ +static char * +svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) +{ + char *p = svcxdr_tmpalloc(argp, len + 1); + + if (!p) + return NULL; + memcpy(p, buf, len); + p[len] = '\0'; + return p; } /** @@ -217,19 +232,13 @@ defer_free(struct nfsd4_compoundargs *argp, */ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { - if (p == argp->tmp) { - p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); - if (!p) - return NULL; - } else { - BUG_ON(p != argp->tmpp); - argp->tmpp = NULL; - } - if (defer_free(argp, kfree, p)) { - kfree(p); + void *ret; + + ret = svcxdr_tmpalloc(argp, nbytes); + if (!ret) return NULL; - } else - return (char *)p; + memcpy(ret, p, nbytes); + return ret; } static __be32 @@ -292,12 +301,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (nace > NFS4_ACL_MAX) return nfserr_fbig; - *acl = nfs4_acl_new(nace); + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); if (*acl == NULL) return nfserr_jukebox; - defer_free(argp, kfree, *acl); - (*acl)->naces = nace; for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; @@ -418,12 +425,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_badlabel; len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - label->data = kzalloc(dummy32 + 1, GFP_KERNEL); + label->len = dummy32; + label->data = svcxdr_dupstr(argp, buf, dummy32); if (!label->data) return nfserr_jukebox; - label->len = dummy32; - defer_free(argp, kfree, label->data); - memcpy(label->data, buf, dummy32); } #endif @@ -598,9 +603,11 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create switch (create->cr_type) { case NF4LNK: READ_BUF(4); - create->cr_linklen = be32_to_cpup(p++); - READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + create->cr_datalen = be32_to_cpup(p++); + READ_BUF(create->cr_datalen); + create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); + if (!create->cr_data) + return nfserr_jukebox; break; case NF4BLK: case NF4CHR: @@ -1470,13 +1477,12 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta INIT_LIST_HEAD(&test_stateid->ts_stateid_list); for (i = 0; i < test_stateid->ts_num_ids; i++) { - stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL); + stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); if (!stateid) { status = nfserrno(-ENOMEM); goto out; } - defer_free(argp, kfree, stateid); INIT_LIST_HEAD(&stateid->ts_id_list); list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); @@ -1629,7 +1635,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) goto xdr_error; if (argp->opcnt > ARRAY_SIZE(argp->iops)) { - argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); if (!argp->ops) { argp->ops = argp->iops; dprintk("nfsd: couldn't allocate room for COMPOUND\n"); @@ -2630,7 +2636,7 @@ nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { __be32 *p; - p = xdr_reserve_space(xdr, 6); + p = xdr_reserve_space(xdr, 20); if (!p) return NULL; *p++ = htonl(2); @@ -2868,6 +2874,7 @@ again: * return the conflicting open: */ if (conf->len) { + kfree(conf->data); conf->len = 0; conf->data = NULL; goto again; @@ -2880,6 +2887,7 @@ again: if (conf->len) { p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); p = xdr_encode_opaque(p, conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ p = xdr_encode_hyper(p, (u64)0); /* clientid */ *p++ = cpu_to_be32(0); /* length of owner name */ @@ -2896,7 +2904,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); - kfree(lock->lk_denied.ld_owner.data); + return nfserr; } @@ -3064,11 +3072,8 @@ static __be32 nfsd4_encode_splice_read( __be32 nfserr; __be32 *p = xdr->p - 2; - /* - * Don't inline pages unless we know there's room for eof, - * count, and possible padding: - */ - if (xdr->end - xdr->p < 3) + /* Make sure there will be room for padding if needed */ + if (xdr->end - xdr->p < 1) return nfserr_resource; nfserr = nfsd_splice_read(read->rd_rqstp, file, @@ -3134,9 +3139,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; v = 0; - thislen = (void *)xdr->end - (void *)xdr->p; - if (len < thislen) - thislen = len; + thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); p = xdr_reserve_space(xdr, (thislen+3)&~3); WARN_ON_ONCE(!p); resp->rqstp->rq_vec[v].iov_base = p; @@ -3203,10 +3206,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, xdr_commit_encode(xdr); maxcount = svc_max_payload(resp->rqstp); - if (maxcount > xdr->buf->buflen - xdr->buf->len) - maxcount = xdr->buf->buflen - xdr->buf->len; - if (maxcount > read->rd_length) - maxcount = read->rd_length; + maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, read->rd_length); if (!read->rd_filp) { err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, @@ -3267,7 +3268,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd wire_count = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); - xdr_truncate_encode(xdr, length_offset + 4 + maxcount); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); if (maxcount & 3) write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, 4 - (maxcount&3)); @@ -3924,8 +3925,6 @@ status: * * XDR note: do not encode rp->rp_buflen: the buffer contains the * previously sent already encoded operation. - * - * called with nfs4_lock_state() held */ void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) @@ -3964,9 +3963,8 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) kfree(args->tmpp); args->tmpp = NULL; while (args->to_free) { - struct tmpbuf *tb = args->to_free; + struct svcxdr_tmpbuf *tb = args->to_free; args->to_free = tb->next; - tb->release(tb->buf); kfree(tb); } return 1; @@ -3999,7 +3997,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo /* * All that remains is to write the tag and operation count... */ - struct nfsd4_compound_state *cs = &resp->cstate; struct xdr_buf *buf = resp->xdr.buf; WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + @@ -4013,19 +4010,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo p += XDR_QUADLEN(resp->taglen); *p++ = htonl(resp->opcnt); - if (nfsd4_has_session(cs)) { - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - struct nfs4_client *clp = cs->session->se_client; - if (cs->status != nfserr_replay_cache) { - nfsd4_store_cache_entry(resp); - cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; - } - /* Renew the clientid on success and on replay */ - spin_lock(&nn->client_lock); - nfsd4_put_session(cs->session); - spin_unlock(&nn->client_lock); - put_client_renew(clp); - } + nfsd4_sequence_done(resp); return 1; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6040da8830ff..ff9567633245 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -221,7 +221,12 @@ static void hash_refile(struct svc_cacherep *rp) { hlist_del_init(&rp->c_hash); - hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); + /* + * No point in byte swapping c_xid since we're just using it to pick + * a hash bucket. + */ + hlist_add_head(&rp->c_hash, cache_hash + + hash_32((__force u32)rp->c_xid, maskbits)); } /* @@ -356,7 +361,11 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) struct hlist_head *rh; unsigned int entries = 0; - rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)]; + /* + * No point in byte swapping rq_xid since we're just using it to pick + * a hash bucket. + */ + rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)]; hlist_for_each_entry(rp, rh, c_hash) { ++entries; if (nfsd_cache_match(rqstp, csum, rp)) { diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 51844048937f..4e042105fb6e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -39,6 +39,7 @@ enum { NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, + NFSD_MaxConnections, NFSD_SupportedEnctypes, /* * The below MUST come last. Otherwise we leave a hole in nfsd_files[] @@ -62,6 +63,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); +static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); @@ -77,6 +79,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Versions] = write_versions, [NFSD_Ports] = write_ports, [NFSD_MaxBlkSize] = write_maxblksize, + [NFSD_MaxConnections] = write_maxconn, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, @@ -369,8 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (maxsize < NFS_FHSIZE) return -EINVAL; - if (maxsize > NFS3_FHSIZE) - maxsize = NFS3_FHSIZE; + maxsize = min(maxsize, NFS3_FHSIZE); if (qword_get(&mesg, mesg, size)>0) return -EINVAL; @@ -871,10 +873,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) /* force bsize into allowed range and * required alignment. */ - if (bsize < 1024) - bsize = 1024; - if (bsize > NFSSVC_MAXBLKSIZE) - bsize = NFSSVC_MAXBLKSIZE; + bsize = max_t(int, bsize, 1024); + bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE); bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) { @@ -889,6 +889,44 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) nfsd_max_blksize); } +/** + * write_maxconn - Set or report the current max number of connections + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of max_connections setting + * for this net namespace; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxconn(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int maxconn = nn->max_connections; + + if (size > 0) { + int rv = get_uint(&mesg, &maxconn); + + if (rv) + return rv; + nn->max_connections = maxconn; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); +} + #ifdef CONFIG_NFSD_V4 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time, struct nfsd_net *nn) @@ -1064,6 +1102,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ec8393418154..e883a5868be6 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -162,7 +162,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) /* deprecated, convert to type 3 */ len = key_len(FSID_ENCODE_DEV)/4; fh->fh_fsid_type = FSID_ENCODE_DEV; - fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); + /* + * struct knfsd_fh uses host-endian fields, which are + * sometimes used to hold net-endian values. This + * confuses sparse, so we must use __force here to + * keep it from complaining. + */ + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]), + ntohl((__force __be32)fh->fh_fsid[1]))); fh->fh_fsid[1] = fh->fh_fsid[2]; } data_left -= len; @@ -539,8 +546,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, dentry); fhp->fh_dentry = dget(dentry); /* our internal copy */ - fhp->fh_export = exp; - cache_get(&exp->h); + fhp->fh_export = exp_get(exp); if (fhp->fh_handle.fh_version == 0xca) { /* old style filehandle please */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 2e89e70ac15c..08236d70c667 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -73,8 +73,15 @@ enum fsid_source { extern enum fsid_source fsid_source(struct svc_fh *fhp); -/* This might look a little large to "inline" but in all calls except +/* + * This might look a little large to "inline" but in all calls except * one, 'vers' is constant so moste of the function disappears. + * + * In some cases the values are considered to be host endian and in + * others, net endian. fsidv is always considered to be u32 as the + * callers don't know which it will be. So we must use __force to keep + * sparse from complaining. Since these values are opaque to the + * client, that shouldn't be a problem. */ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, u32 fsid, unsigned char *uuid) @@ -82,7 +89,7 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, u32 *up; switch(vers) { case FSID_DEV: - fsidv[0] = htonl((MAJOR(dev)<<16) | + fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) | MINOR(dev)); fsidv[1] = ino_t_to_u32(ino); break; @@ -90,8 +97,8 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, fsidv[0] = fsid; break; case FSID_MAJOR_MINOR: - fsidv[0] = htonl(MAJOR(dev)); - fsidv[1] = htonl(MINOR(dev)); + fsidv[0] = (__force __u32)htonl(MAJOR(dev)); + fsidv[1] = (__force __u32)htonl(MINOR(dev)); fsidv[2] = ino_t_to_u32(ino); break; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 54c6b3d3cc79..b8680738f588 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -403,12 +403,13 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp, fh_init(&newfh, NFS_FHSIZE); /* - * Create the link, look up new file and set attrs. + * Crazy hack: the request fits in a page, and already-decoded + * attributes follow argp->tname, so it's safe to just write a + * null to ensure it's null-terminated: */ + argp->tname[argp->tlen] = '\0'; nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, - argp->tname, argp->tlen, - &newfh, &argp->attrs); - + argp->tname, &newfh); fh_put(&argp->ffh); fh_put(&newfh); @@ -716,6 +717,7 @@ nfserrno (int errno) { nfserr_noent, -ENOENT }, { nfserr_io, -EIO }, { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, @@ -743,6 +745,7 @@ nfserrno (int errno) { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, }; int i; @@ -750,7 +753,7 @@ nfserrno (int errno) if (nfs_errtbl[i].syserr == errno) return nfs_errtbl[i].nfserr; } - printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno); + WARN(1, "nfsd: non-standard errno: %d\n", errno); return nfserr_io; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1879e43f2868..752d56bbe0ba 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -221,7 +221,8 @@ static int nfsd_startup_generic(int nrservs) */ ret = nfsd_racache_init(2*nrservs); if (ret) - return ret; + goto dec_users; + ret = nfs4_state_start(); if (ret) goto out_racache; @@ -229,6 +230,8 @@ static int nfsd_startup_generic(int nrservs) out_racache: nfsd_racache_shutdown(); +dec_users: + nfsd_users--; return ret; } @@ -405,6 +408,7 @@ int nfsd_create_serv(struct net *net) if (nn->nfsd_serv == NULL) return -ENOMEM; + nn->nfsd_serv->sv_maxconn = nn->max_connections; error = svc_bind(nn->nfsd_serv, net); if (error < 0) { svc_destroy(nn->nfsd_serv); @@ -469,8 +473,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* enforce a global maximum number of threads */ tot = 0; for (i = 0; i < n; i++) { - if (nthreads[i] > NFSD_MAXSERVS) - nthreads[i] = NFSD_MAXSERVS; + nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); tot += nthreads[i]; } if (tot > NFSD_MAXSERVS) { @@ -519,11 +522,11 @@ nfsd_svc(int nrservs, struct net *net) mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); - if (nrservs <= 0) - nrservs = 0; - if (nrservs > NFSD_MAXSERVS) - nrservs = NFSD_MAXSERVS; + + nrservs = max(nrservs, 0); + nrservs = min(nrservs, NFSD_MAXSERVS); error = 0; + if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; @@ -564,6 +567,7 @@ nfsd(void *vrqstp) struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int err; /* Lock module and set up kernel thread */ @@ -597,6 +601,9 @@ nfsd(void *vrqstp) * The main request loop */ for (;;) { + /* Update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nn->max_connections; + /* * Find a socket with data available and call its * recvfrom routine. diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 1ac306b769df..412d7061f9e5 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -257,8 +257,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ - if (len > NFSSVC_MAXBLKSIZE_V2) - len = NFSSVC_MAXBLKSIZE_V2; + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); /* set up somewhere to store response. * We take pages, put them on reslist and include in iovec @@ -268,7 +267,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, struct page *p = *(rqstp->rq_next_page++); rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); len -= rqstp->rq_vec[v].iov_len; v++; } @@ -400,9 +399,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, return 0; args->cookie = ntohl(*p++); args->count = ntohl(*p++); - if (args->count > PAGE_SIZE) - args->count = PAGE_SIZE; - + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); @@ -516,10 +513,11 @@ nfssvc_encode_entry(void *ccdv, const char *name, } if (cd->offset) *cd->offset = htonl(offset); - if (namlen > NFS2_MAXNAMLEN) - namlen = NFS2_MAXNAMLEN;/* truncate filename */ + /* truncate filename */ + namlen = min(namlen, NFS2_MAXNAMLEN); slen = XDR_QUADLEN(namlen); + if ((buflen = cd->buflen - slen - 4) < 0) { cd->common.err = nfserr_toosmall; return -EINVAL; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 374c66283ac5..4a89e00d7461 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -72,7 +72,13 @@ struct nfsd4_callback { bool cb_done; }; +/* + * A core object that represents a "common" stateid. These are generally + * embedded within the different (more specific) stateid objects and contain + * fields that are of general use to any stateid. + */ struct nfs4_stid { + atomic_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -80,22 +86,43 @@ struct nfs4_stid { #define NFS4_CLOSED_STID 8 /* For a deleg stateid kept around only to process free_stateid's: */ #define NFS4_REVOKED_DELEG_STID 16 +#define NFS4_CLOSED_DELEG_STID 32 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; + struct nfs4_file *sc_file; + void (*sc_free)(struct nfs4_stid *); }; +/* + * Represents a delegation stateid. The nfs4_client holds references to these + * and they are put when it is being destroyed or when the delegation is + * returned by the client: + * + * o 1 reference as long as a delegation is still in force (taken when it's + * alloc'd, put when it's returned or revoked) + * + * o 1 reference as long as a recall rpc is in progress (taken when the lease + * is broken, put when the rpc exits) + * + * o 1 more ephemeral reference for each nfsd thread currently doing something + * with that delegation without holding the cl_lock + * + * If the server attempts to recall a delegation and the client doesn't do so + * before a timeout, the server may also revoke the delegation. In that case, + * the object will either be destroyed (v4.0) or moved to a per-client list of + * revoked delegations (v4.1+). + * + * This object is a superset of the nfs4_stid. + */ struct nfs4_delegation { struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ - atomic_t dl_count; /* ref count */ - struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; /* For recall: */ - struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; }; @@ -194,6 +221,11 @@ struct nfsd4_conn { unsigned char cn_flags; }; +/* + * Representation of a v4.1+ session. These are refcounted in a similar fashion + * to the nfs4_client. References are only taken when the server is actively + * working on the object (primarily during the processing of compounds). + */ struct nfsd4_session { atomic_t se_ref; struct list_head se_hash; /* hash by sessionid */ @@ -212,8 +244,6 @@ struct nfsd4_session { struct nfsd4_slot *se_slots[]; /* forward channel slots */ }; -extern void nfsd4_put_session(struct nfsd4_session *ses); - /* formatted contents of nfs4_sessionid */ struct nfsd4_sessionid { clientid_t clientid; @@ -225,17 +255,35 @@ struct nfsd4_sessionid { /* * struct nfs4_client - one per client. Clientids live here. - * o Each nfs4_client is hashed by clientid. * - * o Each nfs4_clients is also hashed by name - * (the opaque quantity initially sent by the client to identify itself). + * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0) + * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped. + * Each nfsd_net_ns object contains a set of these and they are tracked via + * short and long form clientid. They are hashed and searched for under the + * per-nfsd_net client_lock spinlock. + * + * References to it are only held during the processing of compounds, and in + * certain other operations. In their "resting state" they have a refcount of + * 0. If they are not renewed within a lease period, they become eligible for + * destruction by the laundromat. + * + * These objects can also be destroyed prematurely by the fault injection code, + * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * Care is taken *not* to do this however when the objects have an elevated + * refcount. + * + * o Each nfs4_client is hashed by clientid + * + * o Each nfs4_clients is also hashed by name (the opaque quantity initially + * sent by the client to identify itself). * - * o cl_perclient list is used to ensure no dangling stateowner references - * when we expire the nfs4_client + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client */ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct rb_node cl_namenode; /* link into by-name trees */ + struct list_head *cl_ownerstr_hashtbl; struct list_head cl_openowners; struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; @@ -329,21 +377,43 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; struct knfsd_fh rp_openfh; + struct mutex rp_mutex; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; +struct nfs4_stateowner; + +struct nfs4_stateowner_operations { + void (*so_unhash)(struct nfs4_stateowner *); + void (*so_free)(struct nfs4_stateowner *); +}; + +/* + * A core object that represents either an open or lock owner. The object and + * lock owner objects have one of these embedded within them. Refcounts and + * other fields common to both owner types are contained within these + * structures. + */ struct nfs4_stateowner { - struct list_head so_strhash; /* hash by op_name */ - struct list_head so_stateids; - struct nfs4_client * so_client; - /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + struct list_head so_strhash; + struct list_head so_stateids; + struct nfs4_client *so_client; + const struct nfs4_stateowner_operations *so_ops; + /* after increment in nfsd4_bump_seqid, represents the next * sequence id expected from the client: */ - u32 so_seqid; - struct xdr_netobj so_owner; /* open owner name */ - struct nfs4_replay so_replay; - bool so_is_open_owner; + atomic_t so_count; + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + struct nfs4_replay so_replay; + bool so_is_open_owner; }; +/* + * When a file is opened, the client provides an open state owner opaque string + * that indicates the "owner" of that open. These objects are refcounted. + * References to it are held by each open state associated with it. This object + * is a superset of the nfs4_stateowner struct. + */ struct nfs4_openowner { struct nfs4_stateowner oo_owner; /* must be first field */ struct list_head oo_perclient; @@ -358,15 +428,17 @@ struct nfs4_openowner { struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 -#define NFS4_OO_NEW 4 unsigned char oo_flags; }; +/* + * Represents a generic "lockowner". Similar to an openowner. References to it + * are held by the lock stateids that are created on its behalf. This object is + * a superset of the nfs4_stateowner struct (or would be if it needed any extra + * fields). + */ struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ - struct list_head lo_owner_ino_hash; /* hash by owner,file */ - struct list_head lo_perstateid; - struct list_head lo_list; /* for temporary uses */ }; static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) @@ -379,9 +451,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) return container_of(so, struct nfs4_lockowner, lo_owner); } -/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */ +/* + * nfs4_file: a file opened by some number of (open) nfs4_stateowners. + * + * These objects are global. nfsd only keeps one instance of a nfs4_file per + * inode (though it may keep multiple file descriptors open per inode). These + * are tracked in the file_hashtbl which is protected by the state_lock + * spinlock. + */ struct nfs4_file { atomic_t fi_ref; + spinlock_t fi_lock; struct hlist_node fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; @@ -395,49 +475,36 @@ struct nfs4_file { * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; + u32 fi_share_deny; struct file *fi_deleg_file; struct file_lock *fi_lease; atomic_t fi_delegees; - struct inode *fi_inode; + struct knfsd_fh fi_fhandle; bool fi_had_conflict; }; -/* XXX: for first cut may fall back on returning file that doesn't work - * at all? */ -static inline struct file *find_writeable_file(struct nfs4_file *f) -{ - if (f->fi_fds[O_WRONLY]) - return f->fi_fds[O_WRONLY]; - return f->fi_fds[O_RDWR]; -} - -static inline struct file *find_readable_file(struct nfs4_file *f) -{ - if (f->fi_fds[O_RDONLY]) - return f->fi_fds[O_RDONLY]; - return f->fi_fds[O_RDWR]; -} - -static inline struct file *find_any_file(struct nfs4_file *f) -{ - if (f->fi_fds[O_RDWR]) - return f->fi_fds[O_RDWR]; - else if (f->fi_fds[O_WRONLY]) - return f->fi_fds[O_WRONLY]; - else - return f->fi_fds[O_RDONLY]; -} - -/* "ol" stands for "Open or Lock". Better suggestions welcome. */ +/* + * A generic struct representing either a open or lock stateid. The nfs4_client + * holds a reference to each of these objects, and they in turn hold a + * reference to their respective stateowners. The client's reference is + * released in response to a close or unlock (depending on whether it's an open + * or lock stateid) or when the client is being destroyed. + * + * In the case of v4.0 open stateids, these objects are preserved for a little + * while after close in order to handle CLOSE replays. Those are eventually + * reclaimed via a LRU scheme by the laundromat. + * + * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock". + * Better suggestions welcome. + */ struct nfs4_ol_stateid { struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; - struct list_head st_lockowners; + struct list_head st_locks; struct nfs4_stateowner * st_stateowner; - struct nfs4_file * st_file; - unsigned long st_access_bmap; - unsigned long st_deny_bmap; + unsigned char st_access_bmap; + unsigned char st_deny_bmap; struct nfs4_ol_stateid * st_openstp; }; @@ -456,15 +523,16 @@ struct nfsd_net; extern __be32 nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); -extern void nfs4_lock_state(void); -extern void nfs4_unlock_state(void); +void nfs4_put_stid(struct nfs4_stid *s); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); -extern void nfsd4_init_callback(struct nfsd4_callback *); +void nfsd4_run_cb_null(struct work_struct *w); +void nfsd4_run_cb_recall(struct work_struct *w); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); @@ -472,11 +540,10 @@ extern void nfsd4_cb_recall(struct nfs4_delegation *dp); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); -extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); -extern void put_client_renew(struct nfs4_client *clp); /* nfs4recover operations */ extern int nfsd4_client_tracking_init(struct net *net); @@ -490,19 +557,24 @@ extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); #ifdef CONFIG_NFSD_FAULT_INJECTION int nfsd_fault_inject_init(void); void nfsd_fault_inject_cleanup(void); -u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64)); -struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t); - -u64 nfsd_forget_client(struct nfs4_client *, u64); -u64 nfsd_forget_client_locks(struct nfs4_client*, u64); -u64 nfsd_forget_client_openowners(struct nfs4_client *, u64); -u64 nfsd_forget_client_delegations(struct nfs4_client *, u64); -u64 nfsd_recall_client_delegations(struct nfs4_client *, u64); - -u64 nfsd_print_client(struct nfs4_client *, u64); -u64 nfsd_print_client_locks(struct nfs4_client *, u64); -u64 nfsd_print_client_openowners(struct nfs4_client *, u64); -u64 nfsd_print_client_delegations(struct nfs4_client *, u64); + +u64 nfsd_inject_print_clients(void); +u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_clients(u64); + +u64 nfsd_inject_print_locks(void); +u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_locks(u64); + +u64 nfsd_inject_print_openowners(void); +u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_openowners(u64); + +u64 nfsd_inject_print_delegations(void); +u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t); +u64 nfsd_inject_forget_delegations(u64); +u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t); +u64 nfsd_inject_recall_delegations(u64); #else /* CONFIG_NFSD_FAULT_INJECTION */ static inline int nfsd_fault_inject_init(void) { return 0; } static inline void nfsd_fault_inject_cleanup(void) {} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 140c496f612c..f501a9b5c9df 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -189,8 +189,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); dparent = fhp->fh_dentry; - exp = fhp->fh_export; - exp_get(exp); + exp = exp_get(fhp->fh_export); /* Lookup the name, but don't follow links */ if (isdotent(name, len)) { @@ -464,7 +463,7 @@ out_put_write_access: if (size_change) put_write_access(inode); if (!err) - commit_metadata(fhp); + err = nfserrno(commit_metadata(fhp)); out: return err; } @@ -820,7 +819,8 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) +static __be32 +nfsd_finish_read(struct file *file, unsigned long *count, int host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; @@ -831,7 +831,7 @@ __be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) return nfserrno(host_err); } -int nfsd_splice_read(struct svc_rqst *rqstp, +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct file *file, loff_t offset, unsigned long *count) { struct splice_desc sd = { @@ -847,7 +847,7 @@ int nfsd_splice_read(struct svc_rqst *rqstp, return nfsd_finish_read(file, count, host_err); } -int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, +__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { mm_segment_t oldfs; @@ -1121,7 +1121,8 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, iap->ia_valid &= ~(ATTR_UID|ATTR_GID); if (iap->ia_valid) return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); - return 0; + /* Callers expect file metadata to be committed here */ + return nfserrno(commit_metadata(resfhp)); } /* HPUX client sometimes creates a file in mode 000, and sets size to 0. @@ -1253,9 +1254,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, resfhp, iap); /* - * nfsd_setattr already committed the child. Transactional filesystems - * had a chance to commit changes for both parent and child - * simultaneously making the following commit_metadata a noop. + * nfsd_create_setattr already committed the child. Transactional + * filesystems had a chance to commit changes for both parent and + * child * simultaneously making the following commit_metadata a + * noop. */ err2 = nfserrno(commit_metadata(fhp)); if (err2) @@ -1426,7 +1428,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, resfhp, iap); /* - * nfsd_setattr already committed the child (and possibly also the parent). + * nfsd_create_setattr already committed the child + * (and possibly also the parent). */ if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1504,16 +1507,15 @@ out_nfserr: __be32 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, - char *path, int plen, - struct svc_fh *resfhp, - struct iattr *iap) + char *path, + struct svc_fh *resfhp) { struct dentry *dentry, *dnew; __be32 err, cerr; int host_err; err = nfserr_noent; - if (!flen || !plen) + if (!flen || path[0] == '\0') goto out; err = nfserr_exist; if (isdotent(fname, flen)) @@ -1534,18 +1536,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - if (unlikely(path[plen] != 0)) { - char *path_alloced = kmalloc(plen+1, GFP_KERNEL); - if (path_alloced == NULL) - host_err = -ENOMEM; - else { - strncpy(path_alloced, path, plen); - path_alloced[plen] = 0; - host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); - kfree(path_alloced); - } - } else - host_err = vfs_symlink(dentry->d_inode, dnew, path); + host_err = vfs_symlink(dentry->d_inode, dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -2093,8 +2084,7 @@ nfsd_racache_init(int cache_size) if (raparm_hash[0].pb_head) return 0; nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - if (nperbucket < 2) - nperbucket = 2; + nperbucket = max(2, nperbucket); cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 91b6ae3f658b..c2ff3f14e5f6 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -74,9 +74,9 @@ struct raparms; __be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, struct file **, struct raparms **); void nfsd_put_tmp_read_open(struct file *, struct raparms *); -int nfsd_splice_read(struct svc_rqst *, +__be32 nfsd_splice_read(struct svc_rqst *, struct file *, loff_t, unsigned long *); -int nfsd_readv(struct file *, loff_t, struct kvec *, int, +__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); @@ -85,8 +85,8 @@ __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, - char *name, int len, char *path, int plen, - struct svc_fh *res, struct iattr *); + char *name, int len, char *path, + struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); __be32 nfsd_rename(struct svc_rqst *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 18cbb6d9c8a9..465e7799742a 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -55,6 +55,7 @@ struct nfsd4_compound_state { struct svc_fh current_fh; struct svc_fh save_fh; struct nfs4_stateowner *replay_owner; + struct nfs4_client *clp; /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; @@ -107,8 +108,8 @@ struct nfsd4_create { u32 cr_type; /* request */ union { /* request */ struct { - u32 namelen; - char *name; + u32 datalen; + char *data; } link; /* NF4LNK */ struct { u32 specdata1; @@ -121,8 +122,8 @@ struct nfsd4_create { struct nfs4_acl *cr_acl; struct xdr_netobj cr_label; }; -#define cr_linklen u.link.namelen -#define cr_linkname u.link.name +#define cr_datalen u.link.datalen +#define cr_data u.link.data #define cr_specdata1 u.dev.specdata1 #define cr_specdata2 u.dev.specdata2 @@ -478,6 +479,14 @@ struct nfsd4_op { bool nfsd4_cache_this_op(struct nfsd4_op *); +/* + * Memory needed just for the duration of processing one compound: + */ +struct svcxdr_tmpbuf { + struct svcxdr_tmpbuf *next; + char buf[]; +}; + struct nfsd4_compoundargs { /* scratch variables for XDR decode */ __be32 * p; @@ -486,11 +495,7 @@ struct nfsd4_compoundargs { int pagelen; __be32 tmp[8]; __be32 * tmpp; - struct tmpbuf { - struct tmpbuf *next; - void (*release)(const void *); - void *buf; - } *to_free; + struct svcxdr_tmpbuf *to_free; struct svc_rqst *rqstp; @@ -574,7 +579,6 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); -extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); @@ -585,6 +589,7 @@ extern __be32 nfsd4_create_session(struct svc_rqst *, extern __be32 nfsd4_sequence(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_sequence *); +extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp); extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); @@ -594,7 +599,9 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open, struct nfsd_net *nn); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); -extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); +extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); +extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, @@ -625,6 +632,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); + #endif /* diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile index 85c98737a146..fc603e0431bb 100644 --- a/fs/nilfs2/Makefile +++ b/fs/nilfs2/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ btnode.o bmap.o btree.o direct.o dat.o recovery.o \ the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ - ifile.o alloc.o gcinode.o ioctl.o + ifile.o alloc.o gcinode.o ioctl.o sysfs.o diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 9bc72dec3fa6..0696161bf59d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -320,6 +320,14 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); int nilfs_init_gcinode(struct inode *inode); void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); +/* sysfs.c */ +int __init nilfs_sysfs_init(void); +void nilfs_sysfs_exit(void); +int nilfs_sysfs_create_device_group(struct super_block *); +void nilfs_sysfs_delete_device_group(struct the_nilfs *); +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *); +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *); + /* * Inodes and files operations */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8c532b2ca3ab..228f5bdf0772 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb, iput(inode); } } else { - dentry = d_obtain_alias(inode); + dentry = d_obtain_root(inode); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto failed_dentry; @@ -1452,13 +1452,19 @@ static int __init init_nilfs_fs(void) if (err) goto fail; - err = register_filesystem(&nilfs_fs_type); + err = nilfs_sysfs_init(); if (err) goto free_cachep; + err = register_filesystem(&nilfs_fs_type); + if (err) + goto deinit_sysfs_entry; + printk(KERN_INFO "NILFS version 2 loaded\n"); return 0; +deinit_sysfs_entry: + nilfs_sysfs_exit(); free_cachep: nilfs_destroy_cachep(); fail: @@ -1468,6 +1474,7 @@ fail: static void __exit exit_nilfs_fs(void) { nilfs_destroy_cachep(); + nilfs_sysfs_exit(); unregister_filesystem(&nilfs_fs_type); } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c new file mode 100644 index 000000000000..bbb0dcc35905 --- /dev/null +++ b/fs/nilfs2/sysfs.c @@ -0,0 +1,1137 @@ +/* + * sysfs.c - sysfs support implementation. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> + */ + +#include <linux/kobject.h> + +#include "nilfs.h" +#include "mdt.h" +#include "sufile.h" +#include "cpfile.h" +#include "sysfs.h" + +/* /sys/fs/<nilfs>/ */ +static struct kset *nilfs_kset; + +#define NILFS_SHOW_TIME(time_t_val, buf) ({ \ + struct tm res; \ + int count = 0; \ + time_to_tm(time_t_val, 0, &res); \ + res.tm_year += 1900; \ + res.tm_mon += 1; \ + count = scnprintf(buf, PAGE_SIZE, \ + "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \ + res.tm_year, res.tm_mon, res.tm_mday, \ + res.tm_hour, res.tm_min, res.tm_sec);\ + count; \ +}) + +#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \ +static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \ + struct attribute *attr, char *buf) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->show ? a->show(a, nilfs, buf) : 0; \ +} \ +static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->store ? a->store(a, nilfs, buf, len) : 0; \ +} \ +static const struct sysfs_ops nilfs_##name##_attr_ops = { \ + .show = nilfs_##name##_attr_show, \ + .store = nilfs_##name##_attr_store, \ +}; + +#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ +static void nilfs_##name##_attr_release(struct kobject *kobj) \ +{ \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + complete(&subgroups->sg_##name##_kobj_unregister); \ +} \ +static struct kobj_type nilfs_##name##_ktype = { \ + .default_attrs = nilfs_##name##_attrs, \ + .sysfs_ops = &nilfs_##name##_attr_ops, \ + .release = nilfs_##name##_attr_release, \ +}; + +#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ +static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ +{ \ + struct kobject *parent; \ + struct kobject *kobj; \ + struct completion *kobj_unregister; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + int err; \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + kobj = &subgroups->sg_##name##_kobj; \ + kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \ + parent = &nilfs->ns_##parent_name##_kobj; \ + kobj->kset = nilfs_kset; \ + init_completion(kobj_unregister); \ + err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ + #name); \ + if (err) \ + return err; \ + return 0; \ +} \ +static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ +{ \ + kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ +} + +/************************************************************************ + * NILFS snapshot attrs * + ************************************************************************/ + +static ssize_t +nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->inodes_count)); +} + +static ssize_t +nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->blocks_count)); +} + +static const char snapshot_readme_str[] = + "The group contains details about mounted snapshot.\n\n" + "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n" + "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n"; + +static ssize_t +nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, snapshot_readme_str); +} + +NILFS_SNAPSHOT_RO_ATTR(inodes_count); +NILFS_SNAPSHOT_RO_ATTR(blocks_count); +NILFS_SNAPSHOT_RO_ATTR(README); + +static struct attribute *nilfs_snapshot_attrs[] = { + NILFS_SNAPSHOT_ATTR_LIST(inodes_count), + NILFS_SNAPSHOT_ATTR_LIST(blocks_count), + NILFS_SNAPSHOT_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->show ? a->show(a, root, buf) : 0; +} + +static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->store ? a->store(a, root, buf, len) : 0; +} + +static void nilfs_snapshot_attr_release(struct kobject *kobj) +{ + struct nilfs_root *root = container_of(kobj, struct nilfs_root, + snapshot_kobj); + complete(&root->snapshot_kobj_unregister); +} + +static const struct sysfs_ops nilfs_snapshot_attr_ops = { + .show = nilfs_snapshot_attr_show, + .store = nilfs_snapshot_attr_store, +}; + +static struct kobj_type nilfs_snapshot_ktype = { + .default_attrs = nilfs_snapshot_attrs, + .sysfs_ops = &nilfs_snapshot_attr_ops, + .release = nilfs_snapshot_attr_release, +}; + +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) +{ + struct the_nilfs *nilfs; + struct kobject *parent; + int err; + + nilfs = root->nilfs; + parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj; + root->snapshot_kobj.kset = nilfs_kset; + init_completion(&root->snapshot_kobj_unregister); + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + &nilfs->ns_dev_kobj, + "current_checkpoint"); + } else { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + parent, + "%llu", root->cno); + } + + if (err) + return err; + + return 0; +} + +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) +{ + kobject_del(&root->snapshot_kobj); +} + +/************************************************************************ + * NILFS mounted snapshots attrs * + ************************************************************************/ + +static const char mounted_snapshots_readme_str[] = + "The mounted_snapshots group contains group for\n" + "every mounted snapshot.\n"; + +static ssize_t +nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str); +} + +NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README); + +static struct attribute *nilfs_mounted_snapshots_attrs[] = { + NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev); + +/************************************************************************ + * NILFS checkpoints attrs * + ************************************************************************/ + +static ssize_t +nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 ncheckpoints; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + ncheckpoints = cpstat.cs_ncps; + + return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints); +} + +static ssize_t +nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nsnapshots; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + nsnapshots = cpstat.cs_nsss; + + return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots); +} + +static ssize_t +nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static const char checkpoints_readme_str[] = + "The checkpoints group contains attributes that describe\n" + "details about volume's checkpoints.\n\n" + "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n" + "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) next_checkpoint\n\tshow next checkpoint number.\n\n"; + +static ssize_t +nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, checkpoints_readme_str); +} + +NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number); +NILFS_CHECKPOINTS_RO_ATTR(snapshots_number); +NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(README); + +static struct attribute *nilfs_checkpoints_attrs[] = { + NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number), + NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number), + NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); +NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); +NILFS_DEV_INT_GROUP_FNS(checkpoints, dev); + +/************************************************************************ + * NILFS segments attrs * + ************************************************************************/ + +static ssize_t +nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments); +} + +static ssize_t +nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment); +} + +static ssize_t +nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs); +} + +static ssize_t +nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_sustat sustat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n", + err); + return err; + } + + return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs); +} + +static const char segments_readme_str[] = + "The segments group contains attributes that describe\n" + "details about volume's segments.\n\n" + "(1) segments_number\n\tshow number of segments on volume.\n\n" + "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n" + "(3) clean_segments\n\tshow count of clean segments.\n\n" + "(4) dirty_segments\n\tshow count of dirty segments.\n\n"; + +static ssize_t +nilfs_segments_README_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, segments_readme_str); +} + +NILFS_SEGMENTS_RO_ATTR(segments_number); +NILFS_SEGMENTS_RO_ATTR(blocks_per_segment); +NILFS_SEGMENTS_RO_ATTR(clean_segments); +NILFS_SEGMENTS_RO_ATTR(dirty_segments); +NILFS_SEGMENTS_RO_ATTR(README); + +static struct attribute *nilfs_segments_attrs[] = { + NILFS_SEGMENTS_ATTR_LIST(segments_number), + NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment), + NILFS_SEGMENTS_ATTR_LIST(clean_segments), + NILFS_SEGMENTS_ATTR_LIST(dirty_segments), + NILFS_SEGMENTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segments, dev); +NILFS_DEV_INT_GROUP_TYPE(segments, dev); +NILFS_DEV_INT_GROUP_FNS(segments, dev); + +/************************************************************************ + * NILFS segctor attrs * + ************************************************************************/ + +static ssize_t +nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t last_pseg; + + spin_lock(&nilfs->ns_last_segment_lock); + last_pseg = nilfs->ns_last_pseg; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)last_pseg); +} + +static ssize_t +nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 last_seq; + + spin_lock(&nilfs->ns_last_segment_lock); + last_seq = nilfs->ns_last_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq); +} + +static ssize_t +nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 seg_seq; + + down_read(&nilfs->ns_sem); + seg_seq = nilfs->ns_seg_seq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); +} + +static ssize_t +nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 segnum; + + down_read(&nilfs->ns_sem); + segnum = nilfs->ns_segnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); +} + +static ssize_t +nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nextnum; + + down_read(&nilfs->ns_sem); + nextnum = nilfs->ns_nextnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); +} + +static ssize_t +nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long pseg_offset; + + down_read(&nilfs->ns_sem); + pseg_offset = nilfs->ns_pseg_offset; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); +} + +static ssize_t +nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(ctime, buf); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(nongc_ctime, buf); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)nongc_ctime); +} + +static ssize_t +nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u32 ndirtyblks; + + down_read(&nilfs->ns_sem); + ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); +} + +static const char segctor_readme_str[] = + "The segctor group contains attributes that describe\n" + "segctor thread activity details.\n\n" + "(1) last_pseg_block\n" + "\tshow start block number of the latest segment.\n\n" + "(2) last_seg_sequence\n" + "\tshow sequence value of the latest segment.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) current_seg_sequence\n\tshow segment sequence counter.\n\n" + "(5) current_last_full_seg\n" + "\tshow index number of the latest full segment.\n\n" + "(6) next_full_seg\n" + "\tshow index number of the full segment index to be used next.\n\n" + "(7) next_pseg_offset\n" + "\tshow offset of next partial segment in the current full segment.\n\n" + "(8) next_checkpoint\n\tshow next checkpoint number.\n\n" + "(9) last_seg_write_time\n" + "\tshow write time of the last segment in human-readable format.\n\n" + "(10) last_seg_write_time_secs\n" + "\tshow write time of the last segment in seconds.\n\n" + "(11) last_nongc_write_time\n" + "\tshow write time of the last segment not for cleaner operation " + "in human-readable format.\n\n" + "(12) last_nongc_write_time_secs\n" + "\tshow write time of the last segment not for cleaner operation " + "in seconds.\n\n" + "(13) dirty_data_blocks_count\n" + "\tshow number of dirty data blocks.\n\n"; + +static ssize_t +nilfs_segctor_README_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, segctor_readme_str); +} + +NILFS_SEGCTOR_RO_ATTR(last_pseg_block); +NILFS_SEGCTOR_RO_ATTR(last_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint); +NILFS_SEGCTOR_RO_ATTR(current_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(current_last_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_pseg_offset); +NILFS_SEGCTOR_RO_ATTR(next_checkpoint); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count); +NILFS_SEGCTOR_RO_ATTR(README); + +static struct attribute *nilfs_segctor_attrs[] = { + NILFS_SEGCTOR_ATTR_LIST(last_pseg_block), + NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset), + NILFS_SEGCTOR_ATTR_LIST(next_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count), + NILFS_SEGCTOR_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segctor, dev); +NILFS_DEV_INT_GROUP_TYPE(segctor, dev); +NILFS_DEV_INT_GROUP_FNS(segctor, dev); + +/************************************************************************ + * NILFS superblock attrs * + ************************************************************************/ + +static ssize_t +nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(sbwtime, buf); +} + +static ssize_t +nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); +} + +static ssize_t +nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sbwcount; + + down_read(&nilfs->ns_sem); + sbwcount = nilfs->ns_sbwcount; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sb_update_freq; + + down_read(&nilfs->ns_sem); + sb_update_freq = nilfs->ns_sb_update_freq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + const char *buf, size_t count) +{ + unsigned val; + int err; + + err = kstrtouint(skip_spaces(buf), 0, &val); + if (err) { + printk(KERN_ERR "NILFS: unable to convert string: err=%d\n", + err); + return err; + } + + if (val < NILFS_SB_FREQ) { + val = NILFS_SB_FREQ; + printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n"); + } + + down_write(&nilfs->ns_sem); + nilfs->ns_sb_update_freq = val; + up_write(&nilfs->ns_sem); + + return count; +} + +static const char sb_readme_str[] = + "The superblock group contains attributes that describe\n" + "superblock's details.\n\n" + "(1) sb_write_time\n\tshow previous write time of super block " + "in human-readable format.\n\n" + "(2) sb_write_time_secs\n\tshow previous write time of super block " + "in seconds.\n\n" + "(3) sb_write_count\n\tshow write count of super block.\n\n" + "(4) sb_update_frequency\n" + "\tshow/set interval of periodical update of superblock (in seconds).\n\n" + "\tYou can set preferable frequency of superblock update by command:\n\n" + "\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n"; + +static ssize_t +nilfs_superblock_README_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, sb_readme_str); +} + +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_count); +NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency); +NILFS_SUPERBLOCK_RO_ATTR(README); + +static struct attribute *nilfs_superblock_attrs[] = { + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count), + NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency), + NILFS_SUPERBLOCK_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(superblock, dev); +NILFS_DEV_INT_GROUP_TYPE(superblock, dev); +NILFS_DEV_INT_GROUP_FNS(superblock, dev); + +/************************************************************************ + * NILFS device attrs * + ************************************************************************/ + +static +ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u32 major = le32_to_cpu(sbp[0]->s_rev_level); + u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + + return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor); +} + +static +ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize); +} + +static +ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + + return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size); +} + +static +ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t free_blocks = 0; + + nilfs_count_free_blocks(nilfs, &free_blocks); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)free_blocks); +} + +static +ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid); +} + +static +ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", + sbp[0]->s_volume_name); +} + +static const char dev_readme_str[] = + "The <device> group contains attributes that describe file system\n" + "partition's details.\n\n" + "(1) revision\n\tshow NILFS file system revision.\n\n" + "(2) blocksize\n\tshow volume block size in bytes.\n\n" + "(3) device_size\n\tshow volume size in bytes.\n\n" + "(4) free_blocks\n\tshow count of free blocks on volume.\n\n" + "(5) uuid\n\tshow volume's UUID.\n\n" + "(6) volume_name\n\tshow volume's name.\n\n"; + +static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, dev_readme_str); +} + +NILFS_DEV_RO_ATTR(revision); +NILFS_DEV_RO_ATTR(blocksize); +NILFS_DEV_RO_ATTR(device_size); +NILFS_DEV_RO_ATTR(free_blocks); +NILFS_DEV_RO_ATTR(uuid); +NILFS_DEV_RO_ATTR(volume_name); +NILFS_DEV_RO_ATTR(README); + +static struct attribute *nilfs_dev_attrs[] = { + NILFS_DEV_ATTR_LIST(revision), + NILFS_DEV_ATTR_LIST(blocksize), + NILFS_DEV_ATTR_LIST(device_size), + NILFS_DEV_ATTR_LIST(free_blocks), + NILFS_DEV_ATTR_LIST(uuid), + NILFS_DEV_ATTR_LIST(volume_name), + NILFS_DEV_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_dev_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->show ? a->show(a, nilfs, buf) : 0; +} + +static ssize_t nilfs_dev_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->store ? a->store(a, nilfs, buf, len) : 0; +} + +static void nilfs_dev_attr_release(struct kobject *kobj) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + complete(&nilfs->ns_dev_kobj_unregister); +} + +static const struct sysfs_ops nilfs_dev_attr_ops = { + .show = nilfs_dev_attr_show, + .store = nilfs_dev_attr_store, +}; + +static struct kobj_type nilfs_dev_ktype = { + .default_attrs = nilfs_dev_attrs, + .sysfs_ops = &nilfs_dev_attr_ops, + .release = nilfs_dev_attr_release, +}; + +int nilfs_sysfs_create_device_group(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups); + int err; + + nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); + if (unlikely(!nilfs->ns_dev_subgroups)) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to allocate memory for device group\n"); + goto failed_create_device_group; + } + + nilfs->ns_dev_kobj.kset = nilfs_kset; + init_completion(&nilfs->ns_dev_kobj_unregister); + err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, + "%s", sb->s_id); + if (err) + goto free_dev_subgroups; + + err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); + if (err) + goto cleanup_dev_kobject; + + err = nilfs_sysfs_create_checkpoints_group(nilfs); + if (err) + goto delete_mounted_snapshots_group; + + err = nilfs_sysfs_create_segments_group(nilfs); + if (err) + goto delete_checkpoints_group; + + err = nilfs_sysfs_create_superblock_group(nilfs); + if (err) + goto delete_segments_group; + + err = nilfs_sysfs_create_segctor_group(nilfs); + if (err) + goto delete_superblock_group; + + return 0; + +delete_superblock_group: + nilfs_sysfs_delete_superblock_group(nilfs); + +delete_segments_group: + nilfs_sysfs_delete_segments_group(nilfs); + +delete_checkpoints_group: + nilfs_sysfs_delete_checkpoints_group(nilfs); + +delete_mounted_snapshots_group: + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + +cleanup_dev_kobject: + kobject_del(&nilfs->ns_dev_kobj); + +free_dev_subgroups: + kfree(nilfs->ns_dev_subgroups); + +failed_create_device_group: + return err; +} + +void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) +{ + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + nilfs_sysfs_delete_checkpoints_group(nilfs); + nilfs_sysfs_delete_segments_group(nilfs); + nilfs_sysfs_delete_superblock_group(nilfs); + nilfs_sysfs_delete_segctor_group(nilfs); + kobject_del(&nilfs->ns_dev_kobj); + kfree(nilfs->ns_dev_subgroups); +} + +/************************************************************************ + * NILFS feature attrs * + ************************************************************************/ + +static ssize_t nilfs_feature_revision_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d.%d\n", + NILFS_CURRENT_REV, NILFS_MINOR_REV); +} + +static const char features_readme_str[] = + "The features group contains attributes that describe NILFS file\n" + "system driver features.\n\n" + "(1) revision\n\tshow current revision of NILFS file system driver.\n"; + +static ssize_t nilfs_feature_README_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, features_readme_str); +} + +NILFS_FEATURE_RO_ATTR(revision); +NILFS_FEATURE_RO_ATTR(README); + +static struct attribute *nilfs_feature_attrs[] = { + NILFS_FEATURE_ATTR_LIST(revision), + NILFS_FEATURE_ATTR_LIST(README), + NULL, +}; + +static const struct attribute_group nilfs_feature_attr_group = { + .name = "features", + .attrs = nilfs_feature_attrs, +}; + +int __init nilfs_sysfs_init(void) +{ + int err; + + nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj); + if (!nilfs_kset) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n", + err); + goto failed_sysfs_init; + } + + err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: unable to create feature group: err %d\n", + err); + goto cleanup_sysfs_init; + } + + return 0; + +cleanup_sysfs_init: + kset_unregister(nilfs_kset); + +failed_sysfs_init: + return err; +} + +void nilfs_sysfs_exit(void) +{ + sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + kset_unregister(nilfs_kset); +} diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h new file mode 100644 index 000000000000..677e3a1a8370 --- /dev/null +++ b/fs/nilfs2/sysfs.h @@ -0,0 +1,176 @@ +/* + * sysfs.h - sysfs support declarations. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> + */ + +#ifndef _NILFS_SYSFS_H +#define _NILFS_SYSFS_H + +#include <linux/sysfs.h> + +#define NILFS_ROOT_GROUP_NAME "nilfs2" + +/* + * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects + * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock + * @sg_superblock_kobj_unregister: completion state + * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor + * @sg_segctor_kobj_unregister: completion state + * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots + * @sg_mounted_snapshots_kobj_unregister: completion state + * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints + * @sg_checkpoints_kobj_unregister: completion state + * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments + * @sg_segments_kobj_unregister: completion state + */ +struct nilfs_sysfs_dev_subgroups { + /* /sys/fs/<nilfs>/<device>/superblock */ + struct kobject sg_superblock_kobj; + struct completion sg_superblock_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/segctor */ + struct kobject sg_segctor_kobj; + struct completion sg_segctor_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/mounted_snapshots */ + struct kobject sg_mounted_snapshots_kobj; + struct completion sg_mounted_snapshots_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/checkpoints */ + struct kobject sg_checkpoints_kobj; + struct completion sg_checkpoints_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/segments */ + struct kobject sg_segments_kobj; + struct completion sg_segments_kobj_unregister; +}; + +#define NILFS_COMMON_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct kobject *, struct attribute *, \ + char *); \ + ssize_t (*store)(struct kobject *, struct attribute *, \ + const char *, size_t); \ +}; + +NILFS_COMMON_ATTR_STRUCT(feature); + +#define NILFS_DEV_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + const char *, size_t); \ +}; + +NILFS_DEV_ATTR_STRUCT(dev); +NILFS_DEV_ATTR_STRUCT(segments); +NILFS_DEV_ATTR_STRUCT(mounted_snapshots); +NILFS_DEV_ATTR_STRUCT(checkpoints); +NILFS_DEV_ATTR_STRUCT(superblock); +NILFS_DEV_ATTR_STRUCT(segctor); + +#define NILFS_CP_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + const char *, size_t); \ +}; + +NILFS_CP_ATTR_STRUCT(snapshot); + +#define NILFS_ATTR(type, name, mode, show, store) \ + static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \ + __ATTR(name, mode, show, store) + +#define NILFS_INFO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, NULL, NULL) +#define NILFS_RO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL) +#define NILFS_RW_ATTR(type, name) \ + NILFS_ATTR(type, name, 0644, \ + nilfs_##type##_##name##_show, \ + nilfs_##type##_##name##_store) + +#define NILFS_FEATURE_INFO_ATTR(name) \ + NILFS_INFO_ATTR(feature, name) +#define NILFS_FEATURE_RO_ATTR(name) \ + NILFS_RO_ATTR(feature, name) +#define NILFS_FEATURE_RW_ATTR(name) \ + NILFS_RW_ATTR(feature, name) + +#define NILFS_DEV_INFO_ATTR(name) \ + NILFS_INFO_ATTR(dev, name) +#define NILFS_DEV_RO_ATTR(name) \ + NILFS_RO_ATTR(dev, name) +#define NILFS_DEV_RW_ATTR(name) \ + NILFS_RW_ATTR(dev, name) + +#define NILFS_SEGMENTS_RO_ATTR(name) \ + NILFS_RO_ATTR(segments, name) +#define NILFS_SEGMENTS_RW_ATTR(name) \ + NILFS_RW_ATTR(segs_info, name) + +#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \ + NILFS_RO_ATTR(mounted_snapshots, name) + +#define NILFS_CHECKPOINTS_RO_ATTR(name) \ + NILFS_RO_ATTR(checkpoints, name) +#define NILFS_CHECKPOINTS_RW_ATTR(name) \ + NILFS_RW_ATTR(checkpoints, name) + +#define NILFS_SNAPSHOT_INFO_ATTR(name) \ + NILFS_INFO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RO_ATTR(name) \ + NILFS_RO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RW_ATTR(name) \ + NILFS_RW_ATTR(snapshot, name) + +#define NILFS_SUPERBLOCK_RO_ATTR(name) \ + NILFS_RO_ATTR(superblock, name) +#define NILFS_SUPERBLOCK_RW_ATTR(name) \ + NILFS_RW_ATTR(superblock, name) + +#define NILFS_SEGCTOR_INFO_ATTR(name) \ + NILFS_INFO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RO_ATTR(name) \ + NILFS_RO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RW_ATTR(name) \ + NILFS_RW_ATTR(segctor, name) + +#define NILFS_FEATURE_ATTR_LIST(name) \ + (&nilfs_feature_attr_##name.attr) +#define NILFS_DEV_ATTR_LIST(name) \ + (&nilfs_dev_attr_##name.attr) +#define NILFS_SEGMENTS_ATTR_LIST(name) \ + (&nilfs_segments_attr_##name.attr) +#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \ + (&nilfs_mounted_snapshots_attr_##name.attr) +#define NILFS_CHECKPOINTS_ATTR_LIST(name) \ + (&nilfs_checkpoints_attr_##name.attr) +#define NILFS_SNAPSHOT_ATTR_LIST(name) \ + (&nilfs_snapshot_attr_##name.attr) +#define NILFS_SUPERBLOCK_ATTR_LIST(name) \ + (&nilfs_superblock_attr_##name.attr) +#define NILFS_SEGCTOR_ATTR_LIST(name) \ + (&nilfs_segctor_attr_##name.attr) + +#endif /* _NILFS_SYSFS_H */ diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8ba8229ba076..9da25fe9ea61 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -85,6 +85,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_cptree = RB_ROOT; spin_lock_init(&nilfs->ns_cptree_lock); init_rwsem(&nilfs->ns_segctor_sem); + nilfs->ns_sb_update_freq = NILFS_SB_FREQ; return nilfs; } @@ -97,6 +98,7 @@ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { + nilfs_sysfs_delete_device_group(nilfs); brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } @@ -640,6 +642,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; + err = nilfs_sysfs_create_device_group(sb); + if (err) + goto failed_sbh; + set_nilfs_init(nilfs); err = 0; out: @@ -740,12 +746,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node **p, *parent; struct nilfs_root *root, *new; + int err; root = nilfs_lookup_root(nilfs, cno); if (root) return root; - new = kmalloc(sizeof(*root), GFP_KERNEL); + new = kzalloc(sizeof(*root), GFP_KERNEL); if (!new) return NULL; @@ -782,6 +789,12 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) spin_unlock(&nilfs->ns_cptree_lock); + err = nilfs_sysfs_create_snapshot_group(new); + if (err) { + kfree(new); + new = NULL; + } + return new; } @@ -790,6 +803,8 @@ void nilfs_put_root(struct nilfs_root *root) if (atomic_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; + nilfs_sysfs_delete_snapshot_group(root); + spin_lock(&nilfs->ns_cptree_lock); rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index de8cc53b4a5c..d01ead1bea9a 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -33,6 +33,7 @@ #include <linux/slab.h> struct nilfs_sc_info; +struct nilfs_sysfs_dev_subgroups; /* the_nilfs struct */ enum { @@ -54,6 +55,7 @@ enum { * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block * @ns_mount_state: file system state + * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds) * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment. * @ns_nextnum: index number of the full segment index to be used next @@ -95,6 +97,9 @@ enum { * @ns_inode_size: size of on-disk inode * @ns_first_ino: first not-special inode number * @ns_crc_seed: seed value of CRC32 calculation + * @ns_dev_kobj: /sys/fs/<nilfs>/<device> + * @ns_dev_kobj_unregister: completion state + * @ns_dev_subgroups: <device> subgroups pointer */ struct the_nilfs { unsigned long ns_flags; @@ -114,6 +119,7 @@ struct the_nilfs { unsigned ns_sbwcount; unsigned ns_sbsize; unsigned ns_mount_state; + unsigned ns_sb_update_freq; /* * Following fields are dedicated to a writable FS-instance. @@ -188,6 +194,11 @@ struct the_nilfs { int ns_inode_size; int ns_first_ino; u32 ns_crc_seed; + + /* /sys/fs/<nilfs>/<device> */ + struct kobject ns_dev_kobj; + struct completion ns_dev_kobj_unregister; + struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; }; #define THE_NILFS_FNS(bit, name) \ @@ -232,6 +243,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) * @ifile: inode file * @inodes_count: number of inodes * @blocks_count: number of blocks + * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> + * @snapshot_kobj_unregister: completion state for kernel object */ struct nilfs_root { __u64 cno; @@ -243,6 +256,10 @@ struct nilfs_root { atomic64_t inodes_count; atomic64_t blocks_count; + + /* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */ + struct kobject snapshot_kobj; + struct completion snapshot_kobj_unregister; }; /* Special checkpoint number */ @@ -254,7 +271,8 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); - return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; + return t < nilfs->ns_sbwtime || + t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; } static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index ee9cb3795c2b..30d3addfad75 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group, wait_event(group->fanotify_data.access_waitq, event->response || atomic_read(&group->fanotify_data.bypass_perm)); - if (!event->response) /* bypass_perm set */ + if (!event->response) { /* bypass_perm set */ + /* + * Event was canceled because group is being destroyed. Remove + * it from group's event list because we are responsible for + * freeing the permission event. + */ + fsnotify_remove_event(group, &event->fae.fse); return 0; + } /* userspace responded, convert to something usable */ switch (event->response) { @@ -210,7 +217,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, return -ENOMEM; fsn_event = &event->fse; - ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3fdc8a3e1134..b13992a41bd9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - return fsnotify_remove_notify_event(group); + return fsnotify_remove_first_event(group); } static int create_fd(struct fsnotify_group *group, @@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + /* + * There may be still new events arriving in the notification queue + * but since userspace cannot use fanotify fd anymore, no event can + * enter or leave access_list by now. + */ spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); @@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file) } spin_unlock(&group->fanotify_data.access_lock); + /* + * Since bypass_perm is set, newly queued events will not wait for + * access response. Wake up the already sleeping ones now. + * synchronize_srcu() in fsnotify_destroy_group() will wait for all + * processes sleeping in fanotify_handle_event() waiting for access + * response and thus also for all permission events to be freed. + */ wake_up(&group->fanotify_data.access_waitq); #endif diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 74825be65b7b..9ce062218de9 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); + hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 43ab1e1a07a2..0f88bc0b4e6c 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group, if (len) strcpy(event->name, file_name); - ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cc423a30a0c8..daf76652fe58 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) return NULL; - event = fsnotify_peek_notify_event(group); + event = fsnotify_peek_first_event(group); pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, /* held the notification_mutex the whole time, so this is the * same event we peeked above */ - fsnotify_remove_notify_event(group); + fsnotify_remove_first_event(group); return event; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 1e58402171a5..a95d8e037aeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - + /* If the event is still queued, we have a problem... */ + WARN_ON(!list_empty(&event->list)); group->ops->free_event(event); } @@ -83,10 +84,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * added to the queue, 1 if the event was merged with some other queued event, * 2 if the queue of events has overflown. */ -int fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; @@ -125,10 +126,25 @@ queue: } /* + * Remove @event from group's notification queue. It is the responsibility of + * the caller to destroy the event. + */ +void fsnotify_remove_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + mutex_lock(&group->notification_mutex); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + group->q_len--; + } + mutex_unlock(&group->notification_mutex); +} + +/* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ -struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; @@ -140,7 +156,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group struct fsnotify_event, list); /* * We need to init list head for the case of overflow event so that - * check in fsnotify_add_notify_events() works + * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; @@ -149,9 +165,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group } /* - * This will not remove the event, that must be done with fsnotify_remove_notify_event() + * This will not remove the event, that must be done with + * fsnotify_remove_first_event() */ -struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { BUG_ON(!mutex_is_locked(&group->notification_mutex)); @@ -169,7 +186,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group) mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { - event = fsnotify_remove_notify_event(group); + event = fsnotify_remove_first_event(group); fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 68ca5a8704b5..ac851e8376b1 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); + hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5c9e2c81cb11..f5ec1ce7a532 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * ntfs_attr_extend_initialized - extend the initialized size of an attribute * @ni: ntfs inode of the attribute to extend * @new_init_size: requested new initialized size in bytes - * @cached_page: store any allocated but unused page here - * @lru_pvec: lru-buffering pagevec of the caller * * Extend the initialized size of an attribute described by the ntfs inode @ni * to @new_init_size bytes. This involves zeroing any non-sparse space between @@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * @nr_pages: number of page cache pages to obtain * @pages: array of pages in which to return the obtained page cache pages * @cached_page: allocated but as yet unused page - * @lru_pvec: lru-buffering pagevec of caller * * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9d8fcf2f3b94..a93bf9892256 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4961,6 +4961,15 @@ leftright: el = path_leaf_el(path); split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u " + "which can no longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } goto leftright; } out: @@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle, el = path_leaf_el(left_path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu: split at cpos %u lost record.", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index a106b3f2b22a..fae17c640df3 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -331,6 +331,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 39efc5057a36..3fcf205ee900 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - if (total_backoff > - msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { status = -ERESTARTSYS; mlog(ML_NOTICE, "Timed out joining dlm domain " "%s after %u msecs\n", dlm->name, - jiffies_to_msecs(total_backoff)); + total_backoff); goto bail; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3087a21d32f9..3ec906ef5d9a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -1603,7 +1641,8 @@ send_response: mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } + } else + dlm_lockres_grab_inflight_worker(dlm, res); } else { if (res) dlm_lockres_put(res); @@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -2364,6 +2405,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, if (res->state & DLM_LOCK_RES_MIGRATING) return 0; + /* delay migration when the lockres is in RECOCERING state */ + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + if (res->owner != dlm->node_num) return 0; @@ -3088,11 +3133,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5de019437ea5..45067faf5695 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); - } + } else + __dlm_lockres_grab_inflight_worker(dlm, res); } else /* put.. incase we are not the master */ dlm_lockres_put(res); spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869de829d..69aac6f088ad 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52cf5c9..2e3c9dbab68c 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. */ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 599eb4c4c8be..6219aaadeb08 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2060fc398445..8add6f1030d7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) return inode; } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir, sigset_t oldset; int did_block_signals = 0; struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); @@ -469,6 +487,9 @@ leave: * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -991,6 +1012,65 @@ leave: return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* * The only place this should be used is rename! * if they have the same id, then the 1st one is the only one locked. @@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct inode *inode2) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); struct buffer_head **tmpbh; @@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ tmpbh = bh2; bh2 = bh1; @@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; /* At some point it might be nice to break this function up a * bit. */ @@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. */ @@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir, else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1864,6 +1980,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d6..6cb019b7c6a8 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 714e53b9cc66..d81f6e2a97f5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1424c151cccc..a88b2a4fcc85 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); - si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), GFP_KERNEL); if (!si->si_bh) { status = -ENOMEM; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c7a89cea5c5d..ddb662b32447 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); + ocfs2_truncate_log_shutdown(osb); + /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - /* - * During dismount, when it recovers another node it will call - * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. - */ - ocfs2_truncate_log_shutdown(osb); - ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index ec58c7659183..ba8819702c56 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -321,7 +321,7 @@ static int omfs_get_imap(struct super_block *sb) goto out; sbi->s_imap_size = array_size; - sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); if (!sbi->s_imap) goto nomem; diff --git a/fs/open.c b/fs/open.c index 36662d036237..d6fd3acde134 100644 --- a/fs/open.c +++ b/fs/open.c @@ -263,11 +263,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EPERM; /* - * We can not allow to do any fallocate operation on an active - * swapfile + * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) - ret = -ETXTBSY; + return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 239493ec718e..7151ea428041 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -23,6 +23,7 @@ proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o proc-y += self.o +proc-y += thread_self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 64db2bceac59..cd3653e4f35c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -297,15 +297,11 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", - a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); + a->cap[CAP_LAST_U32 - __capi]); } seq_putc(m, '\n'); } -/* Remove non-existent capabilities */ -#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ - CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) - static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -319,11 +315,6 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); - NORM_CAPS(cap_inheritable); - NORM_CAPS(cap_permitted); - NORM_CAPS(cap_effective); - NORM_CAPS(cap_bset); - render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); @@ -473,13 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, priority = task_prio(task); nice = task_nice(task); - /* Temporary variable needed for gcc-2.96 */ - /* convert timespec -> nsec*/ - start_time = - (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC - + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(start_time); + start_time = nsec_to_clock_t(task->real_start_time); seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); seq_put_decimal_ll(m, ' ', ppid); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2d696b0c93bf..baf852b648ad 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -105,7 +105,7 @@ */ struct pid_entry { - char *name; + const char *name; int len; umode_t mode; const struct inode_operations *iop; @@ -130,10 +130,6 @@ struct pid_entry { { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) -#define INF(NAME, MODE, read) \ - NOD(NAME, (S_IFREG|(MODE)), \ - NULL, &proc_info_file_operations, \ - { .proc_read = read } ) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ @@ -200,27 +196,32 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct task_struct *task, char *buffer) +static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return get_cmdline(task, buffer, PAGE_SIZE); + /* + * Rely on struct seq_operations::show() being called once + * per internal buffer allocation. See single_open(), traverse(). + */ + BUG_ON(m->size < PAGE_SIZE); + m->count += get_cmdline(task, m->buf, PAGE_SIZE); + return 0; } -static int proc_pid_auxv(struct task_struct *task, char *buffer) +static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); - int res = PTR_ERR(mm); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ - res = nwords * sizeof(mm->saved_auxv[0]); - if (res > PAGE_SIZE) - res = PAGE_SIZE; - memcpy(buffer, mm->saved_auxv, res); + seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); mmput(mm); - } - return res; + return 0; + } else + return PTR_ERR(mm); } @@ -229,7 +230,8 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol. If that fails, simply return the address. */ -static int proc_pid_wchan(struct task_struct *task, char *buffer) +static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; @@ -240,9 +242,9 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; else - return sprintf(buffer, "%lu", wchan); + return seq_printf(m, "%lu", wchan); else - return sprintf(buffer, "%s", symname); + return seq_printf(m, "%s", symname); } #endif /* CONFIG_KALLSYMS */ @@ -304,9 +306,10 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, /* * Provides /proc/PID/schedstat */ -static int proc_pid_schedstat(struct task_struct *task, char *buffer) +static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return sprintf(buffer, "%llu %llu %lu\n", + return seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@ -404,7 +407,8 @@ static const struct file_operations proc_cpuset_operations = { }; #endif -static int proc_oom_score(struct task_struct *task, char *buffer) +static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; @@ -414,12 +418,12 @@ static int proc_oom_score(struct task_struct *task, char *buffer) points = oom_badness(task, NULL, NULL, totalpages) * 1000 / totalpages; read_unlock(&tasklist_lock); - return sprintf(buffer, "%lu\n", points); + return seq_printf(m, "%lu\n", points); } struct limit_names { - char *name; - char *unit; + const char *name; + const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { @@ -442,12 +446,11 @@ static const struct limit_names lnames[RLIM_NLIMITS] = { }; /* Display limits for a process */ -static int proc_pid_limits(struct task_struct *task, char *buffer) +static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { unsigned int i; - int count = 0; unsigned long flags; - char *bufptr = buffer; struct rlimit rlim[RLIM_NLIMITS]; @@ -459,35 +462,34 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) /* * print the file header */ - count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + seq_printf(m, "%-25s %-20s %-20s %-10s\n", "Limit", "Soft Limit", "Hard Limit", "Units"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-25s %-20s ", + seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else - count += sprintf(&bufptr[count], "%-25s %-20lu ", + seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) - count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + seq_printf(m, "%-20s ", "unlimited"); else - count += sprintf(&bufptr[count], "%-20lu ", - rlim[i].rlim_max); + seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) - count += sprintf(&bufptr[count], "%-10s\n", - lnames[i].unit); + seq_printf(m, "%-10s\n", lnames[i].unit); else - count += sprintf(&bufptr[count], "\n"); + seq_putc(m, '\n'); } - return count; + return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK -static int proc_pid_syscall(struct task_struct *task, char *buffer) +static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { long nr; unsigned long args[6], sp, pc; @@ -496,11 +498,11 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) return res; if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) - res = sprintf(buffer, "running\n"); + seq_puts(m, "running\n"); else if (nr < 0) - res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); else - res = sprintf(buffer, + seq_printf(m, "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", nr, args[0], args[1], args[2], args[3], args[4], args[5], @@ -598,43 +600,6 @@ static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; -#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ - -static ssize_t proc_info_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - struct inode * inode = file_inode(file); - unsigned long page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); - - length = -ESRCH; - if (!task) - goto out_no_task; - - if (count > PROC_BLOCK_SIZE) - count = PROC_BLOCK_SIZE; - - length = -ENOMEM; - if (!(page = __get_free_page(GFP_TEMPORARY))) - goto out; - - length = PROC_I(inode)->op.proc_read(task, (char*)page); - - if (length >= 0) - length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); - free_page(page); -out: - put_task_struct(task); -out_no_task: - return length; -} - -static const struct file_operations proc_info_file_operations = { - .read = proc_info_read, - .llseek = generic_file_llseek, -}; - static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; @@ -2056,7 +2021,7 @@ static int show_timer(struct seq_file *m, void *v) struct k_itimer *timer; struct timers_private *tp = m->private; int notify; - static char *nstr[] = { + static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", @@ -2392,7 +2357,7 @@ static const struct file_operations proc_coredump_filter_operations = { #endif #ifdef CONFIG_TASK_IO_ACCOUNTING -static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; @@ -2416,7 +2381,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - result = sprintf(buffer, + result = seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2436,20 +2401,22 @@ out_unlock: return result; } -static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 0); + return do_io_accounting(task, m, 0); } -static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) { - return do_io_accounting(task, buffer, 1); + return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@ -2557,10 +2524,10 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@ -2569,9 +2536,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@ -2594,13 +2561,13 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), @@ -2611,7 +2578,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@ -2625,10 +2592,10 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tgid_io_accounting), + ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@ -2780,12 +2747,12 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = 0; + int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - tgid = name_to_int(dentry); + tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; @@ -2847,7 +2814,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) @@ -2859,14 +2826,19 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; - if (pos == TGID_OFFSET - 1) { + if (pos == TGID_OFFSET - 2) { struct inode *inode = ns->proc_self->d_inode; if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; - iter.tgid = 0; - } else { - iter.tgid = pos - TGID_OFFSET; + ctx->pos = pos = pos + 1; } + if (pos == TGID_OFFSET - 1) { + struct inode *inode = ns->proc_thread_self->d_inode; + if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; + } + iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; @@ -2895,19 +2867,22 @@ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif REG("environ", S_IRUSR, proc_environ_operations), - INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUGO, proc_pid_limits), + ONE("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), @@ -2932,13 +2907,13 @@ static const struct pid_entry tid_base_stuff[] = { DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, proc_pid_wchan), + ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, proc_pid_schedstat), + ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), @@ -2949,7 +2924,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, proc_oom_score), + ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL @@ -2960,10 +2935,10 @@ static const struct pid_entry tid_base_stuff[] = { REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUSR, proc_tid_io_accounting), + ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL - INF("hardwall", S_IRUGO, proc_pid_hardwall), + ONE("hardwall", S_IRUGO, proc_pid_hardwall), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), @@ -3033,7 +3008,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (!leader) goto out_no_task; - tid = name_to_int(dentry); + tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 0788d093f5d8..955bb55fab8c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -206,7 +206,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, { struct task_struct *task = get_proc_task(dir); int result = -ENOENT; - unsigned fd = name_to_int(dentry); + unsigned fd = name_to_int(&dentry->d_name); if (!task) goto out_no_task; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b7f268eb5f45..317b72641ebf 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -27,7 +27,7 @@ #include "internal.h" -DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_SPINLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -330,28 +330,28 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, nlink_t nlink) { struct proc_dir_entry *ent = NULL; - const char *fn = name; - unsigned int len; - - /* make sure name is valid */ - if (!name || !strlen(name)) - goto out; + const char *fn; + struct qstr qstr; if (xlate_proc_name(name, parent, &fn) != 0) goto out; + qstr.name = fn; + qstr.len = strlen(fn); + if (qstr.len == 0 || qstr.len >= 256) { + WARN(1, "name len %u\n", qstr.len); + return NULL; + } + if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { + WARN(1, "create '/proc/%s' by hand\n", qstr.name); + return NULL; + } - /* At this point there must not be any '/' characters beyond *fn */ - if (strchr(fn, '/')) - goto out; - - len = strlen(fn); - - ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); if (!ent) goto out; - memcpy(ent->name, fn, len + 1); - ent->namelen = len; + memcpy(ent->name, fn, qstr.len + 1); + ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 0adbc02d60e3..333080d7a671 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -442,6 +442,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) int proc_fill_super(struct super_block *s) { struct inode *root_inode; + int ret; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; @@ -463,5 +464,9 @@ int proc_fill_super(struct super_block *s) return -ENOMEM; } - return proc_setup_self(s); + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3ab6d14e71c5..7da13e49128a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,7 +52,6 @@ struct proc_dir_entry { union proc_op { int (*proc_get_link)(struct dentry *, struct path *); - int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); @@ -112,10 +111,10 @@ static inline int task_dumpable(struct task_struct *task) return 0; } -static inline unsigned name_to_int(struct dentry *dentry) +static inline unsigned name_to_int(const struct qstr *qstr) { - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; + const char *name = qstr->name; + int len = qstr->len; unsigned n = 0; if (len > 1 && *name == '0') @@ -178,8 +177,6 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i /* * generic.c */ -extern spinlock_t proc_subdir_lock; - extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, struct dentry *); @@ -234,6 +231,12 @@ static inline int proc_net_init(void) { return 0; } extern int proc_setup_self(struct super_block *); /* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 39e6ef32f0bd..6df8d0722c97 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -172,7 +172,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; - end = ALIGN(end, PAGE_SIZE); + end = PAGE_ALIGN(end); /* overlap check (because we have to align page */ list_for_each_entry(tmp, head, list) { if (tmp->type != KCORE_VMEMMAP) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7445af0b1aa3..aa1eee06420f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SHMEM)), + K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4677bb7dc7c2..a63af3e0a612 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir) rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { - ns = task_nsproxy(task); + task_lock(task); + ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); + task_unlock(task); } rcu_read_unlock(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 71290463a1d3..f92d5dd578a4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -632,7 +632,7 @@ out: return ret; } -static int scan(struct ctl_table_header *head, ctl_table *table, +static int scan(struct ctl_table_header *head, struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index cb761f010300..15f327bed8c6 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -18,7 +18,7 @@ /* * The /proc/tty directory inodes... */ -static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; +static struct proc_dir_entry *proc_tty_driver; /* * This is the handler for /proc/tty/drivers @@ -176,7 +176,7 @@ void __init proc_tty_init(void) { if (!proc_mkdir("tty", NULL)) return; - proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); + proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */ /* * /proc/tty/driver/serial reveals the exact character counts for * serial links which is just too easy to abuse for inferring diff --git a/fs/proc/root.c b/fs/proc/root.c index 5dbadecb234d..094e44d4a6be 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -149,6 +149,8 @@ static void proc_kill_sb(struct super_block *sb) ns = (struct pid_namespace *)sb->s_fs_info; if (ns->proc_self) dput(ns->proc_self); + if (ns->proc_thread_self) + dput(ns->proc_thread_self); kill_anon_super(sb); put_pid_ns(ns); } @@ -170,6 +172,7 @@ void __init proc_root_init(void) return; proc_self_init(); + proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -199,10 +202,10 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dir, dentry, flags)) return NULL; - return proc_pid_lookup(dir, dentry, flags); + return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9e5f0e..bf2d03f8fd3e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cfa63ee92c96..dfc791c42d64 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -925,15 +925,30 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; - unsigned long addr; + unsigned long addr = start; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); - for (addr = start; addr < end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); - if (err) - break; + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + unsigned long vm_end; + + if (!vma) { + vm_end = end; + } else { + vm_end = min(end, vma->vm_end); + if (vma->vm_flags & VM_SOFTDIRTY) + pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + } + + for (; addr < vm_end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } } + +out: return err; } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c new file mode 100644 index 000000000000..59075b509df3 --- /dev/null +++ b/fs/proc/thread_self.c @@ -0,0 +1,85 @@ +#include <linux/sched.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" + +/* + * /proc/thread_self: + */ +static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF]; + if (!pid) + return -ENOENT; + sprintf(tmp, "%d/task/%d", tgid, pid); + return readlink_copy(buffer, buflen, tmp); +} + +static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (pid) { + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d/task/%d", tgid, pid); + } + nd_set_link(nd, name); + return NULL; +} + +static const struct inode_operations proc_thread_self_inode_operations = { + .readlink = proc_thread_self_readlink, + .follow_link = proc_thread_self_follow_link, + .put_link = kfree_put_link, +}; + +static unsigned thread_self_inum; + +int proc_setup_thread_self(struct super_block *s) +{ + struct inode *root_inode = s->s_root->d_inode; + struct pid_namespace *ns = s->s_fs_info; + struct dentry *thread_self; + + mutex_lock(&root_inode->i_mutex); + thread_self = d_alloc_name(s->s_root, "thread-self"); + if (thread_self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = thread_self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_thread_self_inode_operations; + d_add(thread_self, inode); + } else { + dput(thread_self); + thread_self = ERR_PTR(-ENOMEM); + } + } else { + thread_self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(thread_self)) { + pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); + return PTR_ERR(thread_self); + } + ns->proc_thread_self = thread_self; + return 0; +} + +void __init proc_thread_self_init(void) +{ + proc_alloc_inum(&thread_self_inum); +} diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 382aa890e228..a90d6d354199 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -328,6 +328,82 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * virtually contiguous user-space in ELF layout. */ #ifdef CONFIG_MMU +/* + * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages + * reported as not being ram with the zero page. + * + * @vma: vm_area_struct describing requested mapping + * @from: start remapping from + * @pfn: page frame number to start remapping to + * @size: remapping size + * @prot: protection bits + * + * Returns zero on success, -EAGAIN on failure. + */ +static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long map_size; + unsigned long pos_start, pos_end, pos; + unsigned long zeropage_pfn = my_zero_pfn(0); + size_t len = 0; + + pos_start = pfn; + pos_end = pfn + (size >> PAGE_SHIFT); + + for (pos = pos_start; pos < pos_end; ++pos) { + if (!pfn_is_ram(pos)) { + /* + * We hit a page which is not ram. Remap the continuous + * region between pos_start and pos-1 and replace + * the non-ram page at pos with the zero page. + */ + if (pos > pos_start) { + /* Remap continuous region */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, + pos_start, map_size, + prot)) + goto fail; + len += map_size; + } + /* Remap the zero page */ + if (remap_oldmem_pfn_range(vma, from + len, + zeropage_pfn, + PAGE_SIZE, prot)) + goto fail; + len += PAGE_SIZE; + pos_start = pos + 1; + } + } + if (pos > pos_start) { + /* Remap the rest */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, pos_start, + map_size, prot)) + goto fail; + } + return 0; +fail: + do_munmap(vma->vm_mm, from, len); + return -EAGAIN; +} + +static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + /* + * Check if oldmem_pfn_is_ram was registered to avoid + * looping over all pages without a reason. + */ + if (oldmem_pfn_is_ram) + return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range(vma, from, pfn, size, prot); +} + static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; @@ -387,9 +463,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min_t(size_t, m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; - if (remap_oldmem_pfn_range(vma, vma->vm_start + len, - paddr >> PAGE_SHIFT, tsz, - vma->vm_page_prot)) + if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) goto fail; size -= tsz; start += tsz; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 1a81373947f3..73ca1740d839 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file, if (!task) goto err; - rcu_read_lock(); - nsp = task_nsproxy(task); + task_lock(task); + nsp = task->nsproxy; if (!nsp || !nsp->mnt_ns) { - rcu_read_unlock(); + task_unlock(task); put_task_struct(task); goto err; } ns = nsp->mnt_ns; get_mnt_ns(ns); - rcu_read_unlock(); - task_lock(task); if (!task->fs) { task_unlock(task); put_task_struct(task); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 34a1e5aa848c..9d7b9a83699e 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -394,7 +394,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) prot = pgprot_noncached(PAGE_KERNEL); - pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); + pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) { pr_err("%s: Failed to allocate array for %u pages\n", __func__, page_count); diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile index 9dd06199afc9..5e6bae6fae50 100644 --- a/fs/qnx6/Makefile +++ b/fs/qnx6/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6.o qnx6-objs := inode.o dir.o namei.o super_mmi.o +ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 15b7d92ed60d..8d64bb5366bf 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -77,21 +77,20 @@ static int qnx6_dir_longfilename(struct inode *inode, if (de->de_size != 0xff) { /* error - long filename entries always have size 0xff in direntry */ - printk(KERN_ERR "qnx6: invalid direntry size (%i).\n", - de->de_size); + pr_err("invalid direntry size (%i).\n", de->de_size); return 0; } lf = qnx6_longname(s, de, &page); if (IS_ERR(lf)) { - printk(KERN_ERR "qnx6:Error reading longname\n"); + pr_err("Error reading longname\n"); return 0; } lf_size = fs16_to_cpu(sbi, lf->lf_size); if (lf_size > QNX6_LONG_NAME_MAX) { - QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname)); - printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size); + pr_debug("file %s\n", lf->lf_fname); + pr_err("Filename too long (%i)\n", lf_size); qnx6_put_page(page); return 0; } @@ -100,10 +99,10 @@ static int qnx6_dir_longfilename(struct inode *inode, mmi 3g filesystem does not have that checksum */ if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != qnx6_lfile_checksum(lf->lf_fname, lf_size)) - printk(KERN_INFO "qnx6: long filename checksum error.\n"); + pr_info("long filename checksum error.\n"); - QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", - lf_size, lf->lf_fname, de_inode)); + pr_debug("qnx6_readdir:%.*s inode:%u\n", + lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { qnx6_put_page(page); return 0; @@ -136,7 +135,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) int i = start; if (IS_ERR(page)) { - printk(KERN_ERR "qnx6_readdir: read failed\n"); + pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; return PTR_ERR(page); } @@ -159,9 +158,9 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) break; } } else { - QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" - " inode:%u\n", size, de->de_fname, - no_inode)); + pr_debug("%s():%.*s inode:%u\n", + __func__, size, de->de_fname, + no_inode); if (!dir_emit(ctx, de->de_fname, size, no_inode, DT_UNKNOWN)) { done = true; @@ -259,8 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, if (ino) goto found; } else - printk(KERN_ERR "qnx6: undefined " - "filename size in inode.\n"); + pr_err("undefined filename size in inode.\n"); } qnx6_put_page(page); } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 65cdaab3ed49..44e73923670d 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -73,8 +73,8 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, { unsigned phys; - QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n", - inode->i_ino, (unsigned long)iblock)); + pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", + inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); if (phys) { @@ -87,7 +87,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, static int qnx6_check_blockptr(__fs32 ptr) { if (ptr == ~(__fs32)0) { - printk(KERN_ERR "qnx6: hit unused blockpointer.\n"); + pr_err("hit unused blockpointer.\n"); return 0; } return 1; @@ -127,8 +127,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no) levelptr = no >> bitdelta; if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { - printk(KERN_ERR "qnx6:Requested file block number (%u) too big.", - no); + pr_err("Requested file block number (%u) too big.", no); return 0; } @@ -137,8 +136,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no) for (i = 0; i < depth; i++) { bh = sb_bread(s, block); if (!bh) { - printk(KERN_ERR "qnx6:Error reading block (%u)\n", - block); + pr_err("Error reading block (%u)\n", block); return 0; } bitdelta -= ptrbits; @@ -207,26 +205,16 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) { struct qnx6_sb_info *sbi = QNX6_SB(s); - QNX6DEBUG((KERN_INFO "magic: %08x\n", - fs32_to_cpu(sbi, sb->sb_magic))); - QNX6DEBUG((KERN_INFO "checksum: %08x\n", - fs32_to_cpu(sbi, sb->sb_checksum))); - QNX6DEBUG((KERN_INFO "serial: %llx\n", - fs64_to_cpu(sbi, sb->sb_serial))); - QNX6DEBUG((KERN_INFO "flags: %08x\n", - fs32_to_cpu(sbi, sb->sb_flags))); - QNX6DEBUG((KERN_INFO "blocksize: %08x\n", - fs32_to_cpu(sbi, sb->sb_blocksize))); - QNX6DEBUG((KERN_INFO "num_inodes: %08x\n", - fs32_to_cpu(sbi, sb->sb_num_inodes))); - QNX6DEBUG((KERN_INFO "free_inodes: %08x\n", - fs32_to_cpu(sbi, sb->sb_free_inodes))); - QNX6DEBUG((KERN_INFO "num_blocks: %08x\n", - fs32_to_cpu(sbi, sb->sb_num_blocks))); - QNX6DEBUG((KERN_INFO "free_blocks: %08x\n", - fs32_to_cpu(sbi, sb->sb_free_blocks))); - QNX6DEBUG((KERN_INFO "inode_levels: %02x\n", - sb->Inode.levels)); + pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); + pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); + pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); + pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); + pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize)); + pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); + pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); + pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); + pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); + pr_debug("inode_levels: %02x\n", sb->Inode.levels); } #endif @@ -277,7 +265,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, start with the first superblock */ bh = sb_bread(s, offset); if (!bh) { - printk(KERN_ERR "qnx6: unable to read the first superblock\n"); + pr_err("unable to read the first superblock\n"); return NULL; } sb = (struct qnx6_super_block *)bh->b_data; @@ -285,20 +273,16 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, sbi->s_bytesex = BYTESEX_BE; if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ - QNX6DEBUG((KERN_INFO "qnx6: fs got different" - " endianness.\n")); + pr_debug("fs got different endianness.\n"); return bh; } else sbi->s_bytesex = BYTESEX_LE; if (!silent) { if (offset == 0) { - printk(KERN_ERR "qnx6: wrong signature (magic)" - " in superblock #1.\n"); + pr_err("wrong signature (magic) in superblock #1.\n"); } else { - printk(KERN_INFO "qnx6: wrong signature (magic)" - " at position (0x%lx) - will try" - " alternative position (0x0000).\n", - offset * s->s_blocksize); + pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", + offset * s->s_blocksize); } } brelse(bh); @@ -329,13 +313,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* Superblock always is 512 Byte long */ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto outnobh; } /* parse the mount-options */ if (!qnx6_parse_options((char *) data, s)) { - printk(KERN_ERR "qnx6: invalid mount options.\n"); + pr_err("invalid mount options.\n"); goto outnobh; } if (test_opt(s, MMI_FS)) { @@ -355,7 +339,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* try again without bootblock offset */ bh1 = qnx6_check_first_superblock(s, 0, silent); if (!bh1) { - printk(KERN_ERR "qnx6: unable to read the first superblock\n"); + pr_err("unable to read the first superblock\n"); goto outnobh; } /* seems that no bootblock at partition start */ @@ -370,13 +354,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ @@ -398,21 +382,20 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* next the second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { - printk(KERN_ERR "qnx6: unable to read the second superblock\n"); + pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) - printk(KERN_ERR "qnx6: wrong signature (magic)" - " in superblock #2.\n"); + pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #2 checksum error\n"); + pr_err("superblock #2 checksum error\n"); goto out; } @@ -422,25 +405,24 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); - printk(KERN_INFO "qnx6: superblock #1 active\n"); + pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); - printk(KERN_INFO "qnx6: superblock #2 active\n"); + pr_info("superblock #2 active\n"); } mmi_success: /* sanity check - limit maximum indirect pointer levels */ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { - printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n", - QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); + pr_err("too many inode levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); goto out; } if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { - printk(KERN_ERR "qnx6: too many longfilename levels" - " (max %i, sb %i)\n", - QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); + pr_err("too many longfilename levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); goto out; } s->s_op = &qnx6_sops; @@ -460,7 +442,7 @@ mmi_success: /* prefetch root inode */ root = qnx6_iget(s, QNX6_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "qnx6: get inode failed\n"); + pr_err("get inode failed\n"); ret = PTR_ERR(root); goto out2; } @@ -474,7 +456,7 @@ mmi_success: errmsg = qnx6_checkroot(s); if (errmsg != NULL) { if (!silent) - printk(KERN_ERR "qnx6: %s\n", errmsg); + pr_err("%s\n", errmsg); goto out3; } return 0; @@ -555,8 +537,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mode = 0; if (ino == 0) { - printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is " - "out of range\n", + pr_err("bad inode number on dev %s: %u is out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); @@ -566,8 +547,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) mapping = sbi->inodes->i_mapping; page = read_mapping_page(mapping, n, NULL); if (IS_ERR(page)) { - printk(KERN_ERR "qnx6: major problem: unable to read inode from " - "dev %s\n", sb->s_id); + pr_err("major problem: unable to read inode from dev %s\n", + sb->s_id); iget_failed(inode); return ERR_CAST(page); } @@ -689,7 +670,7 @@ static int __init init_qnx6_fs(void) return err; } - printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n"); + pr_info("QNX6 filesystem 1.0.0 registered.\n"); return 0; } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 0561326a94f5..6c1a323137dd 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -29,12 +29,12 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, foundinode = qnx6_iget(dir->i_sb, ino); qnx6_put_page(page); if (IS_ERR(foundinode)) { - QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> " - " error %ld\n", PTR_ERR(foundinode))); + pr_debug("lookup->iget -> error %ld\n", + PTR_ERR(foundinode)); return ERR_CAST(foundinode); } } else { - QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name)); + pr_debug("%s(): not found %s\n", __func__, name); return NULL; } d_add(dentry, foundinode); diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index b00fcc960d37..d3fb2b698800 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -10,6 +10,12 @@ * */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/pagemap.h> @@ -19,12 +25,6 @@ typedef __u64 __bitwise __fs64; #include <linux/qnx6_fs.h> -#ifdef CONFIG_QNX6FS_DEBUG -#define QNX6DEBUG(X) printk X -#else -#define QNX6DEBUG(X) (void) 0 -#endif - struct qnx6_sb_info { struct buffer_head *sb_buf; /* superblock buffer */ struct qnx6_super_block *sb; /* our superblock */ diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c index 29c32cba62d6..62aaf3e3126a 100644 --- a/fs/qnx6/super_mmi.c +++ b/fs/qnx6/super_mmi.c @@ -44,15 +44,14 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) start with the first superblock */ bh1 = sb_bread(s, 0); if (!bh1) { - printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n"); + pr_err("Unable to read first mmi superblock\n"); return NULL; } sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; sbi = QNX6_SB(s); if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) { - printk(KERN_ERR "qnx6: wrong signature (magic) in" - " superblock #1.\n"); + pr_err("wrong signature (magic) in superblock #1.\n"); goto out; } } @@ -60,7 +59,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } @@ -70,7 +69,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ @@ -83,27 +82,26 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* read second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { - printk(KERN_ERR "qnx6: unable to read the second superblock\n"); + pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) - printk(KERN_ERR "qnx6: wrong signature (magic) in" - " superblock #2.\n"); + pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); if (!qsb) { - printk(KERN_ERR "qnx6: unable to allocate memory.\n"); + pr_err("unable to allocate memory.\n"); goto out; } @@ -119,7 +117,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); - printk(KERN_INFO "qnx6: superblock #1 active\n"); + pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ qnx6_mmi_copy_sb(qsb, sb2); @@ -131,7 +129,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); - printk(KERN_INFO "qnx6: superblock #2 active\n"); + pr_info("superblock #2 active\n"); } kfree(qsb); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63715c0..f2d0eee9d1f1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -96,13 +96,16 @@ * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock * - * Any operation working on dquots via inode pointers must hold dqptr_sem. If - * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock. + * Operation accessing dquots via inode pointers are protected by dquot_srcu. + * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and + * synchronize_srcu(&dquot_srcu) is called after clearing pointers from + * inode and before dropping dquot references to avoid use of dquots after + * they are freed. dq_data_lock is used to serialize the pointer setting and + * clearing operations. * Special care needs to be taken about S_NOQUOTA inode flag (marking that * inode is a quota file). Functions adding pointers from inode to dquots have - * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they - * have to do all pointer modifications before dropping dqptr_sem. This makes + * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dq_data_lock. This makes * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * @@ -116,21 +119,15 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > - * dqio_mutex + * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. - * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > - * dqptr_sem. But filesystem has to count with the fact that functions such as - * dquot_alloc_space() acquire dqptr_sem and they usually have to be called - * from inside a transaction to keep filesystem consistency after a crash. Also - * filesystems usually want to do some IO on dquot from ->mark_dirty which is - * called with dqptr_sem held. */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); +DEFINE_STATIC_SRCU(dquot_srcu); void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) @@ -702,6 +699,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -713,6 +711,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; } @@ -731,7 +730,6 @@ static struct shrinker dqcache_shrinker = { /* * Put reference to dquot - * NOTE: If you change this function please check whether dqput_blocks() works right... */ void dqput(struct dquot *dquot) { @@ -961,46 +959,33 @@ static void add_dquot_ref(struct super_block *sb, int type) } /* - * Return 0 if dqput() won't block. - * (note that 1 doesn't necessarily mean blocking) - */ -static inline int dqput_blocks(struct dquot *dquot) -{ - if (atomic_read(&dquot->dq_count) <= 1) - return 1; - return 0; -} - -/* * Remove references to dquots from inode and add dquot to list for freeing * if we have the last reference to dquot - * We can't race with anybody because we hold dqptr_sem for writing... */ -static int remove_inode_dquot_ref(struct inode *inode, int type, - struct list_head *tofree_head) +static void remove_inode_dquot_ref(struct inode *inode, int type, + struct list_head *tofree_head) { struct dquot *dquot = inode->i_dquot[type]; inode->i_dquot[type] = NULL; - if (dquot) { - if (dqput_blocks(dquot)) { -#ifdef CONFIG_QUOTA_DEBUG - if (atomic_read(&dquot->dq_count) != 1) - quota_error(inode->i_sb, "Adding dquot with " - "dq_count %d to dispose list", - atomic_read(&dquot->dq_count)); -#endif - spin_lock(&dq_list_lock); - /* As dquot must have currently users it can't be on - * the free list... */ - list_add(&dquot->dq_free, tofree_head); - spin_unlock(&dq_list_lock); - return 1; - } - else - dqput(dquot); /* We have guaranteed we won't block */ + if (!dquot) + return; + + if (list_empty(&dquot->dq_free)) { + /* + * The inode still has reference to dquot so it can't be in the + * free list + */ + spin_lock(&dq_list_lock); + list_add(&dquot->dq_free, tofree_head); + spin_unlock(&dq_list_lock); + } else { + /* + * Dquot is already in a list to put so we won't drop the last + * reference here. + */ + dqput(dquot); } - return 0; } /* @@ -1035,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type, * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking - * (dqptr_sem). + * (dq_data_lock). */ + spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; remove_inode_dquot_ref(inode, type, tofree_head); } + spin_unlock(&dq_data_lock); } spin_unlock(&inode_sb_list_lock); #ifdef CONFIG_QUOTA_DEBUG @@ -1059,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type) LIST_HEAD(tofree_head); if (sb->dq_op) { - down_write(&sb_dqopt(sb)->dqptr_sem); remove_dquot_ref(sb, type, &tofree_head); - up_write(&sb_dqopt(sb)->dqptr_sem); + synchronize_srcu(&dquot_srcu); put_dquot_list(&tofree_head); } } @@ -1392,21 +1378,16 @@ static int dquot_active(const struct inode *inode) /* * Initialize quota pointers in inode * - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. - * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static void __dquot_initialize(struct inode *inode, int type) { - int cnt; + int cnt, init_needed = 0; struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; @@ -1416,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type) got[cnt] = NULL; if (type != -1 && cnt != type) continue; + /* + * The i_dquot should have been initialized in most cases, + * we check it without locking here to avoid unnecessary + * dqget()/dqput() calls. + */ + if (inode->i_dquot[cnt]) + continue; + init_needed = 1; + switch (cnt) { case USRQUOTA: qid = make_kqid_uid(inode->i_uid); @@ -1427,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type) got[cnt] = dqget(sb, qid); } - down_write(&sb_dqopt(sb)->dqptr_sem); + /* All required i_dquot has been initialized */ + if (!init_needed) + return; + + spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1447,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) { - spin_lock(&dq_data_lock); + if (unlikely(rsv)) dquot_resv_space(inode->i_dquot[cnt], rsv); - spin_unlock(&dq_data_lock); - } } } out_err: - up_write(&sb_dqopt(sb)->dqptr_sem); + spin_unlock(&dq_data_lock); /* Drop unused references */ dqput_all(got); } @@ -1467,19 +1458,24 @@ void dquot_initialize(struct inode *inode) EXPORT_SYMBOL(dquot_initialize); /* - * Release all quotas referenced by inode + * Release all quotas referenced by inode. + * + * This function only be called on inode free or converting + * a file to quota file, no other users for the i_dquot in + * both cases, so we needn't call synchronize_srcu() after + * clearing i_dquot. */ static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = inode->i_dquot[cnt]; inode->i_dquot[cnt] = NULL; } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&dq_data_lock); dqput_all(put); } @@ -1597,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) */ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { - int cnt, ret = 0; + int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); goto out; @@ -1614,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) @@ -1641,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) goto out_flush_warn; mark_all_dquot_dirty(dquots); out_flush_warn: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); out: return ret; @@ -1653,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space); */ int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = 0; + int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots = inode->i_dquot; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) @@ -1683,7 +1674,7 @@ warn_put_all: spin_unlock(&dq_data_lock); if (ret == 0) mark_all_dquot_dirty(dquots); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; } @@ -1694,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode); */ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { - int cnt; + int cnt, index; if (!dquot_active(inode)) { inode_claim_rsv_space(inode, number); return 0; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1713,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); return 0; } EXPORT_SYMBOL(dquot_claim_space_nodirty); @@ -1723,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { - int cnt; + int cnt, index; if (!dquot_active(inode)) { inode_reclaim_rsv_space(inode, number); return; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1742,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) inode_reclaim_rsv_space(inode, number); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); return; } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); @@ -1755,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot **dquots = inode->i_dquot; - int reserve = flags & DQUOT_SPACE_RESERVE; + int reserve = flags & DQUOT_SPACE_RESERVE, index; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) { inode_decr_space(inode, number, reserve); return; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; @@ -1787,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) goto out_unlock; mark_all_dquot_dirty(dquots); out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); @@ -1800,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode) unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots = inode->i_dquot; + int index; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; @@ -1821,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode) } spin_unlock(&dq_data_lock); mark_all_dquot_dirty(dquots); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); @@ -1835,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. * + * We are holding reference on transfer_from & transfer_to, no need to + * protect them by srcu_read_lock(). */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { @@ -1847,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; /* Initialize the arrays */ @@ -1857,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&dq_data_lock); return 0; } - spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; @@ -1916,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) inode->i_dquot[cnt] = transfer_to[cnt]; } spin_unlock(&dq_data_lock); - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); @@ -1930,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) return 0; over_quota: spin_unlock(&dq_data_lock); - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(warn_to); return ret; } diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c index 2f97b0e2c501..ebc5e6285800 100644 --- a/fs/quota/kqid.c +++ b/fs/quota/kqid.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt); /** * from_kqid - Create a qid from a kqid user-namespace pair. * @targ: The user namespace we want a qid in. - * @kuid: The kernel internal quota identifier to start with. + * @kqid: The kernel internal quota identifier to start with. * * Map @kqid into the user-namespace specified by @targ and * return the resulting qid. diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 72d29177998e..bb2869f5dfd8 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = { /** * quota_send_warning - Send warning to userspace about exceeded quota - * @type: The quota type: USRQQUOTA, GRPQUOTA,... - * @id: The user or group id of the quota that was exceeded + * @qid: The kernel internal quota identifier. * @dev: The device on which the fs is mounted (sb->s_dev) * @warntype: The type of the warning: QUOTA_NL_... * diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ff3f0b3cfdb3..75621649dbd7 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; - down_read(&sb_dqopt(sb)->dqptr_sem); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return -ESRCH; } fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index dda012ad4208..bbafbde3471a 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -222,7 +222,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, /* gang-find the pages */ ret = -ENOMEM; - pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto out_free; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index d9f5a60dd59b..0a7dc941aaf4 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -9,7 +9,7 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> extern const struct reiserfs_key MIN_KEY; diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 54fdf196bfb2..9c02d96d3a42 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -10,7 +10,7 @@ * and using buffers obtained after all above. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/time.h> #include "reiserfs.h" #include <linux/buffer_head.h> @@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) return 0; } -static void balance_leaf_insert_left(struct tree_balance *tb, - struct item_head *ih, const char *body) +static unsigned int balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *const ih, + const char * const body) { int ret; struct buffer_info bi; int n = B_NR_ITEMS(tb->L[0]); + unsigned body_shift_bytes = 0; if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ @@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb, put_ih_item_len(ih, new_item_len); if (tb->lbytes > tb->zeroes_num) { - body += (tb->lbytes - tb->zeroes_num); + body_shift_bytes = tb->lbytes - tb->zeroes_num; tb->zeroes_num = 0; } else tb->zeroes_num -= tb->lbytes; @@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb, tb->insert_size[0] = 0; tb->zeroes_num = 0; } + return body_shift_bytes; } static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { int n = B_NR_ITEMS(tb->L[0]); struct buffer_info bi; @@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, tb->pos_in_item -= tb->lbytes; } -static void balance_leaf_paste_left_shift(struct tree_balance *tb, - struct item_head *ih, - const char *body) +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tb->L[0]); struct buffer_info bi; + int body_shift_bytes = 0; if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { balance_leaf_paste_left_shift_dirent(tb, ih, body); - return; + return 0; } RFALSE(tb->lbytes <= 0, @@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb, * insert_size[0] */ if (l_n > tb->zeroes_num) { - body += (l_n - tb->zeroes_num); + body_shift_bytes = l_n - tb->zeroes_num; tb->zeroes_num = 0; } else tb->zeroes_num -= l_n; @@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb, */ leaf_shift_left(tb, tb->lnum[0], tb->lbytes); } + return body_shift_bytes; } /* appended item will be in L[0] in whole */ static void balance_leaf_paste_left_whole(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tb->L[0]); @@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb, tb->zeroes_num = 0; } -static void balance_leaf_paste_left(struct tree_balance *tb, - struct item_head *ih, const char *body) +static unsigned int balance_leaf_paste_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) { /* we must shift the part of the appended item */ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) - balance_leaf_paste_left_shift(tb, ih, body); + return balance_leaf_paste_left_shift(tb, ih, body); else balance_leaf_paste_left_whole(tb, ih, body); + return 0; } /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ -static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) +static unsigned int balance_leaf_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) { if (tb->lnum[0] <= 0) - return; + return 0; /* new item or it part falls to L[0], shift it too */ if (tb->item_pos < tb->lnum[0]) { BUG_ON(flag != M_INSERT && flag != M_PASTE); if (flag == M_INSERT) - balance_leaf_insert_left(tb, ih, body); + return balance_leaf_insert_left(tb, ih, body); else /* M_PASTE */ - balance_leaf_paste_left(tb, ih, body); + return balance_leaf_paste_left(tb, ih, body); } else /* new item doesn't fall into L[0] */ leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + return 0; } static void balance_leaf_insert_right(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); @@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb, static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, } static void balance_leaf_paste_right_shift(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n_shift, n_rem, r_zeroes_number, version; @@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb, } static void balance_leaf_paste_right_whole(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); @@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb, } static void balance_leaf_paste_right(struct tree_balance *tb, - struct item_head *ih, const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); @@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb, } /* shift rnum[0] items from S[0] to the right neighbor R[0] */ -static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) +static void balance_leaf_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) { if (tb->rnum[0] <= 0) return; @@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, } static void balance_leaf_new_nodes_insert(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, /* we append to directory item */ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, } static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, } static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, } static void balance_leaf_new_nodes_paste(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int i) @@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb, /* Fill new nodes that appear in place of S[0] */ static void balance_leaf_new_nodes(struct tree_balance *tb, - struct item_head *ih, - const char *body, + struct item_head * const ih, + const char * const body, struct item_head *insert_key, struct buffer_head **insert_ptr, int flag) @@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb, } static void balance_leaf_finish_node_insert(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb, } static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct item_head *pasted = item_head(tbS0, tb->item_pos); @@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, } static void balance_leaf_finish_node_paste(struct tree_balance *tb, - struct item_head *ih, - const char *body) + struct item_head * const ih, + const char * const body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); struct buffer_info bi; @@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb, * of the affected item which remains in S */ static void balance_leaf_finish_node(struct tree_balance *tb, - struct item_head *ih, - const char *body, int flag) + struct item_head * const ih, + const char * const body, int flag) { /* if we must insert or append into buffer S[0] */ if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { @@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) tb->pos_in_item *= UNFM_P_SIZE; - balance_leaf_left(tb, ih, body, flag); + body += balance_leaf_left(tb, ih, body, flag); /* tb->lnum[0] > 0 */ /* Calculate new item position */ diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index db9e80ba53a0..751dd3f4346b 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -6,7 +6,7 @@ #include "reiserfs.h" #include "acl.h" #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/writeback.h> diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index 73231b1ebdbe..b751eea32e20 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -2,7 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 63b2b0ec49e6..a7eec9888f10 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -11,7 +11,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/buffer_head.h> #include <linux/mpage.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 501ed6811a2b..6ec8a30a0911 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -7,7 +7,7 @@ #include <linux/mount.h> #include "reiserfs.h" #include <linux/time.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/compat.h> diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index cfaee912ee09..aca73dd73906 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -54,7 +54,7 @@ static void sd_print_item(struct item_head *ih, char *item) } else { struct stat_data *sd = (struct stat_data *)item; - printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), + printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); } @@ -408,7 +408,7 @@ static void direntry_print_item(struct item_head *ih, char *item) namebuf[namelen + 2] = 0; } - printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", i, namebuf, deh_dir_id(deh), deh_objectid(deh), GET_HASH_VALUE(deh_offset(deh)), diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e8870de4627e..a88b1b3e7db3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1947,8 +1947,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, } } - /* wait for all commits to finish */ - cancel_delayed_work(&SB_JOURNAL(sb)->j_work); /* * We must release the write lock here because @@ -1956,8 +1954,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, */ reiserfs_write_unlock(sb); + /* + * Cancel flushing of old commits. Note that neither of these works + * will be requeued because superblock is being shutdown and doesn't + * have MS_ACTIVE set. + */ cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); - flush_workqueue(REISERFS_SB(sb)->commit_wq); + /* wait for all commits to finish */ + cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); free_journal_ram(sb); @@ -4292,9 +4296,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) if (flush) { flush_commit_list(sb, jl, 1); flush_journal_list(sb, jl, 1); - } else if (!(jl->j_state & LIST_COMMIT_PENDING)) - queue_delayed_work(REISERFS_SB(sb)->commit_wq, - &journal->j_work, HZ / 10); + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) { + /* + * Avoid queueing work when sb is being shut down. Transaction + * will be flushed on journal shutdown. + */ + if (sb->s_flags & MS_ACTIVE) + queue_delayed_work(REISERFS_SB(sb)->commit_wq, + &journal->j_work, HZ / 10); + } /* * if the next transaction has any chance of wrapping, flush diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index d6744c8b24e1..249594a821e0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -2,7 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" @@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, /* insert item into the leaf node in position before */ void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head *inserted_item_ih, - const char *inserted_item_body, int zeros_number) + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number) { struct buffer_head *bh = bi->bi_bh; int nr, free_space; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index c9b47e91baf8..ae1dc841db3a 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -17,7 +17,7 @@ static char off_buf[80]; static char *reiserfs_cpu_offset(struct cpu_key *key) { if (cpu_key_k_type(key) == TYPE_DIRENTRY) - sprintf(off_buf, "%Lu(%Lu)", + sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(cpu_key_k_offset(key)), (unsigned long long) @@ -34,7 +34,7 @@ static char *le_offset(struct reiserfs_key *key) version = le_key_version(key); if (le_key_k_type(version, key) == TYPE_DIRENTRY) - sprintf(off_buf, "%Lu(%Lu)", + sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(le_key_k_offset(version, key)), (unsigned long long) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 02b0b7d0f7d5..621b9f381fe1 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -11,7 +11,7 @@ #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "reiserfs.h" #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index bf53888c7f59..735c2c2b4536 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3216,11 +3216,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes); void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, int del_num, int del_bytes); void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head *inserted_item_ih, - const char *inserted_item_body, int zeros_number); -void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, - int pos_in_item, int paste_size, const char *body, + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, int zeros_number); +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, + int pos_in_item, int paste_size, + const char * const body, int zeros_number); void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, int pos_in_item, int cut_size); void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index dd44468edc2b..24cbe013240f 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2006,7 +2006,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, &s_search_path) == POSITION_FOUND); RFALSE(file_size > ROUND_UP(new_file_size), - "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d", new_file_size, file_size, s_item_key.on_disk_key.k_objectid); update_and_out: diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a392cef6acc6..d46e88a33b02 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -15,7 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/time.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "reiserfs.h" #include "acl.h" #include "xattr.h" @@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s) struct reiserfs_sb_info *sbi = REISERFS_SB(s); unsigned long delay; - if (s->s_flags & MS_RDONLY) + /* + * Avoid scheduling flush when sb is being shut down. It can race + * with journal shutdown and free still queued delayed work. + */ + if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -331,7 +335,7 @@ static int finish_unfinished(struct super_block *s) * not completed truncate found. New size was * committed together with "save" link */ - reiserfs_info(s, "Truncating %k to %Ld ..", + reiserfs_info(s, "Truncating %k to %lld ..", INODE_PKEY(inode), inode->i_size); /* don't update modification time */ @@ -1577,7 +1581,7 @@ static int read_super_block(struct super_block *s, int offset) rs = (struct reiserfs_super_block *)bh->b_data; if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " - "filesystem on (dev %s, block %Lu, size %lu)", + "filesystem on (dev %s, block %llu, size %lu)", s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); @@ -2441,8 +2445,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh, *bh; if (!current->journal_info) { - printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); return -EIO; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ca416d099e7d..7c36898af402 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -45,7 +45,7 @@ #include <linux/xattr.h> #include "xattr.h" #include "acl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/checksum.h> #include <linux/stat.h> #include <linux/quotaops.h> @@ -84,6 +84,7 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); @@ -98,6 +99,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); @@ -117,6 +119,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) { struct dentry *privroot = REISERFS_SB(sb)->priv_root; struct dentry *xaroot; + if (!privroot->d_inode) return ERR_PTR(-ENODATA); @@ -127,6 +130,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) xaroot = ERR_PTR(-ENODATA); else if (!xaroot->d_inode) { int err = -ENODATA; + if (xattr_may_create(flags)) err = xattr_mkdir(privroot->d_inode, xaroot, 0700); if (err) { @@ -157,6 +161,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && !xadir->d_inode) { int err = -ENODATA; + if (xattr_may_create(flags)) err = xattr_mkdir(xaroot->d_inode, xadir, 0700); if (err) { @@ -188,6 +193,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, { struct reiserfs_dentry_buf *dbuf = buf; struct dentry *dentry; + WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) @@ -218,6 +224,7 @@ static void cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) { int i; + for (i = 0; i < buf->count; i++) if (buf->dentries[i]) dput(buf->dentries[i]); @@ -283,11 +290,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); err = journal_begin(&th, inode->i_sb, blocks); reiserfs_write_unlock(inode->i_sb); if (!err) { int jerror; + mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, I_MUTEX_XATTR); err = action(dir, data); @@ -340,6 +349,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) int reiserfs_delete_xattrs(struct inode *inode) { int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + if (err) reiserfs_warning(inode->i_sb, "jdm-20004", "Couldn't delete all xattrs (%d)\n", err); @@ -350,6 +360,7 @@ int reiserfs_delete_xattrs(struct inode *inode) int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) { int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + if (err) reiserfs_warning(inode->i_sb, "jdm-20007", "Couldn't chown all xattrs (%d)\n", err); @@ -439,6 +450,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec now = current_fs_time(inode->i_sb); + if (inode_unhashed(inode) || !inode->i_nlink || timespec_equal(&inode->i_ctime, &now)) return; @@ -514,6 +526,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, size_t chunk; size_t skip = 0; size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else @@ -530,6 +543,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (file_pos == 0) { struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof(struct reiserfs_xattr_header); if (chunk + skip > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE - skip; @@ -659,6 +673,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, size_t chunk; char *data; size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else @@ -792,6 +807,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, int reiserfs_removexattr(struct dentry *dentry, const char *name) { const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) @@ -813,9 +829,11 @@ static int listxattr_filler(void *buf, const char *name, int namelen, { struct listxattr_buf *b = (struct listxattr_buf *)buf; size_t size; + if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ @@ -885,6 +903,7 @@ static int create_privroot(struct dentry *dentry) { int err; struct inode *inode = dentry->d_parent->d_inode; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); @@ -1015,6 +1034,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) mutex_lock(&privroot->d_inode->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); if (!IS_ERR(dentry)) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 44503e293790..4b34b9dc03dd 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -9,7 +9,7 @@ #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 800a3cef6f62..e7f8939a4cb5 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -6,7 +6,7 @@ #include <linux/slab.h> #include "xattr.h" #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a0035719f66b..5eeb0c48ba46 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -5,7 +5,7 @@ #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 8667491ae7c3..e50eab046471 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -4,7 +4,7 @@ #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ef90e8bca95a..e98dd88197d5 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -56,6 +56,8 @@ * 2 of the Licence, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> @@ -380,7 +382,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) eio: ret = -EIO; error: - printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + pr_err("read error for inode 0x%lx\n", pos); return ERR_PTR(ret); } @@ -390,6 +392,7 @@ error: static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); return inode ? &inode->vfs_inode : NULL; } @@ -400,6 +403,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) static void romfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } @@ -507,15 +511,13 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || img_size < ROMFH_SIZE) { if (!silent) - printk(KERN_WARNING "VFS:" - " Can't find a romfs filesystem on dev %s.\n", + pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n", sb->s_id); goto error_rsb_inval; } if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { - printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", - sb->s_id); + pr_err("bad initial checksum on dev %s.\n", sb->s_id); goto error_rsb_inval; } @@ -523,8 +525,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) len = strnlen(rsb->name, ROMFS_MAXFN); if (!silent) - printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", - (unsigned) len, (unsigned) len, rsb->name, storage); + pr_notice("Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); kfree(rsb); rsb = NULL; @@ -614,7 +616,7 @@ static int __init init_romfs_fs(void) { int ret; - printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); romfs_inode_cachep = kmem_cache_create("romfs_i", @@ -623,13 +625,12 @@ static int __init init_romfs_fs(void) romfs_i_init_once); if (!romfs_inode_cachep) { - printk(KERN_ERR - "ROMFS error: Failed to initialise inode cache\n"); + pr_err("Failed to initialise inode cache\n"); return -ENOMEM; } ret = register_filesystem(&romfs_fs_type); if (ret) { - printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + pr_err("Failed to register filesystem\n"); goto error_register; } return 0; diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb108d2..3857b720cb1b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> +#include <linux/mm.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 62a0de6632e1..43e7a7eddac0 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) pages = end_index - start_index + 1; - page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); if (page == NULL) return res; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 031c8d67fd51..5056babe00df 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -27,6 +27,8 @@ * the filesystem. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " - "Phillip Lougher\n"); + pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } diff --git a/fs/super.c b/fs/super.c index d20d5b11dedf..b9a214d2fe98 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,7 +22,6 @@ #include <linux/export.h> #include <linux/slab.h> -#include <linux/acct.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> @@ -218,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; @@ -702,12 +700,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return -EACCES; #endif - if (flags & MS_RDONLY) - acct_auto_close(sb); - shrink_dcache_sb(sb); - remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + if (remount_ro) { + if (sb->s_pins.first) { + up_write(&sb->s_umount); + sb_pin_kill(sb); + down_write(&sb->s_umount); + if (!sb->s_root) + return 0; + if (sb->s_writers.frozen != SB_UNFROZEN) + return -EBUSY; + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + } + } + shrink_dcache_sb(sb); + /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if (remount_ro) { diff --git a/fs/timerfd.c b/fs/timerfd.c index 0013142c0475..80c350216ea8 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -35,8 +35,9 @@ struct timerfd_ctx { ktime_t moffs; wait_queue_head_t wqh; u64 ticks; - int expired; int clockid; + short unsigned expired; + short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; bool might_cancel; @@ -92,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, */ void timerfd_clock_was_set(void) { - ktime_t moffs = ktime_get_monotonic_offset(); + ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); struct timerfd_ctx *ctx; unsigned long flags; @@ -125,7 +126,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) return false; - ctx->moffs = ktime_get_monotonic_offset(); + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); return true; } @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, if (timerfd_canceled(ctx)) return -ECANCELED; } + + ctx->settime_flags = flags & TFD_SETTIME_FLAGS; return 0; } @@ -284,11 +287,77 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show(struct seq_file *m, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + struct itimerspec t; + + spin_lock_irq(&ctx->wqh.lock); + t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + t.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + + return seq_printf(m, + "clockid: %d\n" + "ticks: %llu\n" + "settime flags: 0%o\n" + "it_value: (%llu, %llu)\n" + "it_interval: (%llu, %llu)\n", + ctx->clockid, (unsigned long long)ctx->ticks, + ctx->settime_flags, + (unsigned long long)t.it_value.tv_sec, + (unsigned long long)t.it_value.tv_nsec, + (unsigned long long)t.it_interval.tv_sec, + (unsigned long long)t.it_interval.tv_nsec); +} +#else +#define timerfd_show NULL +#endif + +#ifdef CONFIG_CHECKPOINT_RESTORE +static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct timerfd_ctx *ctx = file->private_data; + int ret = 0; + + switch (cmd) { + case TFD_IOC_SET_TICKS: { + u64 ticks; + + if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) + return -EFAULT; + if (!ticks) + return -EINVAL; + + spin_lock_irq(&ctx->wqh.lock); + if (!timerfd_canceled(ctx)) { + ctx->ticks = ticks; + if (ticks) + wake_up_locked(&ctx->wqh); + } else + ret = -ECANCELED; + spin_unlock_irq(&ctx->wqh.lock); + break; + } + default: + ret = -ENOTTY; + break; + } + + return ret; +} +#else +#define timerfd_ioctl NULL +#endif + static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, + .show_fdinfo = timerfd_show, + .unlocked_ioctl = timerfd_ioctl, }; static int timerfd_fget(int fd, struct fd *p) @@ -336,7 +405,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); - ctx->moffs = ktime_get_monotonic_offset(); + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index ff8229340cd5..aa13ad053b14 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -174,7 +174,6 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; - mutex_lock(&c->mst_mutex); c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); @@ -204,7 +203,6 @@ static int do_commit(struct ubifs_info *c) else c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); err = ubifs_write_master(c); - mutex_unlock(&c->mst_mutex); if (err) goto out; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 2290d5866725..fb08b0c514b6 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) /** * wbuf_timer_callback - write-buffer timer callback function. - * @data: timer data (write-buffer descriptor) + * @timer: timer data (write-buffer descriptor) * * This function is called when the write-buffer timer expires. */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index a902c5919e42..a47ddfc9be6b 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -240,6 +240,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + ubifs_assert(c->lhead_lnum != c->ltail_lnum); c->lhead_offs = 0; } @@ -404,15 +405,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) /* Switch to the next log LEB */ if (c->lhead_offs) { c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + ubifs_assert(c->lhead_lnum != c->ltail_lnum); c->lhead_offs = 0; } - if (c->lhead_offs == 0) { - /* Must ensure next LEB has been unmapped */ - err = ubifs_leb_unmap(c, c->lhead_lnum); - if (err) - goto out; - } + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; len = ALIGN(len, c->min_io_size); dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index d46b19ec1815..421bd0a80424 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) return ERR_CAST(pnode); @@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) return ERR_CAST(pnode); @@ -1964,7 +1962,6 @@ again: } } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); - shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = scan_get_pnode(c, path + h, nnode, iip); if (IS_ERR(pnode)) { err = PTR_ERR(pnode); @@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, lprops->dirty); return -EINVAL; } + break; case LPROPS_FREEABLE: case LPROPS_FRDI_IDX: if (lprops->free + lprops->dirty != c->leb_size) { @@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, lprops->dirty); return -EINVAL; } + break; } } return 0; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 45d4e96a6bac..d9c02928e992 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c) ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); } - done_ltab = 1; c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; @@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c) if (err) return err; } - done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; dbg_chk_lpt_sz(c, 1, c->ltab_sz); @@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pr_err("LEB %d:%d, nnode, ", lnum, offs); err = ubifs_unpack_nnode(c, p, &nnode); + if (err) { + pr_err("failed to unpack_node, error %d\n", + err); + break; + } for (i = 0; i < UBIFS_LPT_FANOUT; i++) { pr_cont("%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index ab83ace9910a..1a4bb9e8b3b8 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c) * ubifs_write_master - write master node. * @c: UBIFS file-system description object * - * This function writes the master node. The caller has to take the - * @c->mst_mutex lock before calling this function. Returns zero in case of - * success and a negative error code in case of failure. The master node is - * written twice to enable recovery. + * This function writes the master node. Returns zero in case of success and a + * negative error code in case of failure. The master node is written twice to + * enable recovery. */ int ubifs_write_master(struct ubifs_info *c) { diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index f1c3e5a1b315..4409f486ecef 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic) int lnum; /* Unmap any unused LEBs after consolidation */ - lnum = c->ohead_lnum + 1; for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { err = ubifs_leb_unmap(c, lnum); if (err) diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index c14adb2f420c..c640938f62f0 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) * drop_last_node - drop the last node. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here - * @grouped: non-zero if whole group of nodes have to be dropped * * This is a helper function for 'ubifs_recover_leb()' which drops the last * node of the scanned LEB. @@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) * * This function does a scan of a LEB, but caters for errors that might have * been caused by the unclean unmount from which we are attempting to recover. - * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is - * found, and a negative error code in case of failure. + * Returns the scanned information on success and a negative error code on + * failure. */ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int jhead) diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 4c37607a958e..79c6dbbc0e04 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c) cs->ch.node_type = UBIFS_CS_NODE; err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); kfree(cs); + if (err) + return err; ubifs_msg("default file-system created"); return 0; @@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } - if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) { err = 13; goto failed; } diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 58aa05df2bb6..89adbc4d08ac 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, * @offs: offset to start at (usually zero) * @sbuf: scan buffer (must be c->leb_size) * - * This function returns %0 on success and a negative error code on failure. + * This function returns the scanned information on success and a negative error + * code on failure. */ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, int offs, void *sbuf) @@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, return ERR_PTR(err); } - if (err == -EBADMSG) - sleb->ecc = 1; - + /* + * Note, we ignore integrity errors (EBASMSG) because all the nodes are + * protected by CRC checksums. + */ return sleb; } @@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, * @sleb: scanning information * @lnum: logical eraseblock number * @offs: offset to start at (usually zero) - * - * This function returns %0 on success and a negative error code on failure. */ void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, int lnum, int offs) @@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, * @quiet: print no messages * * This function scans LEB number @lnum and returns complete information about - * its contents. Returns the scaned information in case of success and, + * its contents. Returns the scanned information in case of success and, * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case * of failure. * diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3904c8574ef9..106bf20629ce 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) return 1; } - if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { ubifs_err("unknown compression type %d", ui->compr_type); return 2; } @@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root) struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) - seq_printf(s, ",fast_unmount"); + seq_puts(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) - seq_printf(s, ",norm_unmount"); + seq_puts(s, ",norm_unmount"); if (c->mount_opts.bulk_read == 2) - seq_printf(s, ",bulk_read"); + seq_puts(s, ",bulk_read"); else if (c->mount_opts.bulk_read == 1) - seq_printf(s, ",no_bulk_read"); + seq_puts(s, ",no_bulk_read"); if (c->mount_opts.chk_data_crc == 2) - seq_printf(s, ",chk_data_crc"); + seq_puts(s, ",chk_data_crc"); else if (c->mount_opts.chk_data_crc == 1) - seq_printf(s, ",no_chk_data_crc"); + seq_puts(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { seq_printf(s, ",compr=%s", @@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c) { int i, err; - c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), - GFP_KERNEL); + c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead), + GFP_KERNEL); if (!c->jheads) return -ENOMEM; @@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) mutex_init(&c->lp_mutex); mutex_init(&c->tnc_mutex); mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 8a40cf9c02d7..6793db0754f6 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, goto out_unlock; if (err) { - err = -EINVAL; key = &from_key; goto out_dump; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 3600994f8411..7a205e046776 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -389,7 +389,6 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) ubifs_dump_lprops(c); } /* Try to commit anyway */ - err = 0; break; } p++; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c1f71fe17cc0..c4fe900c67ab 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -314,7 +314,6 @@ struct ubifs_scan_node { * @nodes_cnt: number of nodes scanned * @nodes: list of struct ubifs_scan_node * @endpt: end point (and therefore the start of empty space) - * @ecc: read returned -EBADMSG * @buf: buffer containing entire LEB scanned */ struct ubifs_scan_leb { @@ -322,7 +321,6 @@ struct ubifs_scan_leb { int nodes_cnt; struct list_head nodes; int endpt; - int ecc; void *buf; }; @@ -1051,7 +1049,6 @@ struct ubifs_debug_info; * * @mst_node: master node * @mst_offs: offset of valid master node - * @mst_mutex: protects the master node area, @mst_node, and @mst_offs * * @max_bu_buf_len: maximum bulk-read buffer length * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu @@ -1292,7 +1289,6 @@ struct ubifs_info { struct ubifs_mst_node *mst_node; int mst_offs; - struct mutex mst_mutex; int max_bu_buf_len; struct mutex bu_mutex; diff --git a/fs/udf/file.c b/fs/udf/file.c index d80738fdf424..86c6743ec1fe 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -27,7 +27,7 @@ #include "udfdecl.h" #include <linux/fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/string.h> /* memset */ #include <linux/capability.h> @@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file, return 0; } -static int udf_adinicb_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - char *kaddr; - struct udf_inode_info *iinfo = UDF_I(inode); - - kaddr = kmap_atomic(page); - memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, - kaddr + offset, copied); - kunmap_atomic(kaddr); - - return simple_write_end(file, mapping, pos, len, copied, page, fsdata); -} - static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) @@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = { .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, - .write_end = udf_adinicb_write_end, + .write_end = simple_write_end, .direct_IO = udf_adinicb_direct_IO, }; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 6583fe9b0645..6ad5a453af97 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -21,7 +21,7 @@ #include <linux/blkdev.h> #include <linux/cdrom.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "udf_sb.h" diff --git a/fs/udf/super.c b/fs/udf/super.c index 3286db047a40..813da94d447b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -63,7 +63,7 @@ #include "udf_i.h" #include <linux/init.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define VDS_POS_PRIMARY_VOL_DESC 0 #define VDS_POS_UNALLOC_SPACE_DESC 1 diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index d7c6dbe4194b..6fb7945c1e6e 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -20,7 +20,7 @@ */ #include "udfdecl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/time.h> diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 44b815e57f94..afd470e588ff 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int extIndex = 0, newExtIndex = 0, hasExt = 0; unsigned short valueCRC; uint8_t curr; - const uint8_t hexChar[] = "0123456789ABCDEF"; if (udfName[0] == '.' && (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { @@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, newIndex = 250; newName[newIndex++] = CRC_MARK; valueCRC = crc_itu_t(0, fidName, fidNameLen); - newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; - newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; - newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; - newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; + newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); + newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8); + newName[newIndex++] = hex_asc_upper_hi(valueCRC); + newName[newIndex++] = hex_asc_upper_lo(valueCRC); if (hasExt) { newName[newIndex++] = EXT_MARK; diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index dd39980437fc..4d0e02b022b3 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ namei.o super.o symlink.o truncate.o util.o +ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 61e8a9b021dd..7c580c97990e 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -158,16 +158,16 @@ out: /** * ufs_inode_getfrag() - allocate new fragment(s) - * @inode - pointer to inode - * @fragment - number of `fragment' which hold pointer + * @inode: pointer to inode + * @fragment: number of `fragment' which hold pointer * to new allocated fragment(s) - * @new_fragment - number of new allocated fragment(s) - * @required - how many fragment(s) we require - * @err - we set it if something wrong - * @phys - pointer to where we save physical number of new allocated fragments, + * @new_fragment: number of new allocated fragment(s) + * @required: how many fragment(s) we require + * @err: we set it if something wrong + * @phys: pointer to where we save physical number of new allocated fragments, * NULL if we allocate not data(indirect blocks for example). - * @new - we set it if we allocate new block - * @locked_page - for ufs_new_fragments() + * @new: we set it if we allocate new block + * @locked_page: for ufs_new_fragments() */ static struct buffer_head * ufs_inode_getfrag(struct inode *inode, u64 fragment, @@ -315,16 +315,16 @@ repeat2: /** * ufs_inode_getblock() - allocate new block - * @inode - pointer to inode - * @bh - pointer to block which hold "pointer" to new allocated block - * @fragment - number of `fragment' which hold pointer + * @inode: pointer to inode + * @bh: pointer to block which hold "pointer" to new allocated block + * @fragment: number of `fragment' which hold pointer * to new allocated block - * @new_fragment - number of new allocated fragment + * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) - * @err - see ufs_inode_getfrag() - * @phys - see ufs_inode_getfrag() - * @new - see ufs_inode_getfrag() - * @locked_page - see ufs_inode_getfrag() + * @err: see ufs_inode_getfrag() + * @phys: see ufs_inode_getfrag() + * @new: see ufs_inode_getfrag() + * @locked_page: see ufs_inode_getfrag() */ static struct buffer_head * ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b879f1ba3439..da73801301d5 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -65,7 +65,6 @@ * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ - #include <linux/exportfs.h> #include <linux/module.h> #include <linux/bitops.h> @@ -172,73 +171,73 @@ static void ufs_print_super_stuff(struct super_block *sb, { u32 magic = fs32_to_cpu(sb, usb3->fs_magic); - printk("ufs_print_super_stuff\n"); - printk(" magic: 0x%x\n", magic); + pr_debug("ufs_print_super_stuff\n"); + pr_debug(" magic: 0x%x\n", magic); if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { - printk(" fs_size: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); - printk(" fs_dsize: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); - printk(" bsize: %u\n", - fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", - fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); - printk(" fs_sblockloc: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); - printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); - printk(" cs_nbfree(No of free blocks): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); - printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); - printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); - printk(KERN_INFO" fs_maxsymlinklen: %u\n", - fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); + pr_debug(" fs_size: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + pr_debug(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + pr_debug(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + pr_debug(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + pr_debug(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + pr_info(" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + pr_info(" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); + pr_info(" fs_maxsymlinklen: %u\n", + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); } else { - printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); - printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); - printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); - printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); - printk(" cgoffset: %u\n", - fs32_to_cpu(sb, usb1->fs_cgoffset)); - printk(" ~cgmask: 0x%x\n", - ~fs32_to_cpu(sb, usb1->fs_cgmask)); - printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); - printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); - printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); - printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); - printk(" fragshift: %u\n", - fs32_to_cpu(sb, usb1->fs_fragshift)); - printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); - printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); - printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); - printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); - printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); - printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); - printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); - printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); - printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); - printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); - printk(" fstodb: %u\n", - fs32_to_cpu(sb, usb1->fs_fsbtodb)); - printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); - printk(" ndir %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); - printk(" nifree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); - printk(" nbfree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); - printk(" nffree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + pr_debug(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + pr_debug(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + pr_debug(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + pr_debug(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + pr_debug(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + pr_debug(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + pr_debug(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + pr_debug(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); } - printk("\n"); + pr_debug("\n"); } /* @@ -247,38 +246,38 @@ static void ufs_print_super_stuff(struct super_block *sb, static void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) { - printk("\nufs_print_cylinder_stuff\n"); - printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); - printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); - printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); - printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); - printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); - printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); - printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); - printk(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); - printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); - printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); - printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); - printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); - printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); - printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); - printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", + pr_debug("\nufs_print_cylinder_stuff\n"); + pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); + pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); + pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); + pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); + pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); + pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); + pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); + pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); + pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); + pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); + pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); + pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); + pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); + pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); + pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); - printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); - printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); - printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); - printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); - printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); - printk(" clustersumoff %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); - printk(" clusteroff %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); - printk(" nclusterblks %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); - printk("\n"); + pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); + pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); + pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); + pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); + pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); + pr_debug(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + pr_debug(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + pr_debug(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + pr_debug("\n"); } #else # define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ @@ -287,13 +286,12 @@ static void ufs_print_cylinder_stuff(struct super_block *sb, static const struct super_operations ufs_super_ops; -static char error_buf[1024]; - void ufs_error (struct super_block * sb, const char * function, const char * fmt, ...) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; + struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; @@ -305,20 +303,21 @@ void ufs_error (struct super_block * sb, const char * function, ufs_mark_sb_dirty(sb); sb->s_flags |= MS_RDONLY; } - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { case UFS_MOUNT_ONERROR_PANIC: - panic ("UFS-fs panic (device %s): %s: %s\n", - sb->s_id, function, error_buf); + panic("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); case UFS_MOUNT_ONERROR_LOCK: case UFS_MOUNT_ONERROR_UMOUNT: case UFS_MOUNT_ONERROR_REPAIR: - printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); - } + pr_crit("error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + } + va_end(args); } void ufs_panic (struct super_block * sb, const char * function, @@ -326,6 +325,7 @@ void ufs_panic (struct super_block * sb, const char * function, { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; + struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; @@ -336,24 +336,27 @@ void ufs_panic (struct super_block * sb, const char * function, ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); } - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; sb->s_flags |= MS_RDONLY; - printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", - sb->s_id, function, error_buf); + pr_crit("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } void ufs_warning (struct super_block * sb, const char * function, const char * fmt, ...) { + struct va_format vaf; va_list args; - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); - printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", - sb->s_id, function, error_buf); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } enum { @@ -464,14 +467,12 @@ static int ufs_parse_options (char * options, unsigned * mount_options) ufs_set_opt (*mount_options, ONERROR_UMOUNT); break; case Opt_onerror_repair: - printk("UFS-fs: Unable to do repair on error, " - "will lock lock instead\n"); + pr_err("Unable to do repair on error, will lock lock instead\n"); ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_REPAIR); break; default: - printk("UFS-fs: Invalid option: \"%s\" " - "or missing value\n", p); + pr_err("Invalid option: \"%s\" or missing value\n", p); return 0; } } @@ -788,8 +789,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) #ifndef CONFIG_UFS_FS_WRITE if (!(sb->s_flags & MS_RDONLY)) { - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); return -EROFS; } #endif @@ -812,12 +812,12 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opt = 0; ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { - printk("wrong mount options\n"); + pr_err("wrong mount options\n"); goto failed; } if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { if (!silent) - printk("You didn't specify the type of your ufs filesystem\n\n" + pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " @@ -868,7 +868,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_SUNOS: - UFSD(("ufstype=sunos\n")) + UFSD("ufstype=sunos\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -900,7 +900,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=old is supported read-only\n"); + pr_info("ufstype=old is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -916,7 +916,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); + pr_info("ufstype=nextstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -932,7 +932,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); + pr_info("ufstype=nextstep-cd is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -948,7 +948,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=openstep is supported read-only\n"); + pr_info("ufstype=openstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -963,19 +963,19 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=hp is supported read-only\n"); + pr_info("ufstype=hp is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; default: if (!silent) - printk("unknown ufstype\n"); + pr_err("unknown ufstype\n"); goto failed; } again: if (!sb_set_blocksize(sb, block_size)) { - printk(KERN_ERR "UFS: failed to set blocksize\n"); + pr_err("failed to set blocksize\n"); goto failed; } @@ -1034,7 +1034,7 @@ again: goto again; } if (!silent) - printk("ufs_read_super: bad magic number\n"); + pr_err("%s(): bad magic number\n", __func__); goto failed; magic_found: @@ -1048,33 +1048,33 @@ magic_found: uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); if (!is_power_of_2(uspi->s_fsize)) { - printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", - uspi->s_fsize); - goto failed; + pr_err("%s(): fragment size %u is not a power of 2\n", + __func__, uspi->s_fsize); + goto failed; } if (uspi->s_fsize < 512) { - printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", - uspi->s_fsize); + pr_err("%s(): fragment size %u is too small\n", + __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize > 4096) { - printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", - uspi->s_fsize); + pr_err("%s(): fragment size %u is too large\n", + __func__, uspi->s_fsize); goto failed; } if (!is_power_of_2(uspi->s_bsize)) { - printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", - uspi->s_bsize); + pr_err("%s(): block size %u is not a power of 2\n", + __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize < 4096) { - printk(KERN_ERR "ufs_read_super: block size %u is too small\n", - uspi->s_bsize); + pr_err("%s(): block size %u is too small\n", + __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize / uspi->s_fsize > 8) { - printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", - uspi->s_bsize / uspi->s_fsize); + pr_err("%s(): too many fragments per block (%u)\n", + __func__, uspi->s_bsize / uspi->s_fsize); goto failed; } if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { @@ -1113,20 +1113,21 @@ magic_found: UFSD("fs is DEC OSF/1\n"); break; case UFS_FSACTIVE: - printk("ufs_read_super: fs is active\n"); + pr_err("%s(): fs is active\n", __func__); sb->s_flags |= MS_RDONLY; break; case UFS_FSBAD: - printk("ufs_read_super: fs is bad\n"); + pr_err("%s(): fs is bad\n", __func__); sb->s_flags |= MS_RDONLY; break; default: - printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); + pr_err("%s(): can't grok fs_clean 0x%x\n", + __func__, usb1->fs_clean); sb->s_flags |= MS_RDONLY; break; } } else { - printk("ufs_read_super: fs needs fsck\n"); + pr_err("%s(): fs needs fsck\n", __func__); sb->s_flags |= MS_RDONLY; } @@ -1299,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { - printk("ufstype can't be changed during remount\n"); + pr_err("ufstype can't be changed during remount\n"); unlock_ufs(sb); return -EINVAL; } @@ -1328,8 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) * fs was mounted as ro, remounting rw */ #ifndef CONFIG_UFS_FS_WRITE - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); unlock_ufs(sb); return -EINVAL; #else @@ -1338,12 +1338,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_44BSD && ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { - printk("this ufstype is read-only supported\n"); + pr_err("this ufstype is read-only supported\n"); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { - printk("failed during remounting\n"); + pr_err("failed during remounting\n"); unlock_ufs(sb); return -EPERM; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 343e6fc571e5..2a07396d5f9e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -1,6 +1,12 @@ #ifndef _UFS_UFS_H #define _UFS_UFS_H 1 +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define UFS_MAX_GROUP_LOADED 8 #define UFS_CGNO_EMPTY ((unsigned)-1) @@ -71,9 +77,9 @@ struct ufs_inode_info { */ #ifdef CONFIG_UFS_DEBUG # define UFSD(f, a...) { \ - printk ("UFSD (%s, %d): %s:", \ + pr_debug("UFSD (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ - printk (f, ## a); \ + pr_debug(f, ## a); \ } #else # define UFSD(f, a...) /**/ diff --git a/fs/xattr.c b/fs/xattr.c index 3377dff18404..c69e6d43a0d2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -843,7 +843,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) /* wrap around? */ len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) + if (len < sizeof(*new_xattr)) return NULL; new_xattr = kmalloc(len, GFP_KERNEL); |