161 files changed, 4365 insertions, 2535 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a9ea73d6dcf3..2b7a032c37bc 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
 
 	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
 						&v9fs_cache_session_index_def,
-						v9ses);
+						v9ses, true);
 	p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
 		 v9ses, v9ses->fscache);
 }
@@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  v9inode);
+						  v9inode, true);
 
 	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
 		 inode, v9inode->fscache);
@@ -271,7 +271,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  v9inode);
+						  v9inode, true);
 	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
 		 inode, old, v9inode->fscache);
 
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 58e6cbce4156..08f2e1e9a7e6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -603,10 +603,11 @@ static int v9fs_cache_register(void)
 	if (ret < 0)
 		return ret;
 #ifdef CONFIG_9P_FSCACHE
-	return fscache_register_netfs(&v9fs_cache_netfs);
-#else
-	return ret;
+	ret = fscache_register_netfs(&v9fs_cache_netfs);
+	if (ret < 0)
+		v9fs_destroy_inode_cache();
 #endif
+	return ret;
 }
 
 static void v9fs_cache_unregister(void)
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 53687bbf2296..a7c481402c46 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -267,14 +267,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT))
+	if (!(flags & O_CREAT) || dentry->d_inode)
 		return	finish_no_open(file, res);
-	else if (dentry->d_inode) {
-		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-			return -EEXIST;
-		else
-			return finish_no_open(file, res);
-	}
 
 	v9ses = v9fs_inode2v9ses(dir);
 
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 3c090b7555ea..ca0a3cf93791 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
 	/* put it up for caching (this never returns an error) */
 	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
 					     &afs_cell_cache_index_def,
-					     cell);
+					     cell, true);
 #endif
 
 	/* add to the cell lists */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 646337dc5201..529300327f45 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -600,9 +600,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 
 	/* lock down the parent dentry so we can peer at it */
 	parent = dget_parent(dentry);
-	if (!parent->d_inode)
-		goto out_bad;
-
 	dir = AFS_FS_I(parent->d_inode);
 
 	/* validate the parent directory */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 789bc253b5f6..ce25d755b7aa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 #ifdef CONFIG_AFS_FSCACHE
 	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
 					      &afs_vnode_cache_index_def,
-					      vnode);
+					      vnode, true);
 #endif
 
 	ret = afs_inode_map_status(vnode, key);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 57bcb1596530..b6df2e83809f 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
 #ifdef CONFIG_AFS_FSCACHE
 	vl->cache = fscache_acquire_cookie(vl->cell->cache,
-					   &afs_vlocation_cache_index_def, vl);
+					   &afs_vlocation_cache_index_def, vl,
+					   true);
 #endif
 
 	if (vl->valid) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 401eeb21869f..2b607257820c 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 #ifdef CONFIG_AFS_FSCACHE
 	volume->cache = fscache_acquire_cookie(vlocation->cache,
 					       &afs_volume_cache_index_def,
-					       volume);
+					       volume, true);
 #endif
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
diff --git a/fs/aio.c b/fs/aio.c
index 6b868f0e0c4c..067e3d340c35 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -167,10 +167,25 @@ static int __init aio_setup(void)
 }
 __initcall(aio_setup);
 
+static void put_aio_ring_file(struct kioctx *ctx)
+{
+	struct file *aio_ring_file = ctx->aio_ring_file;
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+
+		/* Prevent further access to the kioctx from migratepages */
+		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
+		aio_ring_file->f_inode->i_mapping->private_data = NULL;
+		ctx->aio_ring_file = NULL;
+		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+
+		fput(aio_ring_file);
+	}
+}
+
 static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
-	struct file *aio_ring_file = ctx->aio_ring_file;
 
 	for (i = 0; i < ctx->nr_pages; i++) {
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -178,14 +193,10 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(ctx->ring_pages[i]);
 	}
 
+	put_aio_ring_file(ctx);
+
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
-
-	if (aio_ring_file) {
-		truncate_setsize(aio_ring_file->f_inode, 0);
-		fput(aio_ring_file);
-		ctx->aio_ring_file = NULL;
-	}
 }
 
 static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
@@ -207,9 +218,8 @@ static int aio_set_page_dirty(struct page *page)
 static int aio_migratepage(struct address_space *mapping, struct page *new,
 			struct page *old, enum migrate_mode mode)
 {
-	struct kioctx *ctx = mapping->private_data;
+	struct kioctx *ctx;
 	unsigned long flags;
-	unsigned idx = old->index;
 	int rc;
 
 	/* Writeback must be complete */
@@ -224,10 +234,23 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 
 	get_page(new);
 
-	spin_lock_irqsave(&ctx->completion_lock, flags);
-	migrate_page_copy(new, old);
-	ctx->ring_pages[idx] = new;
-	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	/* We can potentially race against kioctx teardown here.  Use the
+	 * address_space's private data lock to protect the mapping's
+	 * private_data.
+	 */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (ctx) {
+		pgoff_t idx;
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		migrate_page_copy(new, old);
+		idx = old->index;
+		if (idx < (pgoff_t)ctx->nr_pages)
+			ctx->ring_pages[idx] = new;
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else
+		rc = -EBUSY;
+	spin_unlock(&mapping->private_lock);
 
 	return rc;
 }
@@ -617,8 +640,7 @@ out_freepcpu:
 out_freeref:
 	free_percpu(ctx->users.pcpu_count);
 out_freectx:
-	if (ctx->aio_ring_file)
-		fput(ctx->aio_ring_file);
+	put_aio_ring_file(ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 3db70dae40d3..689e40d983ad 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -109,13 +109,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = type;
-	mutex_lock(&sbi->wq_mutex);
 
-	/* Check if we have become catatonic */
-	if (sbi->catatonic) {
-		mutex_unlock(&sbi->wq_mutex);
-		return;
-	}
 	switch (type) {
 	/* Kernel protocol v4 missing and expire packets */
 	case autofs_ptype_missing:
@@ -427,7 +421,6 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->tgid = current->tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
 		wq->wait_ctr = 2;
-		mutex_unlock(&sbi->wq_mutex);
 
 		if (sbi->version < 5) {
 			if (notify == NFY_MOUNT)
@@ -449,15 +442,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 			(unsigned long) wq->wait_queue_token, wq->name.len,
 			wq->name.name, notify);
 
-		/* autofs4_notify_daemon() may block */
+		/* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
 		autofs4_notify_daemon(sbi, wq, type);
 	} else {
 		wq->wait_ctr++;
-		mutex_unlock(&sbi->wq_mutex);
-		kfree(qstr.name);
 		DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
 			(unsigned long) wq->wait_queue_token, wq->name.len,
 			wq->name.name, notify);
+		mutex_unlock(&sbi->wq_mutex);
+		kfree(qstr.name);
 	}
 
 	/*
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 100edcc5e312..4c94a79991bb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1413,7 +1413,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
  *   long file_ofs
  * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
  */
-static void fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note)
 {
 	struct vm_area_struct *vma;
 	unsigned count, size, names_ofs, remaining, n;
@@ -1428,11 +1428,11 @@ static void fill_files_note(struct memelfnote *note)
 	names_ofs = (2 + 3 * count) * sizeof(data[0]);
  alloc:
 	if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
-		goto err;
+		return -EINVAL;
 	size = round_up(size, PAGE_SIZE);
 	data = vmalloc(size);
 	if (!data)
-		goto err;
+		return -ENOMEM;
 
 	start_end_ofs = data + 2;
 	name_base = name_curpos = ((char *)data) + names_ofs;
@@ -1485,7 +1485,7 @@ static void fill_files_note(struct memelfnote *note)
 
 	size = name_curpos - (char *)data;
 	fill_note(note, "CORE", NT_FILE, size, data);
- err: ;
+	return 0;
 }
 
 #ifdef CORE_DUMP_USE_REGSET
@@ -1686,8 +1686,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	fill_auxv_note(&info->auxv, current->mm);
 	info->size += notesize(&info->auxv);
 
-	fill_files_note(&info->files);
-	info->size += notesize(&info->files);
+	if (fill_files_note(&info->files) == 0)
+		info->size += notesize(&info->files);
 
 	return 1;
 }
@@ -1719,7 +1719,8 @@ static int write_note_info(struct elf_note_info *info,
 			return 0;
 		if (first && !writenote(&info->auxv, file, foffset))
 			return 0;
-		if (first && !writenote(&info->files, file, foffset))
+		if (first && info->files.data &&
+				!writenote(&info->files, file, foffset))
 			return 0;
 
 		for (i = 1; i < info->thread_notes; ++i)
@@ -1806,6 +1807,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 
 struct elf_note_info {
 	struct memelfnote *notes;
+	struct memelfnote *notes_files;
 	struct elf_prstatus *prstatus;	/* NT_PRSTATUS */
 	struct elf_prpsinfo *psinfo;	/* NT_PRPSINFO */
 	struct list_head thread_list;
@@ -1896,9 +1898,12 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 
 	fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
 	fill_auxv_note(info->notes + 3, current->mm);
-	fill_files_note(info->notes + 4);
+	info->numnote = 4;
 
-	info->numnote = 5;
+	if (fill_files_note(info->notes + info->numnote) == 0) {
+		info->notes_files = info->notes + info->numnote;
+		info->numnote++;
+	}
 
 	/* Try to dump the FPU. */
 	info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
@@ -1960,8 +1965,9 @@ static void free_note_info(struct elf_note_info *info)
 		kfree(list_entry(tmp, struct elf_thread_status, list));
 	}
 
-	/* Free data allocated by fill_files_note(): */
-	vfree(info->notes[4].data);
+	/* Free data possibly allocated by fill_files_note(): */
+	if (info->notes_files)
+		vfree(info->notes_files->data);
 
 	kfree(info->prstatus);
 	kfree(info->psinfo);
@@ -2044,7 +2050,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 	struct vm_area_struct *vma, *gate_vma;
 	struct elfhdr *elf = NULL;
 	loff_t offset = 0, dataoff, foffset;
-	struct elf_note_info info;
+	struct elf_note_info info = { };
 	struct elf_phdr *phdr4note = NULL;
 	struct elf_shdr *shdr4extnum = NULL;
 	Elf_Half e_phnum;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 60250847929f..fc60b31453ee 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -735,7 +735,7 @@ void bioset_integrity_free(struct bio_set *bs)
 		mempool_destroy(bs->bio_integrity_pool);
 
 	if (bs->bvec_integrity_pool)
-		mempool_destroy(bs->bio_integrity_pool);
+		mempool_destroy(bs->bvec_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
diff --git a/fs/bio.c b/fs/bio.c
index b3b20ed9510e..ea5035da4d9a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 		src_p = kmap_atomic(src_bv->bv_page);
 		dst_p = kmap_atomic(dst_bv->bv_page);
 
-		memcpy(dst_p + dst_bv->bv_offset,
-		       src_p + src_bv->bv_offset,
+		memcpy(dst_p + dst_offset,
+		       src_p + src_offset,
 		       bytes);
 
 		kunmap_atomic(dst_p);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 58b7d14b08ee..08cc08f037a6 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -107,7 +107,8 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
 		worker->idle = 1;
 
 		/* the list may be empty if the worker is just starting */
-		if (!list_empty(&worker->worker_list)) {
+		if (!list_empty(&worker->worker_list) &&
+		    !worker->workers->stopping) {
 			list_move(&worker->worker_list,
 				 &worker->workers->idle_list);
 		}
@@ -127,7 +128,8 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 		spin_lock_irqsave(&worker->workers->lock, flags);
 		worker->idle = 0;
 
-		if (!list_empty(&worker->worker_list)) {
+		if (!list_empty(&worker->worker_list) &&
+		    !worker->workers->stopping) {
 			list_move_tail(&worker->worker_list,
 				      &worker->workers->worker_list);
 		}
@@ -412,6 +414,7 @@ void btrfs_stop_workers(struct btrfs_workers *workers)
 	int can_stop;
 
 	spin_lock_irq(&workers->lock);
+	workers->stopping = 1;
 	list_splice_init(&workers->idle_list, &workers->worker_list);
 	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
@@ -455,6 +458,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 	workers->ordered = 0;
 	workers->atomic_start_pending = 0;
 	workers->atomic_worker_start = async_helper;
+	workers->stopping = 0;
 }
 
 /*
@@ -480,15 +484,19 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
 	atomic_set(&worker->num_pending, 0);
 	atomic_set(&worker->refs, 1);
 	worker->workers = workers;
-	worker->task = kthread_run(worker_loop, worker,
-				   "btrfs-%s-%d", workers->name,
-				   workers->num_workers + 1);
+	worker->task = kthread_create(worker_loop, worker,
+				      "btrfs-%s-%d", workers->name,
+				      workers->num_workers + 1);
 	if (IS_ERR(worker->task)) {
 		ret = PTR_ERR(worker->task);
-		kfree(worker);
 		goto fail;
 	}
+
 	spin_lock_irq(&workers->lock);
+	if (workers->stopping) {
+		spin_unlock_irq(&workers->lock);
+		goto fail_kthread;
+	}
 	list_add_tail(&worker->worker_list, &workers->idle_list);
 	worker->idle = 1;
 	workers->num_workers++;
@@ -496,8 +504,13 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
 	WARN_ON(workers->num_workers_starting < 0);
 	spin_unlock_irq(&workers->lock);
 
+	wake_up_process(worker->task);
 	return 0;
+
+fail_kthread:
+	kthread_stop(worker->task);
 fail:
+	kfree(worker);
 	spin_lock_irq(&workers->lock);
 	workers->num_workers_starting--;
 	spin_unlock_irq(&workers->lock);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 063698b90ce2..1f26792683ed 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -107,6 +107,8 @@ struct btrfs_workers {
 
 	/* extra name for this worker, used for current->name */
 	char *name;
+
+	int stopping;
 };
 
 void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d0ae226926ee..71f074e1870b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -213,7 +213,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
 	if (BTRFS_I(inode)->logged_trans == generation &&
-	    BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+	    BTRFS_I(inode)->last_sub_trans <=
+	    BTRFS_I(inode)->last_log_commit &&
+	    BTRFS_I(inode)->last_sub_trans <=
+	    BTRFS_I(inode)->root->last_log_commit)
 		return 1;
 	return 0;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 64346721173f..61b5bcd57b7e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1005,8 +1005,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	if (root->ref_cows)
-		btrfs_reloc_cow_block(trans, root, buf, cow);
+	if (root->ref_cows) {
+		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+		if (ret)
+			return ret;
+	}
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3c1da6f98a4d..0506f40ede83 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1118,15 +1118,6 @@ struct btrfs_space_info {
 	 */
 	struct percpu_counter total_bytes_pinned;
 
-	/*
-	 * we bump reservation progress every time we decrement
-	 * bytes_reserved.  This way people waiting for reservations
-	 * know something good has happened and they can check
-	 * for progress.  The number here isn't to be trusted, it
-	 * just shows reclaim activity
-	 */
-	unsigned long reservation_progress;
-
 	unsigned int full:1;	/* indicates that we cannot allocate any more
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
@@ -3135,7 +3126,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
 {
 	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
-		3 * num_items;
+		2 * num_items;
 }
 
 /*
@@ -3939,9 +3930,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
-void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct extent_buffer *buf,
-			   struct extent_buffer *cow);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct extent_buffer *cow);
 void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a64435359385..9efb94e95858 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	btrfs_wait_all_ordered_extents(root->fs_info, 0);
+	btrfs_wait_all_ordered_extents(root->fs_info);
 
 	/* force writing the updated state information to disk */
 	trans = btrfs_start_transaction(root, 0);
@@ -475,7 +475,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
 	}
-	btrfs_wait_all_ordered_extents(root->fs_info, 0);
+	btrfs_wait_all_ordered_extents(root->fs_info);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
@@ -535,10 +535,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 
 	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
-	if (src_device->bdev) {
-		/* zero out the old super */
-		btrfs_scratch_superblock(src_device);
-	}
+
 	/*
 	 * this is again a consistent state where no dev_replace procedure
 	 * is running, the target device is part of the filesystem, the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4cbb00af92ff..62176ad89846 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -157,6 +157,7 @@ static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
 	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
 	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
+	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
 	{ .id = 0,				.name_stem = "tree"	},
 };
 
@@ -1560,8 +1561,9 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
-					      struct btrfs_key *location)
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *location,
+				     bool check_ref)
 {
 	struct btrfs_root *root;
 	int ret;
@@ -1585,7 +1587,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 again:
 	root = btrfs_lookup_fs_root(fs_info, location->objectid);
 	if (root) {
-		if (btrfs_root_refs(&root->root_item) == 0)
+		if (check_ref && btrfs_root_refs(&root->root_item) == 0)
 			return ERR_PTR(-ENOENT);
 		return root;
 	}
@@ -1594,7 +1596,7 @@ again:
 	if (IS_ERR(root))
 		return root;
 
-	if (btrfs_root_refs(&root->root_item) == 0) {
+	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
 		ret = -ENOENT;
 		goto fail;
 	}
@@ -3415,6 +3417,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	if (total_errors > max_errors) {
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 		/* FUA is masked off if unsupported and can't be the reason */
 		btrfs_error(root->fs_info, -EIO,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b71acd6e1e5b..5ce2a7da8b11 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,8 +68,17 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 int btrfs_init_fs_root(struct btrfs_root *root);
 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
 			 struct btrfs_root *root);
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
-					      struct btrfs_key *location);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *key,
+				     bool check_ref);
+static inline struct btrfs_root *
+btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+			   struct btrfs_key *location)
+{
+	return btrfs_get_fs_root(fs_info, location, true);
+}
+
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cfb3cf711b34..d58bef130a41 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3925,7 +3925,6 @@ static int can_overcommit(struct btrfs_root *root,
 	u64 space_size;
 	u64 avail;
 	u64 used;
-	u64 to_add;
 
 	used = space_info->bytes_used + space_info->bytes_reserved +
 		space_info->bytes_pinned + space_info->bytes_readonly;
@@ -3959,25 +3958,17 @@ static int can_overcommit(struct btrfs_root *root,
 		       BTRFS_BLOCK_GROUP_RAID10))
 		avail >>= 1;
 
-	to_add = space_info->total_bytes;
-
 	/*
 	 * If we aren't flushing all things, let us overcommit up to
 	 * 1/2th of the space. If we can flush, don't let us overcommit
 	 * too much, let it overcommit up to 1/8 of the space.
 	 */
 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		to_add >>= 3;
+		avail >>= 3;
 	else
-		to_add >>= 1;
-
-	/*
-	 * Limit the overcommit to the amount of free space we could possibly
-	 * allocate for chunks.
-	 */
-	to_add = min(avail, to_add);
+		avail >>= 1;
 
-	if (used + bytes < space_info->total_bytes + to_add)
+	if (used + bytes < space_info->total_bytes + avail)
 		return 1;
 	return 0;
 }
@@ -4000,7 +3991,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 */
 		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
 		if (!current->journal_info)
-			btrfs_wait_all_ordered_extents(root->fs_info, 0);
+			btrfs_wait_all_ordered_extents(root->fs_info);
 	}
 }
 
@@ -4030,7 +4021,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
-		btrfs_wait_all_ordered_extents(root->fs_info, 0);
+		btrfs_wait_all_ordered_extents(root->fs_info);
 		return;
 	}
 
@@ -4058,7 +4049,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_all_ordered_extents(root->fs_info, 0);
+			btrfs_wait_all_ordered_extents(root->fs_info);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
@@ -4465,7 +4456,6 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 			space_info->bytes_may_use -= num_bytes;
 			trace_btrfs_space_reservation(fs_info, "space_info",
 					space_info->flags, num_bytes, 0);
-			space_info->reservation_progress++;
 			spin_unlock(&space_info->lock);
 		}
 	}
@@ -4666,7 +4656,6 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 		sinfo->bytes_may_use -= num_bytes;
 		trace_btrfs_space_reservation(fs_info, "space_info",
 				      sinfo->flags, num_bytes, 0);
-		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
 	}
@@ -5446,7 +5435,6 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 			space_info->bytes_readonly += num_bytes;
 		cache->reserved -= num_bytes;
 		space_info->bytes_reserved -= num_bytes;
-		space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&space_info->lock);
@@ -6117,10 +6105,13 @@ enum btrfs_loop_type {
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
- * ins->objectid == block start
+ * ins->objectid == start position
  * ins->flags = BTRFS_EXTENT_ITEM_KEY
- * ins->offset == number of blocks
+ * ins->offset == the size of the hole.
  * Any available blocks before search_start are skipped.
+ *
+ * If there is no suitable free space, we will record the max size of
+ * the free space extent currently.
  */
 static noinline int find_free_extent(struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
@@ -6133,6 +6124,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	struct btrfs_block_group_cache *block_group = NULL;
 	struct btrfs_block_group_cache *used_block_group;
 	u64 search_start = 0;
+	u64 max_extent_size = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
@@ -6292,7 +6284,10 @@ have_block_group:
 				btrfs_get_block_group(used_block_group);
 
 			offset = btrfs_alloc_from_cluster(used_block_group,
-			  last_ptr, num_bytes, used_block_group->key.objectid);
+						last_ptr,
+						num_bytes,
+						used_block_group->key.objectid,
+						&max_extent_size);
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
@@ -6355,8 +6350,10 @@ refill_cluster:
 				 * cluster
 				 */
 				offset = btrfs_alloc_from_cluster(block_group,
-						  last_ptr, num_bytes,
-						  search_start);
+							last_ptr,
+							num_bytes,
+							search_start,
+							&max_extent_size);
 				if (offset) {
 					/* we found one, proceed */
 					spin_unlock(&last_ptr->refill_lock);
@@ -6391,13 +6388,18 @@ unclustered_alloc:
 		if (cached &&
 		    block_group->free_space_ctl->free_space <
 		    num_bytes + empty_cluster + empty_size) {
+			if (block_group->free_space_ctl->free_space >
+			    max_extent_size)
+				max_extent_size =
+					block_group->free_space_ctl->free_space;
 			spin_unlock(&block_group->free_space_ctl->tree_lock);
 			goto loop;
 		}
 		spin_unlock(&block_group->free_space_ctl->tree_lock);
 
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
-						    num_bytes, empty_size);
+						    num_bytes, empty_size,
+						    &max_extent_size);
 		/*
 		 * If we didn't find a chunk, and we haven't failed on this
 		 * block group before, and this block group is in the middle of
@@ -6515,7 +6517,8 @@ loop:
 		ret = 0;
 	}
 out:
-
+	if (ret == -ENOSPC)
+		ins->offset = max_extent_size;
 	return ret;
 }
 
@@ -6573,8 +6576,8 @@ again:
 			       flags);
 
 	if (ret == -ENOSPC) {
-		if (!final_tried) {
-			num_bytes = num_bytes >> 1;
+		if (!final_tried && ins->offset) {
+			num_bytes = min(num_bytes >> 1, ins->offset);
 			num_bytes = round_down(num_bytes, root->sectorsize);
 			num_bytes = max(num_bytes, min_alloc_size);
 			if (num_bytes == min_alloc_size)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 09582b81640c..51731b76900d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -145,8 +145,16 @@ int __init extent_io_init(void)
 				     offsetof(struct btrfs_io_bio, bio));
 	if (!btrfs_bioset)
 		goto free_buffer_cache;
+
+	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+		goto free_bioset;
+
 	return 0;
 
+free_bioset:
+	bioset_free(btrfs_bioset);
+	btrfs_bioset = NULL;
+
 free_buffer_cache:
 	kmem_cache_destroy(extent_buffer_cache);
 	extent_buffer_cache = NULL;
@@ -1481,11 +1489,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 		*end = state->end;
 		cur_start = state->end + 1;
 		node = rb_next(node);
-		if (!node)
-			break;
 		total_bytes += state->end - state->start + 1;
 		if (total_bytes >= max_bytes)
 			break;
+		if (!node)
+			break;
 	}
 out:
 	spin_unlock(&tree->lock);
@@ -1612,7 +1620,7 @@ again:
 		*start = delalloc_start;
 		*end = delalloc_end;
 		free_extent_state(cached_state);
-		return found;
+		return 0;
 	}
 
 	/*
@@ -1625,10 +1633,9 @@ again:
 
 	/*
 	 * make sure to limit the number of pages we try to lock down
-	 * if we're looping.
 	 */
-	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
-		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	if (delalloc_end + 1 - delalloc_start > max_bytes)
+		delalloc_end = delalloc_start + max_bytes - 1;
 
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
@@ -1639,8 +1646,7 @@ again:
 		 */
 		free_extent_state(cached_state);
 		if (!loops) {
-			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
-			max_bytes = PAGE_CACHE_SIZE - offset;
+			max_bytes = PAGE_CACHE_SIZE;
 			loops = 1;
 			goto again;
 		} else {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bc5072b2db53..72da4df53c9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1859,8 +1859,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
+		/* Fallthrough and commit/free transaction. */
+		ret = 1;
 	}
 
 	/* we've logged all the items and now have a consistent
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3f0ddfce96e6..b4f9904c4c6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1431,13 +1431,19 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 	ctl->free_space += bytes;
 }
 
+/*
+ * If we can not find suitable extent, we will use bytes to record
+ * the size of the max extent.
+ */
 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			 struct btrfs_free_space *bitmap_info, u64 *offset,
 			 u64 *bytes)
 {
 	unsigned long found_bits = 0;
+	unsigned long max_bits = 0;
 	unsigned long bits, i;
 	unsigned long next_zero;
+	unsigned long extent_bits;
 
 	i = offset_to_bit(bitmap_info->offset, ctl->unit,
 			  max_t(u64, *offset, bitmap_info->offset));
@@ -1446,9 +1452,12 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
 					       BITS_PER_BITMAP, i);
-		if ((next_zero - i) >= bits) {
-			found_bits = next_zero - i;
+		extent_bits = next_zero - i;
+		if (extent_bits >= bits) {
+			found_bits = extent_bits;
 			break;
+		} else if (extent_bits > max_bits) {
+			max_bits = extent_bits;
 		}
 		i = next_zero;
 	}
@@ -1459,38 +1468,41 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 		return 0;
 	}
 
+	*bytes = (u64)(max_bits) * ctl->unit;
 	return -1;
 }
 
+/* Cache the size of the max extent in bytes */
 static struct btrfs_free_space *
 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
-		unsigned long align)
+		unsigned long align, u64 *max_extent_size)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
-	u64 ctl_off;
 	u64 tmp;
 	u64 align_off;
 	int ret;
 
 	if (!ctl->free_space_offset.rb_node)
-		return NULL;
+		goto out;
 
 	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
 	if (!entry)
-		return NULL;
+		goto out;
 
 	for (node = &entry->offset_index; node; node = rb_next(node)) {
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		if (entry->bytes < *bytes)
+		if (entry->bytes < *bytes) {
+			if (entry->bytes > *max_extent_size)
+				*max_extent_size = entry->bytes;
 			continue;
+		}
 
 		/* make sure the space returned is big enough
 		 * to match our requested alignment
 		 */
 		if (*bytes >= align) {
-			ctl_off = entry->offset - ctl->start;
-			tmp = ctl_off + align - 1;;
+			tmp = entry->offset - ctl->start + align - 1;
 			do_div(tmp, align);
 			tmp = tmp * align + ctl->start;
 			align_off = tmp - entry->offset;
@@ -1499,14 +1511,22 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 			tmp = entry->offset;
 		}
 
-		if (entry->bytes < *bytes + align_off)
+		if (entry->bytes < *bytes + align_off) {
+			if (entry->bytes > *max_extent_size)
+				*max_extent_size = entry->bytes;
 			continue;
+		}
 
 		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, &tmp, bytes);
+			u64 size = *bytes;
+
+			ret = search_bitmap(ctl, entry, &tmp, &size);
 			if (!ret) {
 				*offset = tmp;
+				*bytes = size;
 				return entry;
+			} else if (size > *max_extent_size) {
+				*max_extent_size = size;
 			}
 			continue;
 		}
@@ -1515,7 +1535,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 		*bytes = entry->bytes - align_off;
 		return entry;
 	}
-
+out:
 	return NULL;
 }
 
@@ -2116,7 +2136,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 }
 
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
-			       u64 offset, u64 bytes, u64 empty_size)
+			       u64 offset, u64 bytes, u64 empty_size,
+			       u64 *max_extent_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry = NULL;
@@ -2127,7 +2148,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 
 	spin_lock(&ctl->tree_lock);
 	entry = find_free_space(ctl, &offset, &bytes_search,
-				block_group->full_stripe_len);
+				block_group->full_stripe_len, max_extent_size);
 	if (!entry)
 		goto out;
 
@@ -2137,7 +2158,6 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 		if (!entry->bytes)
 			free_bitmap(ctl, entry);
 	} else {
-
 		unlink_free_space(ctl, entry);
 		align_gap_len = offset - entry->offset;
 		align_gap = entry->offset;
@@ -2151,7 +2171,6 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 		else
 			link_free_space(ctl, entry);
 	}
-
 out:
 	spin_unlock(&ctl->tree_lock);
 
@@ -2206,7 +2225,8 @@ int btrfs_return_cluster_to_free_space(
 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 				   struct btrfs_free_cluster *cluster,
 				   struct btrfs_free_space *entry,
-				   u64 bytes, u64 min_start)
+				   u64 bytes, u64 min_start,
+				   u64 *max_extent_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	int err;
@@ -2218,8 +2238,11 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 	search_bytes = bytes;
 
 	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
-	if (err)
+	if (err) {
+		if (search_bytes > *max_extent_size)
+			*max_extent_size = search_bytes;
 		return 0;
+	}
 
 	ret = search_start;
 	__bitmap_clear_bits(ctl, entry, ret, bytes);
@@ -2234,7 +2257,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
  */
 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster, u64 bytes,
-			     u64 min_start)
+			     u64 min_start, u64 *max_extent_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry = NULL;
@@ -2254,6 +2277,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 
 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
 	while(1) {
+		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
+			*max_extent_size = entry->bytes;
+
 		if (entry->bytes < bytes ||
 		    (!entry->bitmap && entry->offset < min_start)) {
 			node = rb_next(&entry->offset_index);
@@ -2267,7 +2293,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 		if (entry->bitmap) {
 			ret = btrfs_alloc_from_bitmap(block_group,
 						      cluster, entry, bytes,
-						      cluster->window_start);
+						      cluster->window_start,
+						      max_extent_size);
 			if (ret == 0) {
 				node = rb_next(&entry->offset_index);
 				if (!node)
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index c74904167476..e737f92cf6d0 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -94,7 +94,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
 				     *block_group);
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
-			       u64 offset, u64 bytes, u64 empty_size);
+			       u64 offset, u64 bytes, u64 empty_size,
+			       u64 *max_extent_size);
 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
@@ -105,7 +106,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster, u64 bytes,
-			     u64 min_start);
+			     u64 min_start, u64 *max_extent_size);
 int btrfs_return_cluster_to_free_space(
 			       struct btrfs_block_group_cache *block_group,
 			       struct btrfs_free_cluster *cluster);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f338c5672d58..51e3afa78354 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4688,11 +4688,11 @@ static void inode_tree_add(struct inode *inode)
 	struct btrfs_inode *entry;
 	struct rb_node **p;
 	struct rb_node *parent;
+	struct rb_node *new = &BTRFS_I(inode)->rb_node;
 	u64 ino = btrfs_ino(inode);
 
 	if (inode_unhashed(inode))
 		return;
-again:
 	parent = NULL;
 	spin_lock(&root->inode_lock);
 	p = &root->inode_tree.rb_node;
@@ -4707,14 +4707,14 @@ again:
 		else {
 			WARN_ON(!(entry->vfs_inode.i_state &
 				  (I_WILL_FREE | I_FREEING)));
-			rb_erase(parent, &root->inode_tree);
+			rb_replace_node(parent, new, &root->inode_tree);
 			RB_CLEAR_NODE(parent);
 			spin_unlock(&root->inode_lock);
-			goto again;
+			return;
 		}
 	}
-	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
-	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+	rb_link_node(new, parent, p);
+	rb_insert_color(new, &root->inode_tree);
 	spin_unlock(&root->inode_lock);
 }
 
@@ -6437,6 +6437,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 
 	if (btrfs_extent_readonly(root, disk_bytenr))
 		goto out;
+	btrfs_release_path(path);
 
 	/*
 	 * look for other files referencing this extent, if we
@@ -7986,7 +7987,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 
 	/* check for collisions, even if the  name isn't there */
-	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
 			     new_dentry->d_name.name,
 			     new_dentry->d_name.len);
 
@@ -8216,6 +8217,10 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 
 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
 		if (unlikely(!work)) {
+			if (delay_iput)
+				btrfs_add_delayed_iput(inode);
+			else
+				iput(inode);
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -8613,11 +8618,13 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 
 static const struct file_operations btrfs_dir_file_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1a5b9462dd9a..9d46f60cb943 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -574,7 +574,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (ret)
 		return ret;
 
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_wait_ordered_extents(root);
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
@@ -2696,9 +2696,9 @@ out_unlock:
 static long btrfs_ioctl_file_extent_same(struct file *file,
 					 void __user *argp)
 {
-	struct btrfs_ioctl_same_args *args = argp;
-	struct btrfs_ioctl_same_args same;
-	struct btrfs_ioctl_same_extent_info info;
+	struct btrfs_ioctl_same_args tmp;
+	struct btrfs_ioctl_same_args *same;
+	struct btrfs_ioctl_same_extent_info *info;
 	struct inode *src = file->f_dentry->d_inode;
 	struct file *dst_file = NULL;
 	struct inode *dst;
@@ -2706,6 +2706,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 	u64 len;
 	int i;
 	int ret;
+	unsigned long size;
 	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
 	bool is_admin = capable(CAP_SYS_ADMIN);
 
@@ -2716,15 +2717,30 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 	if (ret)
 		return ret;
 
-	if (copy_from_user(&same,
+	if (copy_from_user(&tmp,
 			   (struct btrfs_ioctl_same_args __user *)argp,
-			   sizeof(same))) {
+			   sizeof(tmp))) {
 		ret = -EFAULT;
 		goto out;
 	}
 
-	off = same.logical_offset;
-	len = same.length;
+	size = sizeof(tmp) +
+		tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
+
+	same = kmalloc(size, GFP_NOFS);
+	if (!same) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (copy_from_user(same,
+			   (struct btrfs_ioctl_same_args __user *)argp, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	off = same->logical_offset;
+	len = same->length;
 
 	/*
 	 * Limit the total length we will dedupe for each operation.
@@ -2752,27 +2768,28 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 	if (!S_ISREG(src->i_mode))
 		goto out;
 
-	ret = 0;
-	for (i = 0; i < same.dest_count; i++) {
-		if (copy_from_user(&info, &args->info[i], sizeof(info))) {
-			ret = -EFAULT;
-			goto out;
-		}
+	/* pre-format output fields to sane values */
+	for (i = 0; i < same->dest_count; i++) {
+		same->info[i].bytes_deduped = 0ULL;
+		same->info[i].status = 0;
+	}
 
-		info.bytes_deduped = 0;
+	ret = 0;
+	for (i = 0; i < same->dest_count; i++) {
+		info = &same->info[i];
 
-		dst_file = fget(info.fd);
+		dst_file = fget(info->fd);
 		if (!dst_file) {
-			info.status = -EBADF;
+			info->status = -EBADF;
 			goto next;
 		}
 
 		if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
-			info.status = -EINVAL;
+			info->status = -EINVAL;
 			goto next;
 		}
 
-		info.status = -EXDEV;
+		info->status = -EXDEV;
 		if (file->f_path.mnt != dst_file->f_path.mnt)
 			goto next;
 
@@ -2781,32 +2798,29 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 			goto next;
 
 		if (S_ISDIR(dst->i_mode)) {
-			info.status = -EISDIR;
+			info->status = -EISDIR;
 			goto next;
 		}
 
 		if (!S_ISREG(dst->i_mode)) {
-			info.status = -EACCES;
+			info->status = -EACCES;
 			goto next;
 		}
 
-		info.status = btrfs_extent_same(src, off, len, dst,
-						info.logical_offset);
-		if (info.status == 0)
-			info.bytes_deduped += len;
+		info->status = btrfs_extent_same(src, off, len, dst,
+						info->logical_offset);
+		if (info->status == 0)
+			info->bytes_deduped += len;
 
 next:
 		if (dst_file)
 			fput(dst_file);
-
-		if (__put_user_unaligned(info.status, &args->info[i].status) ||
-		    __put_user_unaligned(info.bytes_deduped,
-					 &args->info[i].bytes_deduped)) {
-			ret = -EFAULT;
-			goto out;
-		}                                                               
 	}
 
+	ret = copy_to_user(argp, same, size);
+	if (ret)
+		ret = -EFAULT;
+
 out:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3310,7 +3324,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	}
 
 	if (!objectid)
-		objectid = root->root_key.objectid;
+		objectid = BTRFS_FS_TREE_OBJECTID;
 
 	location.objectid = objectid;
 	location.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 966b413a33b8..c702cb62f78a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -563,11 +563,10 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root)
 {
 	struct list_head splice, works;
 	struct btrfs_ordered_extent *ordered, *next;
-	struct inode *inode;
 
 	INIT_LIST_HEAD(&splice);
 	INIT_LIST_HEAD(&works);
@@ -580,15 +579,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 					   root_extent_list);
 		list_move_tail(&ordered->root_extent_list,
 			       &root->ordered_extents);
-		/*
-		 * the inode may be getting freed (in sys_unlink path).
-		 */
-		inode = igrab(ordered->inode);
-		if (!inode) {
-			cond_resched_lock(&root->ordered_extent_lock);
-			continue;
-		}
-
 		atomic_inc(&ordered->refs);
 		spin_unlock(&root->ordered_extent_lock);
 
@@ -605,21 +595,13 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 	list_for_each_entry_safe(ordered, next, &works, work_list) {
 		list_del_init(&ordered->work_list);
 		wait_for_completion(&ordered->completion);
-
-		inode = ordered->inode;
 		btrfs_put_ordered_extent(ordered);
-		if (delay_iput)
-			btrfs_add_delayed_iput(inode);
-		else
-			iput(inode);
-
 		cond_resched();
 	}
 	mutex_unlock(&root->fs_info->ordered_operations_mutex);
 }
 
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
-				    int delay_iput)
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
@@ -637,7 +619,7 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
 			       &fs_info->ordered_roots);
 		spin_unlock(&fs_info->ordered_root_lock);
 
-		btrfs_wait_ordered_extents(root, delay_iput);
+		btrfs_wait_ordered_extents(root);
 		btrfs_put_fs_root(root);
 
 		spin_lock(&fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d9a5aa097b4f..0c0b35612d7a 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -195,9 +195,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
-				    int delay_iput);
+void btrfs_wait_ordered_extents(struct btrfs_root *root);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info);
 void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index aacc2121e87c..4a355726151e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -588,7 +588,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 	else
 		key.offset = (u64)-1;
 
-	return btrfs_read_fs_root_no_name(fs_info, &key);
+	return btrfs_get_fs_root(fs_info, &key, false);
 }
 
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -1548,7 +1548,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
 	       btrfs_file_extent_other_encoding(leaf, fi));
 
 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
-		ret = 1;
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -1579,7 +1579,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 	u64 end;
 	u32 nritems;
 	u32 i;
-	int ret;
+	int ret = 0;
 	int first = 1;
 	int dirty = 0;
 
@@ -1642,11 +1642,13 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 
 		ret = get_new_location(rc->data_inode, &new_bytenr,
 				       bytenr, num_bytes);
-		if (ret > 0) {
-			WARN_ON(1);
-			continue;
+		if (ret) {
+			/*
+			 * Don't have to abort since we've not changed anything
+			 * in the file extent yet.
+			 */
+			break;
 		}
-		BUG_ON(ret < 0);
 
 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
 		dirty = 1;
@@ -1656,18 +1658,24 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 					   num_bytes, parent,
 					   btrfs_header_owner(leaf),
 					   key.objectid, key.offset, 1);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
 
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					parent, btrfs_header_owner(leaf),
 					key.objectid, key.offset, 1);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
 	}
 	if (dirty)
 		btrfs_mark_buffer_dirty(leaf);
 	if (inode)
 		btrfs_add_delayed_iput(inode);
-	return 0;
+	return ret;
 }
 
 static noinline_for_stack
@@ -4238,7 +4246,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		err = ret;
 		goto out;
 	}
-	btrfs_wait_all_ordered_extents(fs_info, 0);
+	btrfs_wait_all_ordered_extents(fs_info);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
@@ -4499,19 +4507,19 @@ out:
 	return ret;
 }
 
-void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct extent_buffer *buf,
-			   struct extent_buffer *cow)
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct extent_buffer *cow)
 {
 	struct reloc_control *rc;
 	struct backref_node *node;
 	int first_cow = 0;
 	int level;
-	int ret;
+	int ret = 0;
 
 	rc = root->fs_info->reloc_ctl;
 	if (!rc)
-		return;
+		return 0;
 
 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
@@ -4547,10 +4555,9 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 			rc->nodes_relocated += buf->len;
 	}
 
-	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
 		ret = replace_file_extents(trans, rc, root, cow);
-		BUG_ON(ret);
-	}
+	return ret;
 }
 
 /*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0b1f4ef8db98..ec71ea44d2b4 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -299,11 +299,6 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 			continue;
 		}
 
-		if (btrfs_root_refs(&root->root_item) == 0) {
-			btrfs_add_dead_root(root);
-			continue;
-		}
-
 		err = btrfs_init_fs_root(root);
 		if (err) {
 			btrfs_free_fs_root(root);
@@ -318,6 +313,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 			btrfs_free_fs_root(root);
 			break;
 		}
+
+		if (btrfs_root_refs(&root->root_item) == 0)
+			btrfs_add_dead_root(root);
 	}
 
 	btrfs_free_path(path);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0afcd452fcb3..a18e0e23f6a6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -158,12 +158,20 @@ struct scrub_fixup_nodatasum {
 	int			mirror_num;
 };
 
+struct scrub_nocow_inode {
+	u64			inum;
+	u64			offset;
+	u64			root;
+	struct list_head	list;
+};
+
 struct scrub_copy_nocow_ctx {
 	struct scrub_ctx	*sctx;
 	u64			logical;
 	u64			len;
 	int			mirror_num;
 	u64			physical_for_dev_replace;
+	struct list_head	inodes;
 	struct btrfs_work	work;
 };
 
@@ -245,7 +253,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 static int write_page_nocow(struct scrub_ctx *sctx,
 			    u64 physical_for_dev_replace, struct page *page);
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
-				      void *ctx);
+				      struct scrub_copy_nocow_ctx *ctx);
 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 			    int mirror_num, u64 physical_for_dev_replace);
 static void copy_nocow_pages_worker(struct btrfs_work *work);
@@ -3126,12 +3134,30 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 	nocow_ctx->mirror_num = mirror_num;
 	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
 	nocow_ctx->work.func = copy_nocow_pages_worker;
+	INIT_LIST_HEAD(&nocow_ctx->inodes);
 	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
 			   &nocow_ctx->work);
 
 	return 0;
 }
 
+static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+	struct scrub_nocow_inode *nocow_inode;
+
+	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
+	if (!nocow_inode)
+		return -ENOMEM;
+	nocow_inode->inum = inum;
+	nocow_inode->offset = offset;
+	nocow_inode->root = root;
+	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
+	return 0;
+}
+
+#define COPY_COMPLETE 1
+
 static void copy_nocow_pages_worker(struct btrfs_work *work)
 {
 	struct scrub_copy_nocow_ctx *nocow_ctx =
@@ -3167,8 +3193,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 	}
 
 	ret = iterate_inodes_from_logical(logical, fs_info, path,
-					  copy_nocow_pages_for_inode,
-					  nocow_ctx);
+					  record_inode_for_nocow, nocow_ctx);
 	if (ret != 0 && ret != -ENOENT) {
 		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
 			logical, physical_for_dev_replace, len, mirror_num,
@@ -3177,7 +3202,33 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 		goto out;
 	}
 
+	btrfs_end_transaction(trans, root);
+	trans = NULL;
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
+						 entry->root, nocow_ctx);
+		kfree(entry);
+		if (ret == COPY_COMPLETE) {
+			ret = 0;
+			break;
+		} else if (ret) {
+			break;
+		}
+	}
 out:
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, root);
 	if (not_written)
@@ -3190,20 +3241,25 @@ out:
 	scrub_pending_trans_workers_dec(sctx);
 }
 
-static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      struct scrub_copy_nocow_ctx *nocow_ctx)
 {
-	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
 	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
 	struct btrfs_key key;
 	struct inode *inode;
 	struct page *page;
 	struct btrfs_root *local_root;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *io_tree;
 	u64 physical_for_dev_replace;
-	u64 len;
+	u64 len = nocow_ctx->len;
+	u64 lockstart = offset, lockend = offset + len - 1;
 	unsigned long index;
 	int srcu_index;
-	int ret;
-	int err;
+	int ret = 0;
+	int err = 0;
 
 	key.objectid = root;
 	key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3229,9 +3285,33 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	mutex_lock(&inode->i_mutex);
 	inode_dio_wait(inode);
 
-	ret = 0;
 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
-	len = nocow_ctx->len;
+	io_tree = &BTRFS_I(inode)->io_tree;
+
+	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		goto out_unlock;
+	}
+
+	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock;
+	}
+
+	/*
+	 * This extent does not actually cover the logical extent anymore,
+	 * move on to the next inode.
+	 */
+	if (em->block_start > nocow_ctx->logical ||
+	    em->block_start + em->block_len < nocow_ctx->logical + len) {
+		free_extent_map(em);
+		goto out_unlock;
+	}
+	free_extent_map(em);
+
 	while (len >= PAGE_CACHE_SIZE) {
 		index = offset >> PAGE_CACHE_SHIFT;
 again:
@@ -3247,10 +3327,9 @@ again:
 				goto next_page;
 		} else {
 			ClearPageError(page);
-			err = extent_read_full_page(&BTRFS_I(inode)->
-							 io_tree,
-							page, btrfs_get_extent,
-							nocow_ctx->mirror_num);
+			err = extent_read_full_page_nolock(io_tree, page,
+							   btrfs_get_extent,
+							   nocow_ctx->mirror_num);
 			if (err) {
 				ret = err;
 				goto next_page;
@@ -3264,6 +3343,7 @@ again:
 			 * page in the page cache.
 			 */
 			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
 				page_cache_release(page);
 				goto again;
 			}
@@ -3287,6 +3367,10 @@ next_page:
 		physical_for_dev_replace += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 	}
+	ret = COPY_COMPLETE;
+out_unlock:
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+			     GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
 	iput(inode);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3aab10ce63e8..e913328d0f2a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -921,7 +921,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_wait_all_ordered_extents(fs_info, 1);
+	btrfs_wait_all_ordered_extents(fs_info);
 
 	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
@@ -1340,6 +1340,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (ret)
 			goto restore;
 	} else {
+		if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+			btrfs_err(fs_info,
+				"Remounting read-write after error is not allowed\n");
+			ret = -EINVAL;
+			goto restore;
+		}
 		if (fs_info->fs_devices->rw_devices == 0) {
 			ret = -EACCES;
 			goto restore;
@@ -1377,6 +1383,16 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			pr_warn("btrfs: failed to resume dev_replace\n");
 			goto restore;
 		}
+
+		if (!fs_info->uuid_root) {
+			pr_info("btrfs: creating UUID tree\n");
+			ret = btrfs_create_uuid_tree(fs_info);
+			if (ret) {
+				pr_warn("btrfs: failed to create the uuid tree"
+					"%d\n", ret);
+				goto restore;
+			}
+		}
 		sb->s_flags &= ~MS_RDONLY;
 	}
 out:
@@ -1762,6 +1778,9 @@ static void btrfs_print_info(void)
 #ifdef CONFIG_BTRFS_DEBUG
 			", debug=on"
 #endif
+#ifdef CONFIG_BTRFS_ASSERT
+			", assert=on"
+#endif
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 			", integrity-checker=on"
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cac4a3f76323..8c81bdc1ef9b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1603,7 +1603,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
 	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-		btrfs_wait_all_ordered_extents(fs_info, 1);
+		btrfs_wait_all_ordered_extents(fs_info);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1838,11 +1838,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	assert_qgroups_uptodate(trans);
 	update_super_roots(root);
 
-	if (!root->fs_info->log_root_recovering) {
-		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
-		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
-	}
-
+	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+	btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
 	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
 	       sizeof(*root->fs_info->super_copy));
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0d9613c3f5e5..79f057c0619a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -93,7 +93,8 @@
  */
 #define LOG_WALK_PIN_ONLY 0
 #define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_ALL 2
+#define LOG_WALK_REPLAY_DIR_INDEX 2
+#define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
@@ -393,6 +394,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 		if (inode_item) {
 			struct btrfs_inode_item *item;
 			u64 nbytes;
+			u32 mode;
 
 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 					      struct btrfs_inode_item);
@@ -400,9 +402,19 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 			item = btrfs_item_ptr(eb, slot,
 					      struct btrfs_inode_item);
 			btrfs_set_inode_nbytes(eb, item, nbytes);
+
+			/*
+			 * If this is a directory we need to reset the i_size to
+			 * 0 so that we can set it up properly when replaying
+			 * the rest of the items in this log.
+			 */
+			mode = btrfs_inode_mode(eb, item);
+			if (S_ISDIR(mode))
+				btrfs_set_inode_size(eb, item, 0);
 		}
 	} else if (inode_item) {
 		struct btrfs_inode_item *item;
+		u32 mode;
 
 		/*
 		 * New inode, set nbytes to 0 so that the nbytes comes out
@@ -410,6 +422,15 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 		 */
 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 		btrfs_set_inode_nbytes(eb, item, 0);
+
+		/*
+		 * If this is a directory we need to reset the i_size to 0 so
+		 * that we can set it up properly when replaying the rest of
+		 * the items in this log.
+		 */
+		mode = btrfs_inode_mode(eb, item);
+		if (S_ISDIR(mode))
+			btrfs_set_inode_size(eb, item, 0);
 	}
 insert:
 	btrfs_release_path(path);
@@ -1496,6 +1517,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
 		iput(inode);
 		return -EIO;
 	}
+
 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
 
 	/* FIXME, put inode into FIXUP list */
@@ -1534,6 +1556,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	u8 log_type;
 	int exists;
 	int ret = 0;
+	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
 
 	dir = read_one_inode(root, key->objectid);
 	if (!dir)
@@ -1604,6 +1627,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 		goto insert;
 out:
 	btrfs_release_path(path);
+	if (!ret && update_size) {
+		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
+		ret = btrfs_update_inode(trans, root, dir);
+	}
 	kfree(name);
 	iput(dir);
 	return ret;
@@ -1614,6 +1641,7 @@ insert:
 			      name, name_len, log_type, &log_key);
 	if (ret && ret != -ENOENT)
 		goto out;
+	update_size = false;
 	ret = 0;
 	goto out;
 }
@@ -2027,6 +2055,15 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			if (ret)
 				break;
 		}
+
+		if (key.type == BTRFS_DIR_INDEX_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			if (ret)
+				break;
+		}
+
 		if (wc->stage < LOG_WALK_REPLAY_ALL)
 			continue;
 
@@ -2048,8 +2085,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 						eb, i, &key);
 			if (ret)
 				break;
-		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
-			   key.type == BTRFS_DIR_INDEX_KEY) {
+		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
 			ret = replay_one_dir_item(wc->trans, root, path,
 						  eb, i, &key);
 			if (ret)
@@ -3805,6 +3841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	struct btrfs_root *root;
 	struct dentry *old_parent = NULL;
+	struct inode *orig_inode = inode;
 
 	/*
 	 * for regular files, if its inode is already on disk, we don't
@@ -3824,7 +3861,14 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 	}
 
 	while (1) {
-		BTRFS_I(inode)->logged_trans = trans->transid;
+		/*
+		 * If we are logging a directory then we start with our inode,
+		 * not our parents inode, so we need to skipp setting the
+		 * logged_trans so that further down in the log code we don't
+		 * think this inode has already been logged.
+		 */
+		if (inode != orig_inode)
+			BTRFS_I(inode)->logged_trans = trans->transid;
 		smp_mb();
 
 		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0052ca8264d9..043b215769c2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -796,7 +796,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			fs_devices->rotating = 1;
 
 		fs_devices->open_devices++;
-		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+		if (device->writeable &&
+		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
 			fs_devices->rw_devices++;
 			list_add(&device->dev_alloc_list,
 				 &fs_devices->alloc_list);
@@ -911,9 +912,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	if (disk_super->label[0]) {
 		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
-		printk(KERN_INFO "device label %s ", disk_super->label);
+		printk(KERN_INFO "btrfs: device label %s ", disk_super->label);
 	} else {
-		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
+		printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid);
 	}
 
 	printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
@@ -1715,6 +1716,7 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
 				 struct btrfs_device *srcdev)
 {
 	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+
 	list_del_rcu(&srcdev->dev_list);
 	list_del_rcu(&srcdev->dev_alloc_list);
 	fs_info->fs_devices->num_devices--;
@@ -1724,9 +1726,13 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
 	}
 	if (srcdev->can_discard)
 		fs_info->fs_devices->num_can_discard--;
-	if (srcdev->bdev)
+	if (srcdev->bdev) {
 		fs_info->fs_devices->open_devices--;
 
+		/* zero out the old super */
+		btrfs_scratch_superblock(srcdev);
+	}
+
 	call_rcu(&srcdev->rcu, free_device);
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 4d7433534f5c..6024877335ca 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct buffer_head *bh;
 	sector_t end_block;
 	int ret = 0;		/* Will call free_more_memory() */
+	gfp_t gfp_mask;
 
-	page = find_or_create_page(inode->i_mapping, index,
-		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
+	gfp_mask |= __GFP_MOVABLE;
+	/*
+	 * XXX: __getblk_slow() can not really deal with failure and
+	 * will endlessly loop on improvised global reclaim.  Prefer
+	 * looping in the allocator rather than here, at least that
+	 * code knows what it's doing.
+	 */
+	gfp_mask |= __GFP_NOFAIL;
+
+	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
 	if (!page)
 		return ret;
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 43eb5592cdea..00baf1419989 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -270,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 #endif
 
 	/* delete retired objects */
-	if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
+	if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
 	    _object != cache->cache.fsdef
 	    ) {
 		_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 25badd1aec5c..f4a08d7fa2f7 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -56,7 +56,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 		       object->fscache.cookie->parent,
 		       object->fscache.cookie->netfs_data,
 		       object->fscache.cookie->flags);
-		if (keybuf)
+		if (keybuf && cookie->def)
 			keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
 						      CACHEFILES_KEYBUF_SIZE);
 		else
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 34c88b83e39f..12b0eef84183 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -162,8 +162,9 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
 int cachefiles_check_auxdata(struct cachefiles_object *object)
 {
 	struct cachefiles_xattr *auxbuf;
+	enum fscache_checkaux validity;
 	struct dentry *dentry = object->dentry;
-	unsigned int dlen;
+	ssize_t xlen;
 	int ret;
 
 	ASSERT(dentry);
@@ -174,22 +175,22 @@ int cachefiles_check_auxdata(struct cachefiles_object *object)
 	if (!auxbuf)
 		return -ENOMEM;
 
-	auxbuf->len = vfs_getxattr(dentry, cachefiles_xattr_cache,
-				   &auxbuf->type, 512 + 1);
-	if (auxbuf->len < 1)
-		return -ESTALE;
-
-	if (auxbuf->type != object->fscache.cookie->def->type)
-		return -ESTALE;
+	xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
+			    &auxbuf->type, 512 + 1);
+	ret = -ESTALE;
+	if (xlen < 1 ||
+	    auxbuf->type != object->fscache.cookie->def->type)
+		goto error;
 
-	dlen = auxbuf->len - 1;
-	ret = fscache_check_aux(&object->fscache, &auxbuf->data, dlen);
+	xlen--;
+	validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
+	if (validity != FSCACHE_CHECKAUX_OKAY)
+		goto error;
 
+	ret = 0;
+error:
 	kfree(auxbuf);
-	if (ret != FSCACHE_CHECKAUX_OKAY)
-		return -ESTALE;
-
-	return 0;
+	return ret;
 }
 
 /*
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e0b038..7db2e6ca4b8f 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
 {
 	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
 					      &ceph_fscache_fsid_object_def,
-					      fsc);
+					      fsc, true);
 
 	if (fsc->fscache == NULL) {
 		pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
@@ -204,7 +204,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
 
 	ci->fscache = fscache_acquire_cookie(fsc->fscache,
 					     &ceph_fscache_inode_object_def,
-					     ci);
+					     ci, true);
 done:
 	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a16b4e58bcc6..77fc5e181077 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -120,14 +120,16 @@ cifs_read_super(struct super_block *sb)
 {
 	struct inode *inode;
 	struct cifs_sb_info *cifs_sb;
+	struct cifs_tcon *tcon;
 	int rc = 0;
 
 	cifs_sb = CIFS_SB(sb);
+	tcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
 		sb->s_flags |= MS_POSIXACL;
 
-	if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES)
+	if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
 		sb->s_maxbytes = MAX_LFS_FILESIZE;
 	else
 		sb->s_maxbytes = MAX_NON_LFS;
@@ -147,7 +149,7 @@ cifs_read_super(struct super_block *sb)
 		goto out_no_root;
 	}
 
-	if (cifs_sb_master_tcon(cifs_sb)->nocase)
+	if (tcon->nocase)
 		sb->s_d_op = &cifs_ci_dentry_ops;
 	else
 		sb->s_d_op = &cifs_dentry_ops;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ea723a5e8226..6d0b07217ac9 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.01"
+#define CIFS_VERSION   "2.02"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cfa14c80ef3b..26b1c1dc93f6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -278,6 +278,8 @@ struct smb_version_operations {
 	/* set attributes */
 	int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
 			     const unsigned int);
+	int (*set_compression)(const unsigned int, struct cifs_tcon *,
+			       struct cifsFileInfo *);
 	/* check if we can send an echo or nor */
 	bool (*can_echo)(struct TCP_Server_Info *);
 	/* send echo request */
@@ -547,9 +549,6 @@ struct TCP_Server_Info {
 	unsigned int max_rw;	/* maxRw specifies the maximum */
 	/* message size the server can send or receive for */
 	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
-	unsigned int max_vcs;	/* maximum number of smb sessions, at least
-				   those that can be specified uniquely with
-				   vcnumbers */
 	unsigned int capabilities; /* selective disabling of caps by smb sess */
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u64 CurrentMid;         /* multiplex id - rotating counter */
@@ -623,11 +622,34 @@ set_credits(struct TCP_Server_Info *server, const int val)
 }
 
 static inline __u64
-get_next_mid(struct TCP_Server_Info *server)
+get_next_mid64(struct TCP_Server_Info *server)
 {
 	return server->ops->get_next_mid(server);
 }
 
+static inline __le16
+get_next_mid(struct TCP_Server_Info *server)
+{
+	__u16 mid = get_next_mid64(server);
+	/*
+	 * The value in the SMB header should be little endian for easy
+	 * on-the-wire decoding.
+	 */
+	return cpu_to_le16(mid);
+}
+
+static inline __u16
+get_mid(const struct smb_hdr *smb)
+{
+	return le16_to_cpu(smb->Mid);
+}
+
+static inline bool
+compare_mid(__u16 mid, const struct smb_hdr *smb)
+{
+	return mid == le16_to_cpu(smb->Mid);
+}
+
 /*
  * When the server supports very large reads and writes via POSIX extensions,
  * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
@@ -715,7 +737,6 @@ struct cifs_ses {
 	enum statusEnum status;
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
-	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
@@ -832,6 +853,8 @@ struct cifs_tcon {
 	__u32 maximal_access;
 	__u32 vol_serial_number;
 	__le64 vol_create_time;
+	__u32 ss_flags;		/* sector size flags */
+	__u32 perf_sector_size; /* best sector size for perf */
 #endif /* CONFIG_CIFS_SMB2 */
 #ifdef CONFIG_CIFS_FSCACHE
 	u64 resource_id;		/* server resource id */
@@ -1272,6 +1295,7 @@ struct dfs_info3_param {
 #define CIFS_FATTR_DELETE_PENDING	0x2
 #define CIFS_FATTR_NEED_REVAL		0x4
 #define CIFS_FATTR_INO_COLLISION	0x8
+#define CIFS_FATTR_UNKNOWN_NLINK	0x10
 
 struct cifs_fattr {
 	u32		cf_flags;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 948676db8e2e..9e5ee34de986 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -428,7 +428,7 @@ struct smb_hdr {
 	__u16 Tid;
 	__le16 Pid;
 	__u16 Uid;
-	__u16 Mid;
+	__le16 Mid;
 	__u8 WordCount;
 } __attribute__((packed));
 
@@ -1352,6 +1352,35 @@ typedef struct smb_com_transaction_ioctl_req {
 	__u8 Data[1];
 } __attribute__((packed)) TRANSACT_IOCTL_REQ;
 
+typedef struct smb_com_transaction_compr_ioctl_req {
+	struct smb_hdr hdr;	/* wct = 23 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* four setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 2 = IOCTL/FSCTL */
+	__le32 FunctionCode;
+	__u16 Fid;
+	__u8 IsFsctl;  /* 1 = File System Control 0 = device control (IOCTL) */
+	__u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
+	__le16 ByteCount;
+	__u8 Pad[3];
+	__le16 compression_state;  /* See below for valid flags */
+} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ;
+
+/* compression state flags */
+#define COMPRESSION_FORMAT_NONE		0x0000
+#define COMPRESSION_FORMAT_DEFAULT	0x0001
+#define COMPRESSION_FORMAT_LZNT1	0x0002
+
 typedef struct smb_com_transaction_ioctl_rsp {
 	struct smb_hdr hdr;	/* wct = 19 */
 	__u8 Reserved[3];
@@ -1491,15 +1520,30 @@ struct file_notify_information {
 	__u8  FileName[0];
 } __attribute__((packed));
 
-struct reparse_data {
-	__u32	ReparseTag;
-	__u16	ReparseDataLength;
+/* For IO_REPARSE_TAG_SYMLINK */
+struct reparse_symlink_data {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
 	__u16	Reserved;
-	__u16	SubstituteNameOffset;
-	__u16	SubstituteNameLength;
-	__u16	PrintNameOffset;
-	__u16	PrintNameLength;
-	__u32	Flags;
+	__le16	SubstituteNameOffset;
+	__le16	SubstituteNameLength;
+	__le16	PrintNameOffset;
+	__le16	PrintNameLength;
+	__le32	Flags;
+	char	PathBuffer[0];
+} __attribute__((packed));
+
+/* For IO_REPARSE_TAG_NFS */
+#define NFS_SPECFILE_LNK	0x00000000014B4E4C
+#define NFS_SPECFILE_CHR	0x0000000000524843
+#define NFS_SPECFILE_BLK	0x00000000004B4C42
+#define NFS_SPECFILE_FIFO	0x000000004F464946
+#define NFS_SPECFILE_SOCK	0x000000004B434F53
+struct reparse_posix_data {
+	__le32	ReparseTag;
+	__le16	ReparseDataLength;
+	__u16	Reserved;
+	__le64	InodeType; /* LNK, FIFO, CHR etc. */
 	char	PathBuffer[0];
 } __attribute__((packed));
 
@@ -2200,6 +2244,9 @@ typedef struct {
 	__le32 DeviceCharacteristics;
 } __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
 
+/* minimum includes first three fields, and empty FS Name */
+#define MIN_FS_ATTR_INFO_SIZE 12
+
 typedef struct {
 	__le32 Attributes;
 	__le32 MaxPathNameComponentLength;
@@ -2652,26 +2699,7 @@ typedef struct file_xattr_info {
 } __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
 					      level 0x205 */
 
-
-/* flags for chattr command */
-#define EXT_SECURE_DELETE		0x00000001 /* EXT3_SECRM_FL */
-#define EXT_ENABLE_UNDELETE		0x00000002 /* EXT3_UNRM_FL */
-/* Reserved for compress file 0x4 */
-#define EXT_SYNCHRONOUS			0x00000008 /* EXT3_SYNC_FL */
-#define EXT_IMMUTABLE_FL		0x00000010 /* EXT3_IMMUTABLE_FL */
-#define EXT_OPEN_APPEND_ONLY		0x00000020 /* EXT3_APPEND_FL */
-#define EXT_DO_NOT_BACKUP		0x00000040 /* EXT3_NODUMP_FL */
-#define EXT_NO_UPDATE_ATIME		0x00000080 /* EXT3_NOATIME_FL */
-/* 0x100 through 0x800 reserved for compression flags and are GET-ONLY */
-#define EXT_HASH_TREE_INDEXED_DIR	0x00001000 /* GET-ONLY EXT3_INDEX_FL */
-/* 0x2000 reserved for IMAGIC_FL */
-#define EXT_JOURNAL_THIS_FILE	0x00004000 /* GET-ONLY EXT3_JOURNAL_DATA_FL */
-/* 0x8000 reserved for EXT3_NOTAIL_FL */
-#define EXT_SYNCHRONOUS_DIR		0x00010000 /* EXT3_DIRSYNC_FL */
-#define EXT_TOPDIR			0x00020000 /* EXT3_TOPDIR_FL */
-
-#define EXT_SET_MASK			0x000300FF
-#define EXT_GET_MASK			0x0003DFFF
+/* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */
 
 typedef struct file_chattr_info {
 	__le64	mask; /* list of all possible attribute bits */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b5ec2a268f56..aa3397620342 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -360,6 +360,8 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
 extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 			       __u16 fid, char **symlinkinfo,
 			       const struct nls_table *nls_codepage);
+extern int CIFSSMB_set_compression(const unsigned int xid,
+				   struct cifs_tcon *tcon, __u16 fid);
 extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a3d74fea1623..93b29474714a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -463,7 +463,6 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
 			       cifs_max_pending);
 	set_credits(server, server->maxReq);
 	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
-	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 	/* even though we do not use raw we might as well set this
 	accurately, in case we ever find a need for it */
 	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -3089,7 +3088,8 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 	bool is_unicode;
 	unsigned int sub_len;
 	char *sub_start;
-	struct reparse_data *reparse_buf;
+	struct reparse_symlink_data *reparse_buf;
+	struct reparse_posix_data *posix_buf;
 	__u32 data_offset, data_count;
 	char *end_of_smb;
 
@@ -3138,20 +3138,47 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 		goto qreparse_out;
 	}
 	end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
-	reparse_buf = (struct reparse_data *)
+	reparse_buf = (struct reparse_symlink_data *)
 				((char *)&pSMBr->hdr.Protocol + data_offset);
 	if ((char *)reparse_buf >= end_of_smb) {
 		rc = -EIO;
 		goto qreparse_out;
 	}
-	if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset +
-				reparse_buf->PrintNameLength) > end_of_smb) {
+	if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) {
+		cifs_dbg(FYI, "NFS style reparse tag\n");
+		posix_buf =  (struct reparse_posix_data *)reparse_buf;
+
+		if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) {
+			cifs_dbg(FYI, "unsupported file type 0x%llx\n",
+				 le64_to_cpu(posix_buf->InodeType));
+			rc = -EOPNOTSUPP;
+			goto qreparse_out;
+		}
+		is_unicode = true;
+		sub_len = le16_to_cpu(reparse_buf->ReparseDataLength);
+		if (posix_buf->PathBuffer + sub_len > end_of_smb) {
+			cifs_dbg(FYI, "reparse buf beyond SMB\n");
+			rc = -EIO;
+			goto qreparse_out;
+		}
+		*symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer,
+				sub_len, is_unicode, nls_codepage);
+		goto qreparse_out;
+	} else if (reparse_buf->ReparseTag !=
+			cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) {
+		rc = -EOPNOTSUPP;
+		goto qreparse_out;
+	}
+
+	/* Reparse tag is NTFS symlink */
+	sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) +
+				reparse_buf->PathBuffer;
+	sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength);
+	if (sub_start + sub_len > end_of_smb) {
 		cifs_dbg(FYI, "reparse buf beyond SMB\n");
 		rc = -EIO;
 		goto qreparse_out;
 	}
-	sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer;
-	sub_len = reparse_buf->SubstituteNameLength;
 	if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
 		is_unicode = true;
 	else
@@ -3172,6 +3199,60 @@ qreparse_out:
 	return rc;
 }
 
+int
+CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		    __u16 fid)
+{
+	int rc = 0;
+	int bytes_returned;
+	struct smb_com_transaction_compr_ioctl_req *pSMB;
+	struct smb_com_transaction_ioctl_rsp *pSMBr;
+
+	cifs_dbg(FYI, "Set compression for %u\n", fid);
+	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+	pSMB->TotalParameterCount = 0;
+	pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+	pSMB->MaxParameterCount = 0;
+	pSMB->MaxDataCount = 0;
+	pSMB->MaxSetupCount = 4;
+	pSMB->Reserved = 0;
+	pSMB->ParameterOffset = 0;
+	pSMB->DataCount = __constant_cpu_to_le32(2);
+	pSMB->DataOffset =
+		cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
+				compression_state) - 4);  /* 84 */
+	pSMB->SetupCount = 4;
+	pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+	pSMB->ParameterCount = 0;
+	pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+	pSMB->IsFsctl = 1; /* FSCTL */
+	pSMB->IsRootFlag = 0;
+	pSMB->Fid = fid; /* file handle always le */
+	/* 3 byte pad, followed by 2 byte compress state */
+	pSMB->ByteCount = __constant_cpu_to_le16(5);
+	inc_rfc1001_len(pSMB, 5);
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc)
+		cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc);
+
+	cifs_buf_release(pSMB);
+
+	/*
+	 * Note: On -EAGAIN error only caller can retry on handle based calls
+	 * since file handle passed in no longer valid.
+	 */
+	return rc;
+}
+
+
 #ifdef CONFIG_CIFS_POSIX
 
 /*Convert an Access Control Entry from wire format to local POSIX xattr format*/
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a279ffc0bc29..62a55147400a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2242,6 +2242,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		if (ses->status == CifsExiting)
+			continue;
 		if (!match_session(ses, vol))
 			continue;
 		++ses->ses_count;
@@ -2255,24 +2257,37 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 static void
 cifs_put_smb_ses(struct cifs_ses *ses)
 {
-	unsigned int xid;
+	unsigned int rc, xid;
 	struct TCP_Server_Info *server = ses->server;
 
 	cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
+
 	spin_lock(&cifs_tcp_ses_lock);
+	if (ses->status == CifsExiting) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
 	if (--ses->ses_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
-
-	list_del_init(&ses->smb_ses_list);
+	if (ses->status == CifsGood)
+		ses->status = CifsExiting;
 	spin_unlock(&cifs_tcp_ses_lock);
 
-	if (ses->status == CifsGood && server->ops->logoff) {
+	if (ses->status == CifsExiting && server->ops->logoff) {
 		xid = get_xid();
-		server->ops->logoff(xid, ses);
+		rc = server->ops->logoff(xid, ses);
+		if (rc)
+			cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+				__func__, rc);
 		_free_xid(xid);
 	}
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_del_init(&ses->smb_ses_list);
+	spin_unlock(&cifs_tcp_ses_lock);
+
 	sesInfoFree(ses);
 	cifs_put_tcp_session(server);
 }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d3e2eaa503a6..5384c2a640ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -500,6 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 		if (server->ops->close)
 			server->ops->close(xid, tcon, &fid);
 		cifs_del_pending_open(&open);
+		fput(file);
 		rc = -ENOMEM;
 	}
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index eb955b525e55..7ddddf2e2504 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3254,6 +3254,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	/*
 	 * Reads as many pages as possible from fscache. Returns -ENOBUFS
 	 * immediately if the cookie is negative
+	 *
+	 * After this point, every page in the list might have PG_fscache set,
+	 * so we will need to clean that up off of every page we don't use.
 	 */
 	rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
 					 &num_pages);
@@ -3376,6 +3379,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		kref_put(&rdata->refcount, cifs_readdata_release);
 	}
 
+	/* Any pages that have been shown to fscache but didn't get added to
+	 * the pagecache must be uncached before they get returned to the
+	 * allocator.
+	 */
+	cifs_fscache_readpages_cancel(mapping->host, page_list);
 	return rc;
 }
 
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 2f4bc5a58054..8d4b7bc8ae91 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
 {
 	server->fscache =
 		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
-				&cifs_fscache_server_index_def, server);
+				&cifs_fscache_server_index_def, server, true);
 	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
 		 __func__, server, server->fscache);
 }
@@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
 
 	tcon->fscache =
 		fscache_acquire_cookie(server->fscache,
-				&cifs_fscache_super_index_def, tcon);
+				&cifs_fscache_super_index_def, tcon, true);
 	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
 		 __func__, server->fscache, tcon->fscache);
 }
@@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
 		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
-				&cifs_fscache_inode_object_def, cifsi);
+				&cifs_fscache_inode_object_def, cifsi, true);
 		cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
 			 __func__, tcon->fscache, cifsi->fscache);
 	}
@@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 		cifsi->fscache = fscache_acquire_cookie(
 					cifs_sb_master_tcon(cifs_sb)->fscache,
 					&cifs_fscache_inode_object_def,
-					cifsi);
+					cifsi, true);
 		cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
 			 __func__, cifsi->fscache, old);
 	}
@@ -223,6 +223,13 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
 		fscache_uncache_page(CIFS_I(inode)->fscache, page);
 }
 
+void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
+{
+	cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n",
+		 __func__, CIFS_I(inode)->fscache, inode);
+	fscache_readpages_cancel(CIFS_I(inode)->fscache, pages);
+}
+
 void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 63539323e0b9..24794b6cd8ec 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -54,6 +54,7 @@ extern int __cifs_readpages_from_fscache(struct inode *,
 					 struct address_space *,
 					 struct list_head *,
 					 unsigned *);
+extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *);
 
 extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
 
@@ -91,6 +92,13 @@ static inline void cifs_readpage_to_fscache(struct inode *inode,
 		__cifs_readpage_to_fscache(inode, page);
 }
 
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+	if (CIFS_I(inode)->fscache)
+		return __cifs_fscache_readpages_cancel(inode, pages);
+}
+
 #else /* CONFIG_CIFS_FSCACHE */
 static inline int cifs_fscache_register(void) { return 0; }
 static inline void cifs_fscache_unregister(void) {}
@@ -131,6 +139,11 @@ static inline int cifs_readpages_from_fscache(struct inode *inode,
 static inline void cifs_readpage_to_fscache(struct inode *inode,
 			struct page *page) {}
 
+static inline void cifs_fscache_readpages_cancel(struct inode *inode,
+						 struct list_head *pages)
+{
+}
+
 #endif /* CONFIG_CIFS_FSCACHE */
 
 #endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f9ff9c173f78..867b7cdc794a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -120,6 +120,33 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 	cifs_i->invalid_mapping = true;
 }
 
+/*
+ * copy nlink to the inode, unless it wasn't provided.  Provide
+ * sane values if we don't have an existing one and none was provided
+ */
+static void
+cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+{
+	/*
+	 * if we're in a situation where we can't trust what we
+	 * got from the server (readdir, some non-unix cases)
+	 * fake reasonable values
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
+		/* only provide fake values on a new inode */
+		if (inode->i_state & I_NEW) {
+			if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
+				set_nlink(inode, 2);
+			else
+				set_nlink(inode, 1);
+		}
+		return;
+	}
+
+	/* we trust the server, so update it */
+	set_nlink(inode, fattr->cf_nlink);
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -134,7 +161,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
 	inode->i_rdev = fattr->cf_rdev;
-	set_nlink(inode, fattr->cf_nlink);
+	cifs_nlink_fattr_to_inode(inode, fattr);
 	inode->i_uid = fattr->cf_uid;
 	inode->i_gid = fattr->cf_gid;
 
@@ -541,6 +568,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
 	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
+	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
@@ -548,7 +576,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		 * Server can return wrong NumberOfLinks value for directories
 		 * when Unix extensions are disabled - fake it.
 		 */
-		fattr->cf_nlink = 2;
+		if (!tcon->unix_ext)
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
 	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
 		fattr->cf_mode = S_IFLNK;
 		fattr->cf_dtype = DT_LNK;
@@ -561,11 +590,15 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		if (fattr->cf_cifsattrs & ATTR_READONLY)
 			fattr->cf_mode &= ~(S_IWUGO);
 
-		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
-		if (fattr->cf_nlink < 1) {
-			cifs_dbg(1, "replacing bogus file nlink value %u\n",
+		/*
+		 * Don't accept zero nlink from non-unix servers unless
+		 * delete is pending.  Instead mark it as unknown.
+		 */
+		if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
+		    !info->DeletePending) {
+			cifs_dbg(1, "bogus file nlink value %u\n",
 				fattr->cf_nlink);
-			fattr->cf_nlink = 1;
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
 		}
 	}
 
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 3e0845585853..ba54bf6ab116 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with io control
  *
- *   Copyright (C) International Business Machines  Corp., 2005,2007
+ *   Copyright (C) International Business Machines  Corp., 2005,2013
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -34,13 +34,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	int rc = -ENOTTY; /* strange error - but the precedent */
 	unsigned int xid;
 	struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_POSIX
 	struct cifsFileInfo *pSMBFile = filep->private_data;
 	struct cifs_tcon *tcon;
 	__u64	ExtAttrBits = 0;
-	__u64	ExtAttrMask = 0;
 	__u64   caps;
-#endif /* CONFIG_CIFS_POSIX */
 
 	xid = get_xid();
 
@@ -49,13 +46,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	cifs_sb = CIFS_SB(inode->i_sb);
 
 	switch (command) {
-#ifdef CONFIG_CIFS_POSIX
 		case FS_IOC_GETFLAGS:
 			if (pSMBFile == NULL)
 				break;
 			tcon = tlink_tcon(pSMBFile->tlink);
 			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+#ifdef CONFIG_CIFS_POSIX
 			if (CIFS_UNIX_EXTATTR_CAP & caps) {
+				__u64	ExtAttrMask = 0;
 				rc = CIFSGetExtAttr(xid, tcon,
 						    pSMBFile->fid.netfid,
 						    &ExtAttrBits, &ExtAttrMask);
@@ -63,29 +61,50 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 					rc = put_user(ExtAttrBits &
 						FS_FL_USER_VISIBLE,
 						(int __user *)arg);
+				if (rc != EOPNOTSUPP)
+					break;
+			}
+#endif /* CONFIG_CIFS_POSIX */
+			rc = 0;
+			if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
+				/* add in the compressed bit */
+				ExtAttrBits = FS_COMPR_FL;
+				rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE,
+					      (int __user *)arg);
 			}
 			break;
-
 		case FS_IOC_SETFLAGS:
 			if (pSMBFile == NULL)
 				break;
 			tcon = tlink_tcon(pSMBFile->tlink);
 			caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
-			if (CIFS_UNIX_EXTATTR_CAP & caps) {
-				if (get_user(ExtAttrBits, (int __user *)arg)) {
-					rc = -EFAULT;
-					break;
-				}
-				/*
-				 * rc = CIFSGetExtAttr(xid, tcon,
-				 *		       pSMBFile->fid.netfid,
-				 *		       extAttrBits,
-				 *		       &ExtAttrMask);
-				 */
+
+			if (get_user(ExtAttrBits, (int __user *)arg)) {
+				rc = -EFAULT;
+				break;
+			}
+
+			/*
+			 * if (CIFS_UNIX_EXTATTR_CAP & caps)
+			 *	rc = CIFSSetExtAttr(xid, tcon,
+			 *		       pSMBFile->fid.netfid,
+			 *		       extAttrBits,
+			 *		       &ExtAttrMask);
+			 * if (rc != EOPNOTSUPP)
+			 *	break;
+			 */
+
+			/* Currently only flag we can set is compressed flag */
+			if ((ExtAttrBits & FS_COMPR_FL) == 0)
+				break;
+
+			/* Try to set compress flag */
+			if (tcon->ses->server->ops->set_compression) {
+				rc = tcon->ses->server->ops->set_compression(
+							xid, tcon, pSMBFile);
+				cifs_dbg(FYI, "set compress flag rc %d\n", rc);
 			}
-			cifs_dbg(FYI, "set flags not implemented yet\n");
 			break;
-#endif /* CONFIG_CIFS_POSIX */
 		default:
 			cifs_dbg(FYI, "unsupported ioctl\n");
 			break;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 138a011633fe..2f9f3790679d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -278,7 +278,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 }
 
 static int
-check_smb_hdr(struct smb_hdr *smb, __u16 mid)
+check_smb_hdr(struct smb_hdr *smb)
 {
 	/* does it have the right SMB "signature" ? */
 	if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
@@ -287,13 +287,6 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 		return 1;
 	}
 
-	/* Make sure that message ids match */
-	if (mid != smb->Mid) {
-		cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n",
-			 smb->Mid, mid);
-		return 1;
-	}
-
 	/* if it's a response then accept */
 	if (smb->Flags & SMBFLG_RESPONSE)
 		return 0;
@@ -302,7 +295,8 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 	if (smb->Command == SMB_COM_LOCKING_ANDX)
 		return 0;
 
-	cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid);
+	cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
+		 get_mid(smb));
 	return 1;
 }
 
@@ -310,7 +304,6 @@ int
 checkSMB(char *buf, unsigned int total_read)
 {
 	struct smb_hdr *smb = (struct smb_hdr *)buf;
-	__u16 mid = smb->Mid;
 	__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
 	__u32 clc_len;  /* calculated length */
 	cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
@@ -348,7 +341,7 @@ checkSMB(char *buf, unsigned int total_read)
 	}
 
 	/* otherwise, there is enough to get to the BCC */
-	if (check_smb_hdr(smb, mid))
+	if (check_smb_hdr(smb))
 		return -EIO;
 	clc_len = smbCalcSize(smb);
 
@@ -359,6 +352,7 @@ checkSMB(char *buf, unsigned int total_read)
 	}
 
 	if (4 + rfclen != clc_len) {
+		__u16 mid = get_mid(smb);
 		/* check if bcc wrapped around for large read responses */
 		if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
 			/* check if lengths match mod 64K */
@@ -366,11 +360,11 @@ checkSMB(char *buf, unsigned int total_read)
 				return 0; /* bcc wrapped */
 		}
 		cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
-			 clc_len, 4 + rfclen, smb->Mid);
+			 clc_len, 4 + rfclen, mid);
 
 		if (4 + rfclen < clc_len) {
 			cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
-				 rfclen, smb->Mid);
+				 rfclen, mid);
 			return -EIO;
 		} else if (rfclen > clc_len + 512) {
 			/*
@@ -383,7 +377,7 @@ checkSMB(char *buf, unsigned int total_read)
 			 * data to 512 bytes.
 			 */
 			cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
-				 rfclen, smb->Mid);
+				 rfclen, mid);
 			return -EIO;
 		}
 	}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index af847e1cf1c1..651a5279607b 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -780,7 +780,9 @@ static const struct {
 	ERRDOS, ERRnoaccess, 0xc0000290}, {
 	ERRDOS, ERRbadfunc, 0xc000029c}, {
 	ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
-	ERRDOS, ERRinvlevel, 0x007c0001}, };
+	ERRDOS, ERRinvlevel, 0x007c0001}, {
+	0, 0, 0 }
+};
 
 /*****************************************************************************
  Print an error message from the status code
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 42ef03be089f..53a75f3d0179 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -180,6 +180,9 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 		fattr->cf_dtype = DT_REG;
 	}
 
+	/* non-unix readdir doesn't provide nlink */
+	fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+
 	if (fattr->cf_cifsattrs & ATTR_READONLY)
 		fattr->cf_mode &= ~S_IWUGO;
 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f99b7f19e78..e87387dbf39f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,88 +32,6 @@
 #include <linux/slab.h>
 #include "cifs_spnego.h"
 
-/*
- * Checks if this is the first smb session to be reconnected after
- * the socket has been reestablished (so we know whether to use vc 0).
- * Called while holding the cifs_tcp_ses_lock, so do not block
- */
-static bool is_first_ses_reconnect(struct cifs_ses *ses)
-{
-	struct list_head *tmp;
-	struct cifs_ses *tmp_ses;
-
-	list_for_each(tmp, &ses->server->smb_ses_list) {
-		tmp_ses = list_entry(tmp, struct cifs_ses,
-				     smb_ses_list);
-		if (tmp_ses->need_reconnect == false)
-			return false;
-	}
-	/* could not find a session that was already connected,
-	   this must be the first one we are reconnecting */
-	return true;
-}
-
-/*
- *	vc number 0 is treated specially by some servers, and should be the
- *      first one we request.  After that we can use vcnumbers up to maxvcs,
- *	one for each smb session (some Windows versions set maxvcs incorrectly
- *	so maxvc=1 can be ignored).  If we have too many vcs, we can reuse
- *	any vc but zero (some servers reset the connection on vcnum zero)
- *
- */
-static __le16 get_next_vcnum(struct cifs_ses *ses)
-{
-	__u16 vcnum = 0;
-	struct list_head *tmp;
-	struct cifs_ses *tmp_ses;
-	__u16 max_vcs = ses->server->max_vcs;
-	__u16 i;
-	int free_vc_found = 0;
-
-	/* Quoting the MS-SMB specification: "Windows-based SMB servers set this
-	field to one but do not enforce this limit, which allows an SMB client
-	to establish more virtual circuits than allowed by this value ... but
-	other server implementations can enforce this limit." */
-	if (max_vcs < 2)
-		max_vcs = 0xFFFF;
-
-	spin_lock(&cifs_tcp_ses_lock);
-	if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
-			goto get_vc_num_exit;  /* vcnum will be zero */
-	for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
-		if (i == 0) /* this is the only connection, use vc 0 */
-			break;
-
-		free_vc_found = 1;
-
-		list_for_each(tmp, &ses->server->smb_ses_list) {
-			tmp_ses = list_entry(tmp, struct cifs_ses,
-					     smb_ses_list);
-			if (tmp_ses->vcnum == i) {
-				free_vc_found = 0;
-				break; /* found duplicate, try next vcnum */
-			}
-		}
-		if (free_vc_found)
-			break; /* we found a vcnumber that will work - use it */
-	}
-
-	if (i == 0)
-		vcnum = 0; /* for most common case, ie if one smb session, use
-			      vc zero.  Also for case when no free vcnum, zero
-			      is safest to send (some clients only send zero) */
-	else if (free_vc_found == 0)
-		vcnum = 1;  /* we can not reuse vc=0 safely, since some servers
-				reset all uids on that, but 1 is ok. */
-	else
-		vcnum = i;
-	ses->vcnum = vcnum;
-get_vc_num_exit:
-	spin_unlock(&cifs_tcp_ses_lock);
-
-	return cpu_to_le16(vcnum);
-}
-
 static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 {
 	__u32 capabilities = 0;
@@ -128,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 					CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4,
 					USHRT_MAX));
 	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-	pSMB->req.VcNumber = get_next_vcnum(ses);
+	pSMB->req.VcNumber = __constant_cpu_to_le16(1);
 
 	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
 
@@ -582,9 +500,9 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
 				return NTLMv2;
 			if (global_secflags & CIFSSEC_MAY_NTLM)
 				return NTLM;
-			/* Fallthrough */
 		default:
-			return Unspecified;
+			/* Fallthrough to attempt LANMAN authentication next */
+			break;
 		}
 	case CIFS_NEGFLAVOR_LANMAN:
 		switch (requested) {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 8233b174de3d..384cffe42850 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -67,7 +67,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
 	mutex_unlock(&server->srv_mutex);
 
 	cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
-		 in_buf->Mid, rc);
+		 get_mid(in_buf), rc);
 
 	return rc;
 }
@@ -101,7 +101,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
 
 	spin_lock(&GlobalMid_Lock);
 	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
-		if (mid->mid == buf->Mid &&
+		if (compare_mid(mid->mid, buf) &&
 		    mid->mid_state == MID_REQUEST_SUBMITTED &&
 		    le16_to_cpu(mid->command) == buf->Command) {
 			spin_unlock(&GlobalMid_Lock);
@@ -807,6 +807,13 @@ out:
 }
 
 static int
+cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid);
+}
+
+static int
 cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		     const char *path, struct cifs_sb_info *cifs_sb,
 		     struct cifs_fid *fid, __u16 search_flags,
@@ -956,6 +963,7 @@ struct smb_version_operations smb1_operations = {
 	.set_path_size = CIFSSMBSetEOF,
 	.set_file_size = CIFSSMBSetFileSize,
 	.set_file_info = smb_set_file_info,
+	.set_compression = cifs_set_compression,
 	.echo = CIFSSMBEcho,
 	.mkdir = CIFSSMBMkDir,
 	.mkdir_setinfo = cifs_mkdir_setinfo,
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 861b33214144..c571be8cb76e 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -209,6 +209,94 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	return rsize;
 }
 
+#ifdef CONFIG_CIFS_STATS2
+static int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	unsigned int ret_data_len = 0;
+	struct network_interface_info_ioctl_rsp *out_buf;
+
+	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+			FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+			NULL /* no data input */, 0 /* no data input */,
+			(char **)&out_buf, &ret_data_len);
+
+	if ((rc == 0)  && (ret_data_len > 0)) {
+		/* Dump info on first interface */
+		cifs_dbg(FYI, "Adapter Capability 0x%x\t",
+			le32_to_cpu(out_buf->Capability));
+		cifs_dbg(FYI, "Link Speed %lld\n",
+			le64_to_cpu(out_buf->LinkSpeed));
+	} else
+		cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+
+	return rc;
+}
+#endif /* STATS2 */
+
+static void
+smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return;
+
+#ifdef CONFIG_CIFS_STATS2
+	SMB3_request_interfaces(xid, tcon);
+#endif /* STATS2 */
+
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_ATTRIBUTE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_DEVICE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return;
+}
+
+static void
+smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc;
+	__le16 srch_path = 0; /* Null - open root of share */
+	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+	if (rc)
+		return;
+
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_ATTRIBUTE_INFORMATION);
+	SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			FS_DEVICE_INFORMATION);
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+	return;
+}
+
 static int
 smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 			struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -304,7 +392,19 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
 		seq_puts(m, " ASYMMETRIC,");
 	if (tcon->capabilities == 0)
 		seq_puts(m, " None");
+	if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE)
+		seq_puts(m, " Aligned,");
+	if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE)
+		seq_puts(m, " Partition Aligned,");
+	if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY)
+		seq_puts(m, " SSD,");
+	if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED)
+		seq_puts(m, " TRIM-support,");
+
 	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+	if (tcon->perf_sector_size)
+		seq_printf(m, "\tOptimal sector size: 0x%x",
+			   tcon->perf_sector_size);
 }
 
 static void
@@ -446,6 +546,14 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
 }
 
 static int
+smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid,
+			    cfile->fid.volatile_fid);
+}
+
+static int
 smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		     const char *path, struct cifs_sb_info *cifs_sb,
 		     struct cifs_fid *fid, __u16 search_flags,
@@ -865,6 +973,7 @@ struct smb_version_operations smb20_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
@@ -874,6 +983,7 @@ struct smb_version_operations smb20_operations = {
 	.set_path_size = smb2_set_path_size,
 	.set_file_size = smb2_set_file_size,
 	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
 	.mkdir = smb2_mkdir,
 	.mkdir_setinfo = smb2_mkdir_setinfo,
 	.rmdir = smb2_rmdir,
@@ -936,6 +1046,7 @@ struct smb_version_operations smb21_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb2_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
@@ -945,6 +1056,7 @@ struct smb_version_operations smb21_operations = {
 	.set_path_size = smb2_set_path_size,
 	.set_file_size = smb2_set_file_size,
 	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
 	.mkdir = smb2_mkdir,
 	.mkdir_setinfo = smb2_mkdir_setinfo,
 	.rmdir = smb2_rmdir,
@@ -1008,6 +1120,7 @@ struct smb_version_operations smb30_operations = {
 	.logoff = SMB2_logoff,
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb3_qfs_tcon,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
@@ -1017,6 +1130,7 @@ struct smb_version_operations smb30_operations = {
 	.set_path_size = smb2_set_path_size,
 	.set_file_size = smb2_set_file_size,
 	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
 	.mkdir = smb2_mkdir,
 	.mkdir_setinfo = smb2_mkdir_setinfo,
 	.rmdir = smb2_rmdir,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index eba0efde66d7..8ab05b0d6778 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -687,6 +687,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 	else
 		return -EIO;
 
+	/* no need to send SMB logoff if uid already closed due to reconnect */
+	if (ses->need_reconnect)
+		goto smb2_session_already_dead;
+
 	rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req);
 	if (rc)
 		return rc;
@@ -701,6 +705,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 	 * No tcon so can't do
 	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
 	 */
+
+smb2_session_already_dead:
 	return rc;
 }
 
@@ -1131,6 +1137,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 
 	cifs_dbg(FYI, "SMB2 IOCTL\n");
 
+	*out_data = NULL;
 	/* zero out returned data len, in case of error */
 	if (plen)
 		*plen = 0;
@@ -1176,11 +1183,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		req->Flags = 0;
 
 	iov[0].iov_base = (char *)req;
-	/* 4 for rfc1002 length field */
-	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
-	if (indatalen)
-		inc_rfc1001_len(req, indatalen);
+	/*
+	 * If no input data, the size of ioctl struct in
+	 * protocol spec still includes a 1 byte data buffer,
+	 * but if input data passed to ioctl, we do not
+	 * want to double count this, so we do not send
+	 * the dummy one byte of data in iovec[0] if sending
+	 * input data (in iovec[1]). We also must add 4 bytes
+	 * in first iovec to allow for rfc1002 length field.
+	 */
+
+	if (indatalen) {
+		iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+		inc_rfc1001_len(req, indatalen - 1);
+	} else
+		iov[0].iov_len = get_rfc1002_length(req) + 4;
+
 
 	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
 	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
@@ -1228,6 +1247,33 @@ ioctl_exit:
 	return rc;
 }
 
+/*
+ *   Individual callers to ioctl worker function follow
+ */
+
+int
+SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid)
+{
+	int rc;
+	char *res_key = NULL;
+	struct  compress_ioctl fsctl_input;
+	char *ret_data = NULL;
+
+	fsctl_input.CompressionState =
+			__constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+			FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+			(char *)&fsctl_input /* data input */,
+			2 /* in data len */, &ret_data /* out data */, NULL);
+
+	cifs_dbg(FYI, "set compression rc %d\n", rc);
+	kfree(res_key);
+
+	return rc;
+}
+
 int
 SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	   u64 persistent_fid, u64 volatile_fid)
@@ -2293,7 +2339,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
-		goto qinf_exit;
+		goto qfsinf_exit;
 	}
 	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
 
@@ -2305,7 +2351,70 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	if (!rc)
 		copy_fs_info_to_kstatfs(info, fsdata);
 
-qinf_exit:
+qfsinf_exit:
+	free_rsp_buf(resp_buftype, iov.iov_base);
+	return rc;
+}
+
+int
+SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+	      u64 persistent_fid, u64 volatile_fid, int level)
+{
+	struct smb2_query_info_rsp *rsp = NULL;
+	struct kvec iov;
+	int rc = 0;
+	int resp_buftype, max_len, min_len;
+	struct cifs_ses *ses = tcon->ses;
+	unsigned int rsp_len, offset;
+
+	if (level == FS_DEVICE_INFORMATION) {
+		max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+		min_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+	} else if (level == FS_ATTRIBUTE_INFORMATION) {
+		max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO);
+		min_len = MIN_FS_ATTR_INFO_SIZE;
+	} else if (level == FS_SECTOR_SIZE_INFORMATION) {
+		max_len = sizeof(struct smb3_fs_ss_info);
+		min_len = sizeof(struct smb3_fs_ss_info);
+	} else {
+		cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level);
+		return -EINVAL;
+	}
+
+	rc = build_qfs_info_req(&iov, tcon, level, max_len,
+				persistent_fid, volatile_fid);
+	if (rc)
+		return rc;
+
+	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+	if (rc) {
+		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+		goto qfsattr_exit;
+	}
+	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+	rsp_len = le32_to_cpu(rsp->OutputBufferLength);
+	offset = le16_to_cpu(rsp->OutputBufferOffset);
+	rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len);
+	if (rc)
+		goto qfsattr_exit;
+
+	if (level == FS_ATTRIBUTE_INFORMATION)
+		memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
+			+ (char *)&rsp->hdr, min_t(unsigned int,
+			rsp_len, max_len));
+	else if (level == FS_DEVICE_INFORMATION)
+		memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
+			+ (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+	else if (level == FS_SECTOR_SIZE_INFORMATION) {
+		struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
+			(4 /* RFC1001 len */ + offset + (char *)&rsp->hdr);
+		tcon->ss_flags = le32_to_cpu(ss_info->Flags);
+		tcon->perf_sector_size =
+			le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
+	}
+
+qfsattr_exit:
 	free_rsp_buf(resp_buftype, iov.iov_base);
 	return rc;
 }
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index b83d0118a757..6183b1b7550f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -569,6 +569,10 @@ struct network_interface_info_ioctl_rsp {
 
 #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
 
+struct compress_ioctl {
+	__le16 CompressionState; /* See cifspdu.h for possible flag values */
+} __packed;
+
 struct smb2_ioctl_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 57 */
@@ -584,7 +588,7 @@ struct smb2_ioctl_req {
 	__le32 MaxOutputResponse;
 	__le32 Flags;
 	__u32  Reserved2;
-	char   Buffer[0];
+	__u8   Buffer[0];
 } __packed;
 
 struct smb2_ioctl_rsp {
@@ -870,14 +874,16 @@ struct smb2_lease_ack {
 
 /* File System Information Classes */
 #define FS_VOLUME_INFORMATION		1 /* Query */
-#define FS_LABEL_INFORMATION		2 /* Set */
+#define FS_LABEL_INFORMATION		2 /* Local only */
 #define FS_SIZE_INFORMATION		3 /* Query */
 #define FS_DEVICE_INFORMATION		4 /* Query */
 #define FS_ATTRIBUTE_INFORMATION	5 /* Query */
 #define FS_CONTROL_INFORMATION		6 /* Query, Set */
 #define FS_FULL_SIZE_INFORMATION	7 /* Query */
 #define FS_OBJECT_ID_INFORMATION	8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION	9 /* Query */
+#define FS_DRIVER_PATH_INFORMATION	9 /* Local only */
+#define FS_VOLUME_FLAGS_INFORMATION	10 /* Local only */
+#define FS_SECTOR_SIZE_INFORMATION	11 /* SMB3 or later. Query */
 
 struct smb2_fs_full_size_info {
 	__le64 TotalAllocationUnits;
@@ -887,6 +893,22 @@ struct smb2_fs_full_size_info {
 	__le32 BytesPerSector;
 } __packed;
 
+#define SSINFO_FLAGS_ALIGNED_DEVICE		0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY		0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED		0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+	__le32 LogicalBytesPerSector;
+	__le32 PhysicalBytesPerSectorForAtomicity;
+	__le32 PhysicalBytesPerSectorForPerf;
+	__le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
+	__le32 Flags;
+	__le32 ByteOffsetForSectorAlignment;
+	__le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
 /* partial list of QUERY INFO levels */
 #define FILE_DIRECTORY_INFORMATION	1
 #define FILE_FULL_DIRECTORY_INFORMATION 2
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index e3fb4801ee96..313813e4c19b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -142,12 +142,16 @@ extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
 extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 			 u64 persistent_fid, u64 volatile_fid,
 			 FILE_BASIC_INFO *buf);
+extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+				u64 persistent_fid, u64 volatile_fid);
 extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 			     const u64 persistent_fid, const u64 volatile_fid,
 			     const __u8 oplock_level);
 extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 			 u64 persistent_file_id, u64 volatile_file_id,
 			 struct kstatfs *FSData);
+extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+			 u64 persistent_file_id, u64 volatile_file_id, int lvl);
 extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
 		     const __u64 persist_fid, const __u64 volatile_fid,
 		     const __u32 pid, const __u64 length, const __u64 offset,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 340abca3aa52..59c748ce872f 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -466,7 +466,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 static inline void
 smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
 {
-	hdr->MessageId = get_next_mid(server);
+	hdr->MessageId = get_next_mid64(server);
 }
 
 static struct mid_q_entry *
@@ -516,13 +516,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
 		return -EAGAIN;
 	}
 
-	if (ses->status != CifsGood) {
-		/* check if SMB2 session is bad because we are setting it up */
+	if (ses->status == CifsNew) {
 		if ((buf->Command != SMB2_SESSION_SETUP) &&
 		    (buf->Command != SMB2_NEGOTIATE))
 			return -EAGAIN;
 		/* else ok - we are setting up session */
 	}
+
+	if (ses->status == CifsExiting) {
+		if (buf->Command != SMB2_LOGOFF)
+			return -EAGAIN;
+		/* else ok - we are shutting down the session */
+	}
+
 	*mid = smb2_mid_entry_alloc(buf, ses->server);
 	if (*mid == NULL)
 		return -ENOMEM;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index d952ee48f4dc..a4b2391fe66e 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -97,9 +97,23 @@
 #define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
 #define FSCTL_SRV_READ_HASH          0x001441BB /* BB add struct */
 
+/* See FSCC 2.1.2.5 */
 #define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
 #define IO_REPARSE_TAG_HSM           0xC0000004
 #define IO_REPARSE_TAG_SIS           0x80000007
+#define IO_REPARSE_TAG_HSM2          0x80000006
+#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005
+/* Used by the DFS filter. See MS-DFSC */
+#define IO_REPARSE_TAG_DFS           0x8000000A
+/* Used by the DFS filter See MS-DFSC */
+#define IO_REPARSE_TAG_DFSR          0x80000012
+#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B
+/* See section MS-FSCC 2.1.2.4 */
+#define IO_REPARSE_TAG_SYMLINK       0xA000000C
+#define IO_REPARSE_TAG_DEDUP         0x80000013
+#define IO_REPARSE_APPXSTREAM	     0xC0000014
+/* NFS symlinks, Win 8/SMB3 and later */
+#define IO_REPARSE_TAG_NFS           0x80000014
 
 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 6fdcb1b4a106..b37570952846 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -58,7 +58,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 		return temp;
 	else {
 		memset(temp, 0, sizeof(struct mid_q_entry));
-		temp->mid = smb_buffer->Mid;	/* always LE */
+		temp->mid = get_mid(smb_buffer);
 		temp->pid = current->pid;
 		temp->command = cpu_to_le16(smb_buffer->Command);
 		cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
@@ -410,8 +410,13 @@ static int
 wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
 		      const int optype)
 {
-	return wait_for_free_credits(server, timeout,
-				server->ops->get_credits_field(server, optype));
+	int *val;
+
+	val = server->ops->get_credits_field(server, optype);
+	/* Since an echo is already inflight, no need to wait to send another */
+	if (*val <= 0 && optype == CIFS_ECHO_OP)
+		return -EAGAIN;
+	return wait_for_free_credits(server, timeout, val);
 }
 
 static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
@@ -426,13 +431,20 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
 		return -EAGAIN;
 	}
 
-	if (ses->status != CifsGood) {
-		/* check if SMB session is bad because we are setting it up */
+	if (ses->status == CifsNew) {
 		if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
 			(in_buf->Command != SMB_COM_NEGOTIATE))
 			return -EAGAIN;
 		/* else ok - we are setting up session */
 	}
+
+	if (ses->status == CifsExiting) {
+		/* check if SMB session is bad because we are setting it up */
+		if (in_buf->Command != SMB_COM_LOGOFF_ANDX)
+			return -EAGAIN;
+		/* else ok - we are shutting down session */
+	}
+
 	*ppmidQ = AllocMidQEntry(in_buf, ses->server);
 	if (*ppmidQ == NULL)
 		return -ENOMEM;
diff --git a/fs/dcache.c b/fs/dcache.c
index 41000305d716..ae6ebb88ceff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -542,7 +542,7 @@ EXPORT_SYMBOL(d_drop);
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static inline struct dentry *
+static struct dentry *
 dentry_kill(struct dentry *dentry, int unlock_on_failure)
 	__releases(dentry->d_lock)
 {
@@ -630,7 +630,8 @@ repeat:
 			goto kill_it;
 	}
 
-	dentry->d_flags |= DCACHE_REFERENCED;
+	if (!(dentry->d_flags & DCACHE_REFERENCED))
+		dentry->d_flags |= DCACHE_REFERENCED;
 	dentry_lru_add(dentry);
 
 	dentry->d_lockref.count--;
@@ -1331,14 +1332,6 @@ rename_retry:
  * list is non-empty and continue searching.
  */
 
-/**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
- *
- * Return true if the parent or its subdirectories contain
- * a mount point
- */
-
 static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
 {
 	int *ret = data;
@@ -1349,6 +1342,13 @@ static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
 	return D_WALK_CONTINUE;
 }
 
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
 int have_submounts(struct dentry *parent)
 {
 	int ret = 0;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c88e355f7635..000eae2782b6 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -408,7 +408,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
 				    struct page *page)
 {
 	return ecryptfs_lower_header_size(crypt_stat) +
-	       (page->index << PAGE_CACHE_SHIFT);
+	       ((loff_t)page->index << PAGE_CACHE_SHIFT);
 }
 
 /**
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7d52806c2119..4725a07f003c 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1149,7 +1149,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	struct ecryptfs_msg_ctx *msg_ctx;
 	struct ecryptfs_message *msg = NULL;
 	char *auth_tok_sig;
-	char *payload;
+	char *payload = NULL;
 	size_t payload_len = 0;
 	int rc;
 
@@ -1203,6 +1203,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	}
 out:
 	kfree(msg);
+	kfree(payload);
 	return rc;
 }
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 473e09da7d02..810c28fb8c3c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,7 +34,6 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
-#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/mman.h>
@@ -1605,8 +1604,7 @@ fetch_events:
 			}
 
 			spin_unlock_irqrestore(&ep->lock, flags);
-			if (!freezable_schedule_hrtimeout_range(to, slack,
-								HRTIMER_MODE_ABS))
+			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 				timed_out = 1;
 
 			spin_lock_irqsave(&ep->lock, flags);
diff --git a/fs/exec.c b/fs/exec.c
index 8875dd10ae7a..2ea437e5acf4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	acct_update_integrals(current);
+	task_numa_free(current);
 	free_bprm(bprm);
 	if (displaced)
 		put_files_struct(displaced);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1194b1f0f839..f8cde46de9cd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1783,7 +1783,7 @@ retry:
 		d_tmpfile(dentry, inode);
 		err = ext3_orphan_add(handle, inode);
 		if (err)
-			goto err_drop_inode;
+			goto err_unlock_inode;
 		mark_inode_dirty(inode);
 		unlock_new_inode(inode);
 	}
@@ -1791,10 +1791,9 @@ retry:
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
-err_drop_inode:
+err_unlock_inode:
 	ext3_journal_stop(handle);
 	unlock_new_inode(inode);
-	iput(inode);
 	return err;
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d424d7ac02b..e274e9c1171f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2563,7 +2563,7 @@ retry:
 			break;
 	}
 	blk_finish_plug(&plug);
-	if (!ret && !cycled) {
+	if (!ret && !cycled && wbc->nr_to_write > 0) {
 		cycled = 1;
 		mpd.last_page = writeback_index - 1;
 		mpd.first_page = 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1bec5a5c1e45..5a0408d7b114 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2319,7 +2319,7 @@ retry:
 		d_tmpfile(dentry, inode);
 		err = ext4_orphan_add(handle, inode);
 		if (err)
-			goto err_drop_inode;
+			goto err_unlock_inode;
 		mark_inode_dirty(inode);
 		unlock_new_inode(inode);
 	}
@@ -2328,10 +2328,9 @@ retry:
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
-err_drop_inode:
+err_unlock_inode:
 	ext4_journal_stop(handle);
 	unlock_new_inode(inode);
-	iput(inode);
 	return err;
 }
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c081e34f717f..03e9bebba198 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1350,6 +1350,8 @@ retry:
 				    s_min_extra_isize) {
 					tried_min_extra_isize++;
 					new_extra_isize = s_min_extra_isize;
+					kfree(is); is = NULL;
+					kfree(bs); bs = NULL;
 					goto retry;
 				}
 				error = -1;
diff --git a/fs/file_table.c b/fs/file_table.c
index abdd15ad13c9..e900ca518635 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -297,7 +297,7 @@ void flush_delayed_fput(void)
 	delayed_fput(NULL);
 }
 
-static DECLARE_WORK(delayed_fput_work, delayed_fput);
+static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
 void fput(struct file *file)
 {
@@ -317,7 +317,7 @@ void fput(struct file *file)
 		}
 
 		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
-			schedule_work(&delayed_fput_work);
+			schedule_delayed_work(&delayed_fput_work, 1);
 	}
 }
 
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 318e8433527c..29d7feb62cf7 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie)
 struct fscache_cookie *__fscache_acquire_cookie(
 	struct fscache_cookie *parent,
 	const struct fscache_cookie_def *def,
-	void *netfs_data)
+	void *netfs_data,
+	bool enable)
 {
 	struct fscache_cookie *cookie;
 
 	BUG_ON(!def);
 
-	_enter("{%s},{%s},%p",
+	_enter("{%s},{%s},%p,%u",
 	       parent ? (char *) parent->def->name : "<no-parent>",
-	       def->name, netfs_data);
+	       def->name, netfs_data, enable);
 
 	fscache_stat(&fscache_n_acquires);
 
@@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
 	cookie->def		= def;
 	cookie->parent		= parent;
 	cookie->netfs_data	= netfs_data;
-	cookie->flags		= 0;
+	cookie->flags		= (1 << FSCACHE_COOKIE_NO_DATA_YET);
 
 	/* radix tree insertion won't use the preallocation pool unless it's
 	 * told it may not wait */
@@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie(
 		break;
 	}
 
-	/* if the object is an index then we need do nothing more here - we
-	 * create indices on disk when we need them as an index may exist in
-	 * multiple caches */
-	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
-		if (fscache_acquire_non_index_cookie(cookie) < 0) {
-			atomic_dec(&parent->n_children);
-			__fscache_cookie_put(cookie);
-			fscache_stat(&fscache_n_acquires_nobufs);
-			_leave(" = NULL");
-			return NULL;
+	if (enable) {
+		/* if the object is an index then we need do nothing more here
+		 * - we create indices on disk when we need them as an index
+		 * may exist in multiple caches */
+		if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+			if (fscache_acquire_non_index_cookie(cookie) == 0) {
+				set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+			} else {
+				atomic_dec(&parent->n_children);
+				__fscache_cookie_put(cookie);
+				fscache_stat(&fscache_n_acquires_nobufs);
+				_leave(" = NULL");
+				return NULL;
+			}
+		} else {
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
 		}
 	}
 
@@ -144,6 +151,39 @@ struct fscache_cookie *__fscache_acquire_cookie(
 EXPORT_SYMBOL(__fscache_acquire_cookie);
 
 /*
+ * Enable a cookie to permit it to accept new operations.
+ */
+void __fscache_enable_cookie(struct fscache_cookie *cookie,
+			     bool (*can_enable)(void *data),
+			     void *data)
+{
+	_enter("%p", cookie);
+
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+	if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock;
+
+	if (can_enable && !can_enable(data)) {
+		/* The netfs decided it didn't want to enable after all */
+	} else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+		/* Wait for outstanding disablement to complete */
+		__fscache_wait_on_invalidate(cookie);
+
+		if (fscache_acquire_non_index_cookie(cookie) == 0)
+			set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	} else {
+		set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+	}
+
+out_unlock:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+}
+EXPORT_SYMBOL(__fscache_enable_cookie);
+
+/*
  * acquire a non-index cookie
  * - this must make sure the index chain is instantiated and instantiate the
  *   object representation too
@@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	_enter("");
 
-	cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 	/* now we need to see whether the backing objects for this cookie yet
 	 * exist, if not there'll be nothing to search */
@@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	_debug("cache %s", cache->tag->name);
 
-	cookie->flags =
-		(1 << FSCACHE_COOKIE_LOOKING_UP) |
-		(1 << FSCACHE_COOKIE_NO_DATA_YET);
+	set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 
 	/* ask the cache to allocate objects for this cookie and its parent
 	 * chain */
@@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
 	if (!hlist_empty(&cookie->backing_objects)) {
 		spin_lock(&cookie->lock);
 
-		if (!hlist_empty(&cookie->backing_objects) &&
+		if (fscache_cookie_enabled(cookie) &&
+		    !hlist_empty(&cookie->backing_objects) &&
 		    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
 				      &cookie->flags)) {
 			object = hlist_entry(cookie->backing_objects.first,
@@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 
 	spin_lock(&cookie->lock);
 
-	/* update the index entry on disk in each cache backing this cookie */
-	hlist_for_each_entry(object,
-			     &cookie->backing_objects, cookie_link) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+	if (fscache_cookie_enabled(cookie)) {
+		/* update the index entry on disk in each cache backing this
+		 * cookie.
+		 */
+		hlist_for_each_entry(object,
+				     &cookie->backing_objects, cookie_link) {
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+		}
 	}
 
 	spin_unlock(&cookie->lock);
@@ -464,28 +507,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 EXPORT_SYMBOL(__fscache_update_cookie);
 
 /*
- * release a cookie back to the cache
- * - the object will be marked as recyclable on disk if retire is true
- * - all dependents of this cookie must have already been unregistered
- *   (indices/files/pages)
+ * Disable a cookie to stop it from accepting new requests from the netfs.
  */
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 {
 	struct fscache_object *object;
+	bool awaken = false;
 
-	fscache_stat(&fscache_n_relinquishes);
-	if (retire)
-		fscache_stat(&fscache_n_relinquishes_retire);
-
-	if (!cookie) {
-		fscache_stat(&fscache_n_relinquishes_null);
-		_leave(" [no cookie]");
-		return;
-	}
-
-	_enter("%p{%s,%p,%d},%d",
-	       cookie, cookie->def->name, cookie->netfs_data,
-	       atomic_read(&cookie->n_active), retire);
+	_enter("%p,%u", cookie, invalidate);
 
 	ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
 
@@ -495,24 +524,82 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 		BUG();
 	}
 
-	/* No further netfs-accessing operations on this cookie permitted */
-	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
-	if (retire)
-		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+	wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+			 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+	if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+		goto out_unlock_enable;
+
+	/* If the cookie is being invalidated, wait for that to complete first
+	 * so that we can reuse the flag.
+	 */
+	__fscache_wait_on_invalidate(cookie);
+
+	/* Dispose of the backing objects */
+	set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags);
 
 	spin_lock(&cookie->lock);
-	hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+	if (!hlist_empty(&cookie->backing_objects)) {
+		hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+			if (invalidate)
+				set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
+	} else {
+		if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+			awaken = true;
 	}
 	spin_unlock(&cookie->lock);
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
 
 	/* Wait for cessation of activity requiring access to the netfs (when
-	 * n_active reaches 0).
+	 * n_active reaches 0).  This makes sure outstanding reads and writes
+	 * have completed.
 	 */
 	if (!atomic_dec_and_test(&cookie->n_active))
 		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
 				 TASK_UNINTERRUPTIBLE);
 
+	/* Reset the cookie state if it wasn't relinquished */
+	if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
+		atomic_inc(&cookie->n_active);
+		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+	}
+
+out_unlock_enable:
+	clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+	wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_disable_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ *   (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+{
+	fscache_stat(&fscache_n_relinquishes);
+	if (retire)
+		fscache_stat(&fscache_n_relinquishes_retire);
+
+	if (!cookie) {
+		fscache_stat(&fscache_n_relinquishes_null);
+		_leave(" [no cookie]");
+		return;
+	}
+
+	_enter("%p{%s,%p,%d},%d",
+	       cookie, cookie->def->name, cookie->netfs_data,
+	       atomic_read(&cookie->n_active), retire);
+
+	/* No further netfs-accessing operations on this cookie permitted */
+	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+
+	__fscache_disable_cookie(cookie, retire);
+
 	/* Clear pointers back to the netfs */
 	cookie->netfs_data	= NULL;
 	cookie->def		= NULL;
@@ -568,6 +655,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,", cookie);
@@ -586,11 +674,13 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	fscache_operation_init(op, NULL, NULL);
 	op->flags = FSCACHE_OP_MYTHREAD |
-		(1 << FSCACHE_OP_WAITING);
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto inconsistent;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -599,7 +689,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
 
-	atomic_inc(&cookie->n_active);
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, op) < 0)
 		goto submit_failed;
 
@@ -621,9 +711,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	return ret;
 
 submit_failed:
-	atomic_dec(&cookie->n_active);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 inconsistent:
 	spin_unlock(&cookie->lock);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	kfree(op);
 	_leave(" = -ESTALE");
 	return -ESTALE;
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 10a2ade0bdf8..5a117df2a9ef 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = {
 	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
 	.backing_objects = HLIST_HEAD_INIT,
 	.def		= &fscache_fsdef_index_def,
+	.flags		= 1 << FSCACHE_COOKIE_ENABLED,
 };
 EXPORT_SYMBOL(fscache_fsdef_index);
 
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index b1bb6117473a..989f39401547 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent		= &fscache_fsdef_index;
 	netfs->primary_index->netfs_data	= netfs;
+	netfs->primary_index->flags		= 1 << FSCACHE_COOKIE_ENABLED;
 
 	atomic_inc(&netfs->primary_index->parent->usage);
 	atomic_inc(&netfs->primary_index->parent->n_children);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 86d75a60b20c..dcb821617774 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -495,6 +495,7 @@ void fscache_object_lookup_negative(struct fscache_object *object)
 		 * returning ENODATA.
 		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
 		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
@@ -527,6 +528,7 @@ void fscache_obtained_object(struct fscache_object *object)
 
 		/* We do (presumably) have data */
 		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
 		/* Allow write requests to begin stacking up and read requests
 		 * to begin shovelling data.
@@ -679,7 +681,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
 	 */
 	spin_lock(&cookie->lock);
 	hlist_del_init(&object->cookie_link);
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+	if (hlist_empty(&cookie->backing_objects) &&
+	    test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
 		awaken = true;
 	spin_unlock(&cookie->lock);
 
@@ -927,7 +930,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	 */
 	if (!fscache_use_cookie(object)) {
 		ASSERT(object->cookie->stores.rnode == NULL);
-		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+		set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
 		_leave(" [no cookie]");
 		return transit_to(KILL_OBJECT);
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 73899c1c3449..7f5c658af755 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object) &&
-	    fscache_use_cookie(object)) {
+	if (fscache_object_is_active(object)) {
 		fscache_stat(&fscache_n_cop_attr_changed);
 		ret = object->cache->ops->attr_changed(object);
 		fscache_stat_d(&fscache_n_cop_attr_changed);
-		fscache_unuse_cookie(object);
 		if (ret < 0)
 			fscache_abort_object(object);
 	}
@@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
+	bool wake_cookie;
 
 	_enter("%p", cookie);
 
@@ -199,15 +198,19 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	}
 
 	fscache_operation_init(op, fscache_attr_changed_op, NULL);
-	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_exclusive_op(object, op) < 0)
 		goto nobufs;
 	spin_unlock(&cookie->lock);
@@ -217,8 +220,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	return 0;
 
 nobufs:
+	wake_cookie = __fscache_unuse_cookie(cookie);
 	spin_unlock(&cookie->lock);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_attr_changed_nobufs);
 	_leave(" = %d", -ENOBUFS);
 	return -ENOBUFS;
@@ -263,7 +269,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	}
 
 	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
-	atomic_inc(&cookie->n_active);
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
@@ -384,6 +389,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%p,,,", cookie, page);
@@ -405,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 		return -ERESTARTSYS;
 
 	op = fscache_alloc_retrieval(cookie, page->mapping,
-				     end_io_func,context);
+				     end_io_func, context);
 	if (!op) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
@@ -414,13 +420,15 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
 	ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
 
+	__fscache_use_cookie(cookie);
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
 
@@ -475,9 +483,11 @@ error:
 
 nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -514,6 +524,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,,%d,,,", cookie, *nr_pages);
@@ -542,11 +553,13 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
 
@@ -601,10 +614,12 @@ error:
 
 nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
 	_leave(" = -ENOBUFS");
@@ -626,6 +641,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 {
 	struct fscache_retrieval *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%p,,,", cookie, page);
@@ -653,13 +669,15 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs_unlock;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, &op->op) < 0)
-		goto nobufs_unlock;
+		goto nobufs_unlock_dec;
 	spin_unlock(&cookie->lock);
 
 	fscache_stat(&fscache_n_alloc_ops);
@@ -689,10 +707,13 @@ error:
 	_leave(" = %d", ret);
 	return ret;
 
+nobufs_unlock_dec:
+	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	atomic_dec(&cookie->n_active);
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 nobufs:
 	fscache_stat(&fscache_n_allocs_nobufs);
 	_leave(" = -ENOBUFS");
@@ -889,6 +910,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 {
 	struct fscache_storage *op;
 	struct fscache_object *object;
+	bool wake_cookie = false;
 	int ret;
 
 	_enter("%p,%x,", cookie, (u32) page->flags);
@@ -920,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	ret = -ENOBUFS;
 	spin_lock(&cookie->lock);
 
-	if (hlist_empty(&cookie->backing_objects))
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
 		goto nobufs;
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
@@ -957,7 +980,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
 	op->store_limit = object->store_limit;
 
-	atomic_inc(&cookie->n_active);
+	__fscache_use_cookie(cookie);
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto submit_failed;
 
@@ -984,10 +1007,10 @@ already_pending:
 	return 0;
 
 submit_failed:
-	atomic_dec(&cookie->n_active);
 	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
 	spin_unlock(&cookie->stores_lock);
+	wake_cookie = __fscache_unuse_cookie(cookie);
 	page_cache_release(page);
 	ret = -ENOBUFS;
 	goto nobufs;
@@ -999,6 +1022,8 @@ nobufs:
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
 	kfree(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_stores_nobufs);
 	_leave(" = -ENOBUFS");
 	return -ENOBUFS;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 62b43b577bfc..b7989f2ab4c4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -182,6 +182,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 	struct inode *inode;
 	struct dentry *parent;
 	struct fuse_conn *fc;
+	struct fuse_inode *fi;
 	int ret;
 
 	inode = ACCESS_ONCE(entry->d_inode);
@@ -228,7 +229,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 		if (!err && !outarg.nodeid)
 			err = -ENOENT;
 		if (!err) {
-			struct fuse_inode *fi = get_fuse_inode(inode);
+			fi = get_fuse_inode(inode);
 			if (outarg.nodeid != get_node_id(inode)) {
 				fuse_queue_forget(fc, forget, outarg.nodeid, 1);
 				goto invalid;
@@ -246,8 +247,11 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 				       attr_version);
 		fuse_change_entry_timeout(entry, &outarg);
 	} else if (inode) {
-		fc = get_fuse_conn(inode);
-		if (fc->readdirplus_auto) {
+		fi = get_fuse_inode(inode);
+		if (flags & LOOKUP_RCU) {
+			if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
+				return -ECHILD;
+		} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
 			parent = dget_parent(entry);
 			fuse_advise_use_readdirplus(parent->d_inode);
 			dput(parent);
@@ -259,7 +263,8 @@ out:
 
 invalid:
 	ret = 0;
-	if (check_submounts_and_drop(entry) != 0)
+
+	if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0)
 		ret = 1;
 	goto out;
 }
@@ -1063,6 +1068,8 @@ static int fuse_access(struct inode *inode, int mask)
 	struct fuse_access_in inarg;
 	int err;
 
+	BUG_ON(mask & MAY_NOT_BLOCK);
+
 	if (fc->no_access)
 		return 0;
 
@@ -1150,9 +1157,6 @@ static int fuse_permission(struct inode *inode, int mask)
 		   noticed immediately, only after the attribute
 		   timeout has expired */
 	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
-		if (mask & MAY_NOT_BLOCK)
-			return -ECHILD;
-
 		err = fuse_access(inode, mask);
 	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		if (!(inode->i_mode & S_IXUGO)) {
@@ -1291,6 +1295,8 @@ static int fuse_direntplus_link(struct file *file,
 	}
 
 found:
+	if (fc->readdirplus_auto)
+		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
 	fuse_change_entry_timeout(dentry, o);
 
 	err = 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d409deafc67b..4598345ab87d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2467,6 +2467,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 {
 	struct fuse_file *ff = file->private_data;
 	struct inode *inode = file->f_inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = ff->fc;
 	struct fuse_req *req;
 	struct fuse_fallocate_in inarg = {
@@ -2484,10 +2485,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 
 	if (lock_inode) {
 		mutex_lock(&inode->i_mutex);
-		if (mode & FALLOC_FL_PUNCH_HOLE)
-			fuse_set_nowrite(inode);
+		if (mode & FALLOC_FL_PUNCH_HOLE) {
+			loff_t endbyte = offset + length - 1;
+			err = filemap_write_and_wait_range(inode->i_mapping,
+							   offset, endbyte);
+			if (err)
+				goto out;
+
+			fuse_sync_writes(inode);
+		}
 	}
 
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
 	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
@@ -2520,11 +2531,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	fuse_invalidate_attr(inode);
 
 out:
-	if (lock_inode) {
-		if (mode & FALLOC_FL_PUNCH_HOLE)
-			fuse_release_nowrite(inode);
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	if (lock_inode)
 		mutex_unlock(&inode->i_mutex);
-	}
 
 	return err;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5ced199b50bb..5b9e6f3b6aef 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -115,6 +115,8 @@ struct fuse_inode {
 enum {
 	/** Advise readdirplus  */
 	FUSE_I_ADVISE_RDPLUS,
+	/** Initialized with readdirplus */
+	FUSE_I_INIT_RDPLUS,
 	/** An operation changing file size is in progress  */
 	FUSE_I_SIZE_UNSTABLE,
 };
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1f7d8057ea68..b7fc035a6943 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -611,12 +611,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
 
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .aflags = 0, };
 		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_unlock;
 
 		requested = data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip, requested, 0);
+		ap.target = requested;
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error)
 			goto out_qunlock;
 	}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 62a65fc448dc..fe0500c0af7a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1216,6 +1216,7 @@ static int do_grow(struct inode *inode, u64 size)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_alloc_parms ap = { .target = 1, };
 	struct buffer_head *dibh;
 	int error;
 	int unstuff = 0;
@@ -1226,7 +1227,7 @@ static int do_grow(struct inode *inode, u64 size)
 		if (error)
 			return error;
 
-		error = gfs2_inplace_reserve(ip, 1, 0);
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error)
 			goto do_grow_qunlock;
 		unstuff = 1;
@@ -1279,6 +1280,7 @@ do_grow_qunlock:
 
 int gfs2_setattr_size(struct inode *inode, u64 newsize)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret;
 	u64 oldsize;
 
@@ -1294,7 +1296,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 
 	inode_dio_wait(inode);
 
-	ret = gfs2_rs_alloc(GFS2_I(inode));
+	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		goto out;
 
@@ -1304,6 +1306,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 		goto out;
 	}
 
+	gfs2_rs_deltree(ip->i_res);
 	ret = do_shrink(inode, oldsize, newsize);
 out:
 	put_write_access(inode);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0621b46d474d..efc078f0ee4e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned long last_index;
 	u64 pos = page->index << PAGE_CACHE_SHIFT;
 	unsigned int data_blocks, ind_blocks, rblocks;
@@ -430,7 +431,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out_unlock;
 	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-	ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
+	ap.target = data_blocks + ind_blocks;
+	ret = gfs2_inplace_reserve(ip, &ap);
 	if (ret)
 		goto out_quota_unlock;
 
@@ -620,7 +622,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	if (!(file->f_mode & FMODE_WRITE))
 		return 0;
 
-	gfs2_rs_delete(ip);
+	gfs2_rs_delete(ip, &inode->i_writecount);
 	return 0;
 }
 
@@ -800,6 +802,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	struct inode *inode = file_inode(file);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	loff_t bytes, max_bytes;
 	int error;
@@ -850,7 +853,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 retry:
 		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
 
-		error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
+		ap.target = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error) {
 			if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
 				bytes >>= 1;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c2f41b4d00b9..e66a8009aff1 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -31,6 +31,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/percpu.h>
 #include <linux/list_sort.h>
+#include <linux/lockref.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -129,10 +130,10 @@ void gfs2_glock_free(struct gfs2_glock *gl)
  *
  */
 
-void gfs2_glock_hold(struct gfs2_glock *gl)
+static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
-	GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
-	atomic_inc(&gl->gl_ref);
+	GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
+	lockref_get(&gl->gl_lockref);
 }
 
 /**
@@ -187,20 +188,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 }
 
 /**
- * gfs2_glock_put_nolock() - Decrement reference count on glock
- * @gl: The glock to put
- *
- * This function should only be used if the caller has its own reference
- * to the glock, in addition to the one it is dropping.
- */
-
-void gfs2_glock_put_nolock(struct gfs2_glock *gl)
-{
-	if (atomic_dec_and_test(&gl->gl_ref))
-		GLOCK_BUG_ON(gl, 1);
-}
-
-/**
  * gfs2_glock_put() - Decrement reference count on glock
  * @gl: The glock to put
  *
@@ -211,17 +198,22 @@ void gfs2_glock_put(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct address_space *mapping = gfs2_glock2aspace(gl);
 
-	if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
-		__gfs2_glock_remove_from_lru(gl);
-		spin_unlock(&lru_lock);
-		spin_lock_bucket(gl->gl_hash);
-		hlist_bl_del_rcu(&gl->gl_list);
-		spin_unlock_bucket(gl->gl_hash);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
-		GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
-		trace_gfs2_glock_put(gl);
-		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
-	}
+	if (lockref_put_or_lock(&gl->gl_lockref))
+		return;
+
+	lockref_mark_dead(&gl->gl_lockref);
+
+	spin_lock(&lru_lock);
+	__gfs2_glock_remove_from_lru(gl);
+	spin_unlock(&lru_lock);
+	spin_unlock(&gl->gl_lockref.lock);
+	spin_lock_bucket(gl->gl_hash);
+	hlist_bl_del_rcu(&gl->gl_list);
+	spin_unlock_bucket(gl->gl_hash);
+	GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
+	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+	trace_gfs2_glock_put(gl);
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
 }
 
 /**
@@ -244,7 +236,7 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
 			continue;
 		if (gl->gl_sbd != sdp)
 			continue;
-		if (atomic_inc_not_zero(&gl->gl_ref))
+		if (lockref_get_not_dead(&gl->gl_lockref))
 			return gl;
 	}
 
@@ -396,10 +388,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 	held2 = (new_state != LM_ST_UNLOCKED);
 
 	if (held1 != held2) {
+		GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
 		if (held2)
-			gfs2_glock_hold(gl);
+			gl->gl_lockref.count++;
 		else
-			gfs2_glock_put_nolock(gl);
+			gl->gl_lockref.count--;
 	}
 	if (held1 && held2 && list_empty(&gl->gl_holders))
 		clear_bit(GLF_QUEUED, &gl->gl_flags);
@@ -626,9 +619,9 @@ out:
 out_sched:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	smp_mb__after_clear_bit();
-	gfs2_glock_hold(gl);
+	gl->gl_lockref.count++;
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-		gfs2_glock_put_nolock(gl);
+		gl->gl_lockref.count--;
 	return;
 
 out_unlock:
@@ -754,7 +747,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_sbd = sdp;
 	gl->gl_flags = 0;
 	gl->gl_name = name;
-	atomic_set(&gl->gl_ref, 1);
+	gl->gl_lockref.count = 1;
 	gl->gl_state = LM_ST_UNLOCKED;
 	gl->gl_target = LM_ST_UNLOCKED;
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
@@ -1356,10 +1349,10 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 		}
 	}
 
-	spin_unlock(&gl->gl_spin);
+	gl->gl_lockref.count++;
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-	smp_wmb();
-	gfs2_glock_hold(gl);
+	spin_unlock(&gl->gl_spin);
+
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
 }
@@ -1404,15 +1397,19 @@ __acquires(&lru_lock)
 	while(!list_empty(list)) {
 		gl = list_entry(list->next, struct gfs2_glock, gl_lru);
 		list_del_init(&gl->gl_lru);
+		if (!spin_trylock(&gl->gl_spin)) {
+			list_add(&gl->gl_lru, &lru_list);
+			atomic_inc(&lru_count);
+			continue;
+		}
 		clear_bit(GLF_LRU, &gl->gl_flags);
-		gfs2_glock_hold(gl);
 		spin_unlock(&lru_lock);
-		spin_lock(&gl->gl_spin);
+		gl->gl_lockref.count++;
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-			gfs2_glock_put_nolock(gl);
+			gl->gl_lockref.count--;
 		spin_unlock(&gl->gl_spin);
 		spin_lock(&lru_lock);
 	}
@@ -1493,7 +1490,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
 
 	rcu_read_lock();
 	hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
-		if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref))
+		if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
 			examiner(gl);
 	}
 	rcu_read_unlock();
@@ -1746,7 +1743,7 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 		  state2str(gl->gl_demote_state), dtime,
 		  atomic_read(&gl->gl_ail_count),
 		  atomic_read(&gl->gl_revokes),
-		  atomic_read(&gl->gl_ref), gl->gl_hold_time);
+		  (int)gl->gl_lockref.count, gl->gl_hold_time);
 
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 		error = dump_holder(seq, gh);
@@ -1902,7 +1899,7 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 			gi->nhash = 0;
 		}
 	/* Skip entries for other sb and dead entries */
-	} while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
+	} while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref));
 
 	return 0;
 }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 69f66e3d22bf..6647d77366ba 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -181,8 +181,6 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
 extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 			  const struct gfs2_glock_operations *glops,
 			  int create, struct gfs2_glock **glp);
-extern void gfs2_glock_hold(struct gfs2_glock *gl);
-extern void gfs2_glock_put_nolock(struct gfs2_glock *gl);
 extern void gfs2_glock_put(struct gfs2_glock *gl);
 extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
 			     unsigned flags, struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index e2e0a90396e7..db908f697139 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -525,9 +525,9 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
 	    gl->gl_state == LM_ST_SHARED && ip) {
-		gfs2_glock_hold(gl);
+		gl->gl_lockref.count++;
 		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
-			gfs2_glock_put_nolock(gl);
+			gl->gl_lockref.count--;
 	}
 }
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 26aabd7caba7..ba1ea67f4eeb 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -21,6 +21,7 @@
 #include <linux/rbtree.h>
 #include <linux/ktime.h>
 #include <linux/percpu.h>
+#include <linux/lockref.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -71,6 +72,7 @@ struct gfs2_bitmap {
 	u32 bi_offset;
 	u32 bi_start;
 	u32 bi_len;
+	u32 bi_blocks;
 };
 
 struct gfs2_rgrpd {
@@ -101,19 +103,25 @@ struct gfs2_rgrpd {
 
 struct gfs2_rbm {
 	struct gfs2_rgrpd *rgd;
-	struct gfs2_bitmap *bi;	/* Bitmap must belong to the rgd */
 	u32 offset;		/* The offset is bitmap relative */
+	int bii;		/* Bitmap index */
 };
 
+static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
+{
+	return rbm->rgd->rd_bits + rbm->bii;
+}
+
 static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
 {
-	return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset;
+	return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
+		rbm->offset;
 }
 
 static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
 			       const struct gfs2_rbm *rbm2)
 {
-	return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) && 
+	return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) &&
 	       (rbm1->offset == rbm2->offset);
 }
 
@@ -278,6 +286,20 @@ struct gfs2_blkreserv {
 	unsigned int rs_qa_qd_num;
 };
 
+/*
+ * Allocation parameters
+ * @target: The number of blocks we'd ideally like to allocate
+ * @aflags: The flags (e.g. Orlov flag)
+ *
+ * The intent is to gradually expand this structure over time in
+ * order to give more information, e.g. alignment, min extent size
+ * to the allocation code.
+ */
+struct gfs2_alloc_parms {
+	u32 target;
+	u32 aflags;
+};
+
 enum {
 	GLF_LOCK			= 1,
 	GLF_DEMOTE			= 3,
@@ -300,9 +322,9 @@ struct gfs2_glock {
 	struct gfs2_sbd *gl_sbd;
 	unsigned long gl_flags;		/* GLF_... */
 	struct lm_lockname gl_name;
-	atomic_t gl_ref;
 
-	spinlock_t gl_spin;
+	struct lockref gl_lockref;
+#define gl_spin gl_lockref.lock
 
 	/* State fields protected by gl_spin */
 	unsigned int gl_state:2,	/* Current state */
@@ -398,11 +420,10 @@ enum {
 
 struct gfs2_quota_data {
 	struct list_head qd_list;
-	struct list_head qd_reclaim;
-
-	atomic_t qd_count;
-
 	struct kqid qd_id;
+	struct lockref qd_lockref;
+	struct list_head qd_lru;
+
 	unsigned long qd_flags;		/* QDF_... */
 
 	s64 qd_change;
@@ -516,7 +537,6 @@ struct gfs2_tune {
 
 	unsigned int gt_logd_secs;
 
-	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
 	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
 	unsigned int gt_quota_scale_num; /* Numerator */
 	unsigned int gt_quota_scale_den; /* Denominator */
@@ -694,6 +714,7 @@ struct gfs2_sbd {
 	struct list_head sd_quota_list;
 	atomic_t sd_quota_count;
 	struct mutex sd_quota_mutex;
+	struct mutex sd_quota_sync_mutex;
 	wait_queue_head_t sd_quota_wait;
 	struct list_head sd_trunc_list;
 	spinlock_t sd_trunc_lock;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 64915eeae5a7..109ce9325b76 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -379,6 +379,7 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
 static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, };
 	int error;
 	int dblocks = 1;
 
@@ -386,7 +387,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 	if (error)
 		goto out;
 
-	error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+	error = gfs2_inplace_reserve(ip, &ap);
 	if (error)
 		goto out_quota;
 
@@ -472,6 +473,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		       struct gfs2_inode *ip, int arq)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
 	int error;
 
 	if (arq) {
@@ -479,7 +481,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		if (error)
 			goto fail_quota_locks;
 
-		error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
+		error = gfs2_inplace_reserve(dip, &ap);
 		if (error)
 			goto fail_quota_locks;
 
@@ -584,17 +586,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (!IS_ERR(inode)) {
 		d = d_splice_alias(inode, dentry);
 		error = 0;
-		if (file && !IS_ERR(d)) {
-			if (d == NULL)
-				d = dentry;
-			if (S_ISREG(inode->i_mode))
-				error = finish_open(file, d, gfs2_open_common, opened);
-			else
+		if (file) {
+			if (S_ISREG(inode->i_mode)) {
+				WARN_ON(d != NULL);
+				error = finish_open(file, dentry, gfs2_open_common, opened);
+			} else {
 				error = finish_no_open(file, d);
+			}
+		} else {
+			dput(d);
 		}
 		gfs2_glock_dq_uninit(ghs);
-		if (IS_ERR(d))
-			return PTR_ERR(d);
 		return error;
 	} else if (error != -ENOENT) {
 		goto fail_gunlock;
@@ -694,8 +696,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 
 	mark_inode_dirty(inode);
 	d_instantiate(dentry, inode);
-	if (file)
+	if (file) {
+		*opened |= FILE_CREATED;
 		error = finish_open(file, dentry, gfs2_open_common, opened);
+	}
 	gfs2_glock_dq_uninit(ghs);
 	gfs2_glock_dq_uninit(ghs + 1);
 	return error;
@@ -711,7 +715,7 @@ fail_gunlock2:
 fail_free_inode:
 	if (ip->i_gl)
 		gfs2_glock_put(ip->i_gl);
-	gfs2_rs_delete(ip);
+	gfs2_rs_delete(ip, NULL);
 	free_inode_nonrcu(inode);
 	inode = NULL;
 fail_gunlock:
@@ -779,8 +783,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 		error = finish_open(file, dentry, gfs2_open_common, opened);
 
 	gfs2_glock_dq_uninit(&gh);
-	if (error)
+	if (error) {
+		dput(d);
 		return ERR_PTR(error);
+	}
 	return d;
 }
 
@@ -872,11 +878,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	error = 0;
 
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
 		error = gfs2_quota_lock_check(dip);
 		if (error)
 			goto out_gunlock;
 
-		error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
+		error = gfs2_inplace_reserve(dip, &ap);
 		if (error)
 			goto out_gunlock_q;
 
@@ -1161,14 +1168,16 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
 	d = __gfs2_lookup(dir, dentry, file, opened);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
-	if (d == NULL)
-		d = dentry;
-	if (d->d_inode) {
+	if (d != NULL)
+		dentry = d;
+	if (dentry->d_inode) {
 		if (!(*opened & FILE_OPENED))
-			return finish_no_open(file, d);
+			return finish_no_open(file, dentry);
+		dput(d);
 		return 0;
 	}
 
+	BUG_ON(d != NULL);
 	if (!(flags & O_CREAT))
 		return -ENOENT;
 
@@ -1383,11 +1392,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		goto out_gunlock;
 
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
 		error = gfs2_quota_lock_check(ndip);
 		if (error)
 			goto out_gunlock;
 
-		error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0);
+		error = gfs2_inplace_reserve(ndip, &ap);
 		if (error)
 			goto out_gunlock_q;
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 351586e24e30..0650db2541ef 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -31,12 +31,6 @@
 
 struct workqueue_struct *gfs2_control_wq;
 
-static struct shrinker qd_shrinker = {
-	.count_objects = gfs2_qd_shrink_count,
-	.scan_objects = gfs2_qd_shrink_scan,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static void gfs2_init_inode_once(void *foo)
 {
 	struct gfs2_inode *ip = foo;
@@ -87,6 +81,10 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		return error;
 
+	error = list_lru_init(&gfs2_qd_lru);
+	if (error)
+		goto fail_lru;
+
 	error = gfs2_glock_init();
 	if (error)
 		goto fail;
@@ -139,7 +137,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_rsrv_cachep)
 		goto fail;
 
-	register_shrinker(&qd_shrinker);
+	register_shrinker(&gfs2_qd_shrinker);
 
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
@@ -179,7 +177,9 @@ fail_wq:
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
-	unregister_shrinker(&qd_shrinker);
+	list_lru_destroy(&gfs2_qd_lru);
+fail_lru:
+	unregister_shrinker(&gfs2_qd_shrinker);
 	gfs2_glock_exit();
 
 	if (gfs2_rsrv_cachep)
@@ -214,13 +214,14 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
-	unregister_shrinker(&qd_shrinker);
+	unregister_shrinker(&gfs2_qd_shrinker);
 	gfs2_glock_exit();
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 	destroy_workqueue(gfs_recovery_wq);
 	destroy_workqueue(gfs2_control_wq);
+	list_lru_destroy(&gfs2_qd_lru);
 
 	rcu_barrier();
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 19ff5e8c285c..82303b474958 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -51,7 +51,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
@@ -94,6 +93,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
 	mutex_init(&sdp->sd_quota_mutex);
+	mutex_init(&sdp->sd_quota_sync_mutex);
 	init_waitqueue_head(&sdp->sd_quota_wait);
 	INIT_LIST_HEAD(&sdp->sd_trunc_list);
 	spin_lock_init(&sdp->sd_trunc_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index db441359ee8c..453b50eaddec 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -50,6 +50,8 @@
 #include <linux/freezer.h>
 #include <linux/quota.h>
 #include <linux/dqblk_xfs.h>
+#include <linux/lockref.h>
+#include <linux/list_lru.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -71,29 +73,25 @@ struct gfs2_quota_change_host {
 	struct kqid qc_id;
 };
 
-static LIST_HEAD(qd_lru_list);
-static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(qd_lru_lock);
+/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */
+static DEFINE_SPINLOCK(qd_lock);
+struct list_lru gfs2_qd_lru;
 
-unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
-				  struct shrink_control *sc)
+static void gfs2_qd_dispose(struct list_head *list)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
-	int nr_to_scan = sc->nr_to_scan;
-	long freed = 0;
 
-	if (!(sc->gfp_mask & __GFP_FS))
-		return SHRINK_STOP;
-
-	spin_lock(&qd_lru_lock);
-	while (nr_to_scan && !list_empty(&qd_lru_list)) {
-		qd = list_entry(qd_lru_list.next,
-				struct gfs2_quota_data, qd_reclaim);
+	while (!list_empty(list)) {
+		qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
 		sdp = qd->qd_gl->gl_sbd;
 
+		list_del(&qd->qd_lru);
+
 		/* Free from the filesystem-specific list */
+		spin_lock(&qd_lock);
 		list_del(&qd->qd_list);
+		spin_unlock(&qd_lock);
 
 		gfs2_assert_warn(sdp, !qd->qd_change);
 		gfs2_assert_warn(sdp, !qd->qd_slot_count);
@@ -103,24 +101,59 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 		atomic_dec(&sdp->sd_quota_count);
 
 		/* Delete it from the common reclaim list */
-		list_del_init(&qd->qd_reclaim);
-		atomic_dec(&qd_lru_count);
-		spin_unlock(&qd_lru_lock);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
-		spin_lock(&qd_lru_lock);
-		nr_to_scan--;
-		freed++;
 	}
-	spin_unlock(&qd_lru_lock);
+}
+
+
+static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+{
+	struct list_head *dispose = arg;
+	struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
+
+	if (!spin_trylock(&qd->qd_lockref.lock))
+		return LRU_SKIP;
+
+	if (qd->qd_lockref.count == 0) {
+		lockref_mark_dead(&qd->qd_lockref);
+		list_move(&qd->qd_lru, dispose);
+	}
+
+	spin_unlock(&qd->qd_lockref.lock);
+	return LRU_REMOVED;
+}
+
+static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	unsigned long freed;
+
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
+				   &dispose, &sc->nr_to_scan);
+
+	gfs2_qd_dispose(&dispose);
+
 	return freed;
 }
 
-unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
-				   struct shrink_control *sc)
+static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
 {
-	return vfs_pressure_ratio(atomic_read(&qd_lru_count));
+	return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
 }
 
+struct shrinker gfs2_qd_shrinker = {
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+
 static u64 qd2index(struct gfs2_quota_data *qd)
 {
 	struct kqid qid = qd->qd_id;
@@ -148,10 +181,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
 	if (!qd)
 		return -ENOMEM;
 
-	atomic_set(&qd->qd_count, 1);
+	qd->qd_lockref.count = 1;
+	spin_lock_init(&qd->qd_lockref.lock);
 	qd->qd_id = qid;
 	qd->qd_slot = -1;
-	INIT_LIST_HEAD(&qd->qd_reclaim);
+	INIT_LIST_HEAD(&qd->qd_lru);
 
 	error = gfs2_glock_get(sdp, qd2index(qd),
 			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
@@ -177,16 +211,11 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 
 	for (;;) {
 		found = 0;
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
-			if (qid_eq(qd->qd_id, qid)) {
-				if (!atomic_read(&qd->qd_count) &&
-				    !list_empty(&qd->qd_reclaim)) {
-					/* Remove it from reclaim list */
-					list_del_init(&qd->qd_reclaim);
-					atomic_dec(&qd_lru_count);
-				}
-				atomic_inc(&qd->qd_count);
+			if (qid_eq(qd->qd_id, qid) &&
+			    lockref_get_not_dead(&qd->qd_lockref)) {
+				list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
 				found = 1;
 				break;
 			}
@@ -202,7 +231,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 			new_qd = NULL;
 		}
 
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
 		if (qd) {
 			if (new_qd) {
@@ -222,18 +251,19 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 static void qd_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-	gfs2_assert(sdp, atomic_read(&qd->qd_count));
-	atomic_inc(&qd->qd_count);
+	gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
+	lockref_get(&qd->qd_lockref);
 }
 
 static void qd_put(struct gfs2_quota_data *qd)
 {
-	if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
-		/* Add to the reclaim list */
-		list_add_tail(&qd->qd_reclaim, &qd_lru_list);
-		atomic_inc(&qd_lru_count);
-		spin_unlock(&qd_lru_lock);
-	}
+	if (lockref_put_or_lock(&qd->qd_lockref))
+		return;
+
+	qd->qd_lockref.count = 0;
+	list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+	spin_unlock(&qd->qd_lockref.lock);
+
 }
 
 static int slot_get(struct gfs2_quota_data *qd)
@@ -242,10 +272,10 @@ static int slot_get(struct gfs2_quota_data *qd)
 	unsigned int c, o = 0, b;
 	unsigned char byte = 0;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 
 	if (qd->qd_slot_count++) {
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 		return 0;
 	}
 
@@ -269,13 +299,13 @@ found:
 
 	sdp->sd_quota_bitmap[c][o] |= 1 << b;
 
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	return 0;
 
 fail:
 	qd->qd_slot_count--;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 	return -ENOSPC;
 }
 
@@ -283,23 +313,43 @@ static void slot_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
+}
+
+static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+			     unsigned int bit, int new_value)
+{
+	unsigned int c, o, b = bit;
+	int old_value;
+
+	c = b / (8 * PAGE_SIZE);
+	b %= 8 * PAGE_SIZE;
+	o = b / 8;
+	b %= 8;
+
+	old_value = (bitmap[c][o] & (1 << b));
+	gfs2_assert_withdraw(sdp, !old_value != !new_value);
+
+	if (new_value)
+		bitmap[c][o] |= 1 << b;
+	else
+		bitmap[c][o] &= ~(1 << b);
 }
 
 static void slot_put(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	if (!--qd->qd_slot_count) {
 		gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
 		qd->qd_slot = -1;
 	}
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 }
 
 static int bh_get(struct gfs2_quota_data *qd)
@@ -363,6 +413,25 @@ static void bh_put(struct gfs2_quota_data *qd)
 	mutex_unlock(&sdp->sd_quota_mutex);
 }
 
+static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
+			 u64 *sync_gen)
+{
+	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+	    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+	    (sync_gen && (qd->qd_sync_gen >= *sync_gen)))
+		return 0;
+
+	if (!lockref_get_not_dead(&qd->qd_lockref))
+		return 0;
+
+	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+	set_bit(QDF_LOCKED, &qd->qd_flags);
+	qd->qd_change_sync = qd->qd_change;
+	gfs2_assert_warn(sdp, qd->qd_slot_count);
+	qd->qd_slot_count++;
+	return 1;
+}
+
 static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 {
 	struct gfs2_quota_data *qd = NULL;
@@ -374,31 +443,18 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (sdp->sd_vfs->s_flags & MS_RDONLY)
 		return 0;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 
 	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
-		if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
-		    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
-		    qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
-			continue;
-
-		list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
-
-		set_bit(QDF_LOCKED, &qd->qd_flags);
-		gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
-		atomic_inc(&qd->qd_count);
-		qd->qd_change_sync = qd->qd_change;
-		gfs2_assert_warn(sdp, qd->qd_slot_count);
-		qd->qd_slot_count++;
-		found = 1;
-
-		break;
+		found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
+		if (found)
+			break;
 	}
 
 	if (!found)
 		qd = NULL;
 
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	if (qd) {
 		gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -416,43 +472,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	return 0;
 }
 
-static int qd_trylock(struct gfs2_quota_data *qd)
-{
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-
-	if (sdp->sd_vfs->s_flags & MS_RDONLY)
-		return 0;
-
-	spin_lock(&qd_lru_lock);
-
-	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
-	    !test_bit(QDF_CHANGE, &qd->qd_flags)) {
-		spin_unlock(&qd_lru_lock);
-		return 0;
-	}
-
-	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
-
-	set_bit(QDF_LOCKED, &qd->qd_flags);
-	gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
-	atomic_inc(&qd->qd_count);
-	qd->qd_change_sync = qd->qd_change;
-	gfs2_assert_warn(sdp, qd->qd_slot_count);
-	qd->qd_slot_count++;
-
-	spin_unlock(&qd_lru_lock);
-
-	gfs2_assert_warn(sdp, qd->qd_change_sync);
-	if (bh_get(qd)) {
-		clear_bit(QDF_LOCKED, &qd->qd_flags);
-		slot_put(qd);
-		qd_put(qd);
-		return 0;
-	}
-
-	return 1;
-}
-
 static void qd_unlock(struct gfs2_quota_data *qd)
 {
 	gfs2_assert_warn(qd->qd_gl->gl_sbd,
@@ -602,9 +621,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 	x = be64_to_cpu(qc->qc_change) + change;
 	qc->qc_change = cpu_to_be64(x);
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	qd->qd_change = x;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	if (!x) {
 		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -763,6 +782,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 {
 	struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned int data_blocks, ind_blocks;
 	struct gfs2_holder *ghs, i_gh;
 	unsigned int qx, x;
@@ -815,7 +835,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
 	reserved = 1 + (nalloc * (data_blocks + ind_blocks));
-	error = gfs2_inplace_reserve(ip, reserved, 0);
+	ap.target = reserved;
+	error = gfs2_inplace_reserve(ip, &ap);
 	if (error)
 		goto out_alloc;
 
@@ -974,9 +995,9 @@ static int need_sync(struct gfs2_quota_data *qd)
 	if (!qd->qd_qb.qb_limit)
 		return 0;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	value = qd->qd_change;
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	spin_lock(&gt->gt_spin);
 	num = gt->gt_quota_scale_num;
@@ -1001,9 +1022,11 @@ static int need_sync(struct gfs2_quota_data *qd)
 
 void gfs2_quota_unlock(struct gfs2_inode *ip)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qda[4];
 	unsigned int count = 0;
 	unsigned int x;
+	int found;
 
 	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
 		goto out;
@@ -1016,9 +1039,25 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
 		sync = need_sync(qd);
 
 		gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+		if (!sync)
+			continue;
+
+		spin_lock(&qd_lock);
+		found = qd_check_sync(sdp, qd, NULL);
+		spin_unlock(&qd_lock);
+
+		if (!found)
+			continue;
 
-		if (sync && qd_trylock(qd))
-			qda[count++] = qd;
+		gfs2_assert_warn(sdp, qd->qd_change_sync);
+		if (bh_get(qd)) {
+			clear_bit(QDF_LOCKED, &qd->qd_flags);
+			slot_put(qd);
+			qd_put(qd);
+			continue;
+		}
+
+		qda[count++] = qd;
 	}
 
 	if (count) {
@@ -1067,9 +1106,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 			continue;
 
 		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 		value += qd->qd_change;
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
 		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
 			print_message(qd, "exceeded");
@@ -1118,17 +1157,18 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_data **qda;
-	unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
+	unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder);
 	unsigned int num_qd;
 	unsigned int x;
 	int error = 0;
 
-	sdp->sd_quota_sync_gen++;
-
 	qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
 	if (!qda)
 		return -ENOMEM;
 
+	mutex_lock(&sdp->sd_quota_sync_mutex);
+	sdp->sd_quota_sync_gen++;
+
 	do {
 		num_qd = 0;
 
@@ -1153,6 +1193,7 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 		}
 	} while (!error && num_qd == max_qd);
 
+	mutex_unlock(&sdp->sd_quota_sync_mutex);
 	kfree(qda);
 
 	return error;
@@ -1258,11 +1299,11 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 			qd->qd_slot = slot;
 			qd->qd_slot_count = 1;
 
-			spin_lock(&qd_lru_lock);
+			spin_lock(&qd_lock);
 			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
 			list_add(&qd->qd_list, &sdp->sd_quota_list);
 			atomic_inc(&sdp->sd_quota_count);
-			spin_unlock(&qd_lru_lock);
+			spin_unlock(&qd_lock);
 
 			found++;
 		}
@@ -1288,30 +1329,34 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 	struct gfs2_quota_data *qd;
 	unsigned int x;
 
-	spin_lock(&qd_lru_lock);
+	spin_lock(&qd_lock);
 	while (!list_empty(head)) {
 		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
 
-		if (atomic_read(&qd->qd_count) > 1 ||
-		    (atomic_read(&qd->qd_count) &&
-		     !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+		/*
+		 * To be removed in due course... we should be able to
+		 * ensure that all refs to the qd have done by this point
+		 * so that this rather odd test is not required
+		 */
+		spin_lock(&qd->qd_lockref.lock);
+		if (qd->qd_lockref.count > 1 ||
+		    (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+			spin_unlock(&qd->qd_lockref.lock);
 			list_move(&qd->qd_list, head);
-			spin_unlock(&qd_lru_lock);
+			spin_unlock(&qd_lock);
 			schedule();
-			spin_lock(&qd_lru_lock);
+			spin_lock(&qd_lock);
 			continue;
 		}
+		spin_unlock(&qd->qd_lockref.lock);
 
 		list_del(&qd->qd_list);
 		/* Also remove if this qd exists in the reclaim list */
-		if (!list_empty(&qd->qd_reclaim)) {
-			list_del_init(&qd->qd_reclaim);
-			atomic_dec(&qd_lru_count);
-		}
+		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
 		atomic_dec(&sdp->sd_quota_count);
-		spin_unlock(&qd_lru_lock);
+		spin_unlock(&qd_lock);
 
-		if (!atomic_read(&qd->qd_count)) {
+		if (!qd->qd_lockref.count) {
 			gfs2_assert_warn(sdp, !qd->qd_change);
 			gfs2_assert_warn(sdp, !qd->qd_slot_count);
 		} else
@@ -1321,9 +1366,9 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		gfs2_glock_put(qd->qd_gl);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 
-		spin_lock(&qd_lru_lock);
+		spin_lock(&qd_lock);
 	}
-	spin_unlock(&qd_lru_lock);
+	spin_unlock(&qd_lock);
 
 	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
 
@@ -1462,7 +1507,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 	}
 	fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
 	fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
-	fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+	fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru);
 	return 0;
 }
 
@@ -1573,10 +1618,12 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
 	if (gfs2_is_stuffed(ip))
 		alloc_required = 1;
 	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .aflags = 0, };
 		gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
 				       &data_blocks, &ind_blocks);
 		blocks = 1 + data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip, blocks, 0);
+		ap.target = blocks;
+		error = gfs2_inplace_reserve(ip, &ap);
 		if (error)
 			goto out_i;
 		blocks += gfs2_rg_blocks(ip, blocks);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0f64d9deb1b0..96e4f34a03b0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -10,9 +10,10 @@
 #ifndef __QUOTA_DOT_H__
 #define __QUOTA_DOT_H__
 
+#include <linux/list_lru.h>
+
 struct gfs2_inode;
 struct gfs2_sbd;
-struct shrink_control;
 
 #define NO_UID_QUOTA_CHANGE INVALID_UID
 #define NO_GID_QUOTA_CHANGE INVALID_GID
@@ -53,10 +54,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
-					  struct shrink_control *sc);
-extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
-					 struct shrink_control *sc);
 extern const struct quotactl_ops gfs2_quotactl_ops;
+extern struct shrinker gfs2_qd_shrinker;
+extern struct list_lru gfs2_qd_lru;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 69317435faa7..4d83abdd5635 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -81,11 +81,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 			       unsigned char new_state)
 {
 	unsigned char *byte1, *byte2, *end, cur_state;
-	unsigned int buflen = rbm->bi->bi_len;
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	unsigned int buflen = bi->bi_len;
 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
 
-	byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
-	end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen;
+	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
+	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
 
 	BUG_ON(byte1 >= end);
 
@@ -95,18 +96,17 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 		printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, "
 		       "new_state=%d\n", rbm->offset, cur_state, new_state);
 		printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n",
-		       (unsigned long long)rbm->rgd->rd_addr,
-		       rbm->bi->bi_start);
+		       (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
 		printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n",
-		       rbm->bi->bi_offset, rbm->bi->bi_len);
+		       bi->bi_offset, bi->bi_len);
 		dump_stack();
 		gfs2_consist_rgrpd(rbm->rgd);
 		return;
 	}
 	*byte1 ^= (cur_state ^ new_state) << bit;
 
-	if (do_clone && rbm->bi->bi_clone) {
-		byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
+	if (do_clone && bi->bi_clone) {
+		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
 		*byte2 ^= (cur_state ^ new_state) << bit;
 	}
@@ -121,7 +121,8 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 
 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
 {
-	const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset;
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
 	const u8 *byte;
 	unsigned int bit;
 
@@ -252,29 +253,53 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 {
 	u64 rblock = block - rbm->rgd->rd_data0;
-	u32 x;
 
 	if (WARN_ON_ONCE(rblock > UINT_MAX))
 		return -EINVAL;
 	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
 		return -E2BIG;
 
-	rbm->bi = rbm->rgd->rd_bits;
+	rbm->bii = 0;
 	rbm->offset = (u32)(rblock);
 	/* Check if the block is within the first block */
-	if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY)
+	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
 		return 0;
 
 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
 	rbm->offset += (sizeof(struct gfs2_rgrp) -
 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
-	x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
-	rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
-	rbm->bi += x;
+	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
 	return 0;
 }
 
 /**
+ * gfs2_rbm_incr - increment an rbm structure
+ * @rbm: The rbm with rgd already set correctly
+ *
+ * This function takes an existing rbm structure and increments it to the next
+ * viable block offset.
+ *
+ * Returns: If incrementing the offset would cause the rbm to go past the
+ *          end of the rgrp, true is returned, otherwise false.
+ *
+ */
+
+static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
+{
+	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
+		rbm->offset++;
+		return false;
+	}
+	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
+		return true;
+
+	rbm->offset = 0;
+	rbm->bii++;
+	return false;
+}
+
+/**
  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
  * @rbm: Position to search (value/result)
  * @n_unaligned: Number of unaligned blocks to check
@@ -285,7 +310,6 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
 
 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
 {
-	u64 block;
 	u32 n;
 	u8 res;
 
@@ -296,8 +320,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le
 		(*len)--;
 		if (*len == 0)
 			return true;
-		block = gfs2_rbm_to_block(rbm);
-		if (gfs2_rbm_from_block(rbm, block + 1))
+		if (gfs2_rbm_incr(rbm))
 			return true;
 	}
 
@@ -328,6 +351,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
 	u32 chunk_size;
 	u8 *ptr, *start, *end;
 	u64 block;
+	struct gfs2_bitmap *bi;
 
 	if (n_unaligned &&
 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
@@ -336,11 +360,12 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
 	n_unaligned = len & 3;
 	/* Start is now byte aligned */
 	while (len > 3) {
-		start = rbm.bi->bi_bh->b_data;
-		if (rbm.bi->bi_clone)
-			start = rbm.bi->bi_clone;
-		end = start + rbm.bi->bi_bh->b_size;
-		start += rbm.bi->bi_offset;
+		bi = rbm_bi(&rbm);
+		start = bi->bi_bh->b_data;
+		if (bi->bi_clone)
+			start = bi->bi_clone;
+		end = start + bi->bi_bh->b_size;
+		start += bi->bi_offset;
 		BUG_ON(rbm.offset & 3);
 		start += (rbm.offset / GFS2_NBBY);
 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
@@ -605,11 +630,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
 	RB_CLEAR_NODE(&rs->rs_node);
 
 	if (rs->rs_free) {
+		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+
 		/* return reserved blocks to the rgrp */
 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
 		rs->rs_free = 0;
-		clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags);
+		clear_bit(GBF_FULL, &bi->bi_flags);
 		smp_mb__after_clear_bit();
 	}
 }
@@ -634,14 +661,13 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
 /**
  * gfs2_rs_delete - delete a multi-block reservation
  * @ip: The inode for this reservation
+ * @wcount: The inode's write count, or NULL
  *
  */
-void gfs2_rs_delete(struct gfs2_inode *ip)
+void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
 {
-	struct inode *inode = &ip->i_inode;
-
 	down_write(&ip->i_rw_mutex);
-	if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
+	if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
 		gfs2_rs_deltree(ip->i_res);
 		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
@@ -743,18 +769,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
 			bi->bi_offset = sizeof(struct gfs2_rgrp);
 			bi->bi_start = 0;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* header block */
 		} else if (x == 0) {
 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
 			bi->bi_offset = sizeof(struct gfs2_rgrp);
 			bi->bi_start = 0;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* last block */
 		} else if (x + 1 == length) {
 			bytes = bytes_left;
 			bi->bi_offset = sizeof(struct gfs2_meta_header);
 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		/* other blocks */
 		} else {
 			bytes = sdp->sd_sb.sb_bsize -
@@ -762,6 +791,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
 			bi->bi_offset = sizeof(struct gfs2_meta_header);
 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
 			bi->bi_len = bytes;
+			bi->bi_blocks = bytes * GFS2_NBBY;
 		}
 
 		bytes_left -= bytes;
@@ -1392,12 +1422,12 @@ static void rs_insert(struct gfs2_inode *ip)
  * rg_mblk_search - find a group of multiple free blocks to form a reservation
  * @rgd: the resource group descriptor
  * @ip: pointer to the inode for which we're reserving blocks
- * @requested: number of blocks required for this allocation
+ * @ap: the allocation parameters
  *
  */
 
 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
-			   unsigned requested)
+			   const struct gfs2_alloc_parms *ap)
 {
 	struct gfs2_rbm rbm = { .rgd = rgd, };
 	u64 goal;
@@ -1410,7 +1440,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	if (S_ISDIR(inode->i_mode))
 		extlen = 1;
 	else {
-		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
 		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
 	}
 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
@@ -1554,14 +1584,14 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
 			 const struct gfs2_inode *ip, bool nowrap)
 {
 	struct buffer_head *bh;
-	struct gfs2_bitmap *initial_bi;
+	int initial_bii;
 	u32 initial_offset;
 	u32 offset;
 	u8 *buffer;
-	int index;
 	int n = 0;
 	int iters = rbm->rgd->rd_length;
 	int ret;
+	struct gfs2_bitmap *bi;
 
 	/* If we are not starting at the beginning of a bitmap, then we
 	 * need to add one to the bitmap count to ensure that we search
@@ -1571,52 +1601,53 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
 		iters++;
 
 	while(1) {
-		if (test_bit(GBF_FULL, &rbm->bi->bi_flags) &&
+		bi = rbm_bi(rbm);
+		if (test_bit(GBF_FULL, &bi->bi_flags) &&
 		    (state == GFS2_BLKST_FREE))
 			goto next_bitmap;
 
-		bh = rbm->bi->bi_bh;
-		buffer = bh->b_data + rbm->bi->bi_offset;
+		bh = bi->bi_bh;
+		buffer = bh->b_data + bi->bi_offset;
 		WARN_ON(!buffer_uptodate(bh));
-		if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone)
-			buffer = rbm->bi->bi_clone + rbm->bi->bi_offset;
+		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
 		initial_offset = rbm->offset;
-		offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state);
+		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
 		if (offset == BFITNOENT)
 			goto bitmap_full;
 		rbm->offset = offset;
 		if (ip == NULL)
 			return 0;
 
-		initial_bi = rbm->bi;
+		initial_bii = rbm->bii;
 		ret = gfs2_reservation_check_and_update(rbm, ip, minext);
 		if (ret == 0)
 			return 0;
 		if (ret > 0) {
-			n += (rbm->bi - initial_bi);
+			n += (rbm->bii - initial_bii);
 			goto next_iter;
 		}
 		if (ret == -E2BIG) {
-			index = 0;
+			rbm->bii = 0;
 			rbm->offset = 0;
-			n += (rbm->bi - initial_bi);
+			n += (rbm->bii - initial_bii);
 			goto res_covered_end_of_rgrp;
 		}
 		return ret;
 
 bitmap_full:	/* Mark bitmap as full and fall through */
-		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
-			set_bit(GBF_FULL, &rbm->bi->bi_flags);
+		if ((state == GFS2_BLKST_FREE) && initial_offset == 0) {
+			struct gfs2_bitmap *bi = rbm_bi(rbm);
+			set_bit(GBF_FULL, &bi->bi_flags);
+		}
 
 next_bitmap:	/* Find next bitmap in the rgrp */
 		rbm->offset = 0;
-		index = rbm->bi - rbm->rgd->rd_bits;
-		index++;
-		if (index == rbm->rgd->rd_length)
-			index = 0;
+		rbm->bii++;
+		if (rbm->bii == rbm->rgd->rd_length)
+			rbm->bii = 0;
 res_covered_end_of_rgrp:
-		rbm->bi = &rbm->rgd->rd_bits[index];
-		if ((index == 0) && nowrap)
+		if ((rbm->bii == 0) && nowrap)
 			break;
 		n++;
 next_iter:
@@ -1645,7 +1676,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 	struct gfs2_inode *ip;
 	int error;
 	int found = 0;
-	struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 };
+	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
 
 	while (1) {
 		down_write(&sdp->sd_log_flush_lock);
@@ -1800,12 +1831,12 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
 /**
  * gfs2_inplace_reserve - Reserve space in the filesystem
  * @ip: the inode to reserve space for
- * @requested: the number of blocks to be reserved
+ * @ap: the allocation parameters
  *
  * Returns: errno
  */
 
-int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
+int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *begin = NULL;
@@ -1817,17 +1848,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 
 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
-	if (gfs2_assert_warn(sdp, requested))
+	if (gfs2_assert_warn(sdp, ap->target))
 		return -EINVAL;
 	if (gfs2_rs_active(rs)) {
 		begin = rs->rs_rbm.rgd;
-		flags = 0; /* Yoda: Do or do not. There is no try */
 	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
 		rs->rs_rbm.rgd = begin = ip->i_rgd;
 	} else {
 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
 	}
-	if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV))
+	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
 		skip = gfs2_orlov_skip(ip);
 	if (rs->rs_rbm.rgd == NULL)
 		return -EBADSLT;
@@ -1869,14 +1899,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 
 		/* Get a reservation if we don't already have one */
 		if (!gfs2_rs_active(rs))
-			rg_mblk_search(rs->rs_rbm.rgd, ip, requested);
+			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
 
 		/* Skip rgrps when we can't get a reservation on first pass */
 		if (!gfs2_rs_active(rs) && (loops < 1))
 			goto check_rgrp;
 
 		/* If rgrp has enough free space, use it */
-		if (rs->rs_rbm.rgd->rd_free_clone >= requested) {
+		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) {
 			ip->i_rgd = rs->rs_rbm.rgd;
 			return 0;
 		}
@@ -1973,14 +2003,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
 
 	*n = 1;
 	block = gfs2_rbm_to_block(rbm);
-	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);
+	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
 	block++;
 	while (*n < elen) {
 		ret = gfs2_rbm_from_block(&pos, block);
 		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
 			break;
-		gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);
+		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
 		(*n)++;
 		block++;
@@ -2001,6 +2031,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 				     u32 blen, unsigned char new_state)
 {
 	struct gfs2_rbm rbm;
+	struct gfs2_bitmap *bi;
 
 	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
 	if (!rbm.rgd) {
@@ -2011,15 +2042,15 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 
 	while (blen--) {
 		gfs2_rbm_from_block(&rbm, bstart);
+		bi = rbm_bi(&rbm);
 		bstart++;
-		if (!rbm.bi->bi_clone) {
-			rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size,
-						   GFP_NOFS | __GFP_NOFAIL);
-			memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset,
-			       rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
-			       rbm.bi->bi_len);
+		if (!bi->bi_clone) {
+			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+					       GFP_NOFS | __GFP_NOFAIL);
+			memcpy(bi->bi_clone + bi->bi_offset,
+			       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
 		}
-		gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);
+		gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
 		gfs2_setbit(&rbm, false, new_state);
 	}
 
@@ -2103,6 +2134,35 @@ out:
 }
 
 /**
+ * gfs2_set_alloc_start - Set starting point for block allocation
+ * @rbm: The rbm which will be set to the required location
+ * @ip: The gfs2 inode
+ * @dinode: Flag to say if allocation includes a new inode
+ *
+ * This sets the starting point from the reservation if one is active
+ * otherwise it falls back to guessing a start point based on the
+ * inode's goal block or the last allocation point in the rgrp.
+ */
+
+static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
+				 const struct gfs2_inode *ip, bool dinode)
+{
+	u64 goal;
+
+	if (gfs2_rs_active(ip->i_res)) {
+		*rbm = ip->i_res->rs_rbm;
+		return;
+	}
+
+	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
+		goal = ip->i_goal;
+	else
+		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
+
+	gfs2_rbm_from_block(rbm, goal);
+}
+
+/**
  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
  * @ip: the inode to allocate the block for
  * @bn: Used to return the starting block number
@@ -2120,22 +2180,14 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	struct buffer_head *dibh;
 	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
 	unsigned int ndata;
-	u64 goal;
 	u64 block; /* block, within the file system scope */
 	int error;
 
-	if (gfs2_rs_active(ip->i_res))
-		goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm);
-	else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal))
-		goal = ip->i_goal;
-	else
-		goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0;
-
-	gfs2_rbm_from_block(&rbm, goal);
+	gfs2_set_alloc_start(&rbm, ip, dinode);
 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false);
 
 	if (error == -ENOSPC) {
-		gfs2_rbm_from_block(&rbm, goal);
+		gfs2_set_alloc_start(&rbm, ip, dinode);
 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false);
 	}
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5b3f4a896e6c..3a10d2ffbbe7 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -40,7 +40,7 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 
 #define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags);
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap);
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
 extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
@@ -48,7 +48,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
 extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip);
+extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5639dec66c4..35da5b19c0de 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1526,7 +1526,7 @@ out_unlock:
 out:
 	/* Case 3 starts here */
 	truncate_inode_pages(&inode->i_data, 0);
-	gfs2_rs_delete(ip);
+	gfs2_rs_delete(ip, NULL);
 	gfs2_ordered_del_inode(ip);
 	clear_inode(inode);
 	gfs2_dir_hash_inval(ip);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index aa5c48044966..d09f6edda0ff 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -587,7 +587,6 @@ TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
@@ -597,7 +596,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_max_readahead.attr,
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
-	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 6402fb69d71b..f7109f689e61 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -268,23 +268,3 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	return rv;
 }
 
-void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
-		      unsigned int bit, int new_value)
-{
-	unsigned int c, o, b = bit;
-	int old_value;
-
-	c = b / (8 * PAGE_SIZE);
-	b %= 8 * PAGE_SIZE;
-	o = b / 8;
-	b %= 8;
-
-	old_value = (bitmap[c][o] & (1 << b));
-	gfs2_assert_withdraw(sdp, !old_value != !new_value);
-
-	if (new_value)
-		bitmap[c][o] |= 1 << b;
-	else
-		bitmap[c][o] &= ~(1 << b);
-}
-
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 80535739ac7b..b7ffb09b99ea 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -164,8 +164,6 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
 #define gfs2_tune_get(sdp, field) \
 gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
 
-void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
-		      unsigned int bit, int new_value);
 int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
 
 #endif /* __UTIL_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index ecd37f30ab91..8c6a6f6bdba9 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -723,6 +723,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 			     unsigned int blks,
 			     ea_skeleton_call_t skeleton_call, void *private)
 {
+	struct gfs2_alloc_parms ap = { .target = blks };
 	struct buffer_head *dibh;
 	int error;
 
@@ -734,7 +735,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		return error;
 
-	error = gfs2_inplace_reserve(ip, blks, 0);
+	error = gfs2_inplace_reserve(ip, &ap);
 	if (error)
 		goto out_gunlock_q;
 
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index c1a3e603279c..7f464c513ba0 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 
 	if (insert_inode_locked(inode) < 0) {
 		rc = -EINVAL;
-		goto fail_unlock;
+		goto fail_put;
 	}
 
 	inode_init_owner(inode, parent, mode);
@@ -156,7 +156,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 fail_drop:
 	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
-fail_unlock:
 	clear_nlink(inode);
 	unlock_new_inode(inode);
 fail_put:
diff --git a/fs/namei.c b/fs/namei.c
index 0dc4cbf21f37..caa28051e197 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2294,10 +2294,11 @@ out:
  * path_mountpoint - look up a path to be umounted
  * @dfd:	directory file descriptor to start walk from
  * @name:	full pathname to walk
+ * @path:	pointer to container for result
  * @flags:	lookup flags
  *
  * Look up the given name, but don't attempt to revalidate the last component.
- * Returns 0 and "path" will be valid on success; Retuns error otherwise.
+ * Returns 0 and "path" will be valid on success; Returns error otherwise.
  */
 static int
 path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
@@ -2656,6 +2657,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	int acc_mode;
 	int create_error = 0;
 	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
+	bool excl;
 
 	BUG_ON(dentry->d_inode);
 
@@ -2669,10 +2671,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
 		mode &= ~current_umask();
 
-	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
+	excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
+	if (excl)
 		open_flag &= ~O_TRUNC;
-		*opened |= FILE_CREATED;
-	}
 
 	/*
 	 * Checking write permission is tricky, bacuse we don't know if we are
@@ -2725,12 +2726,6 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 		goto out;
 	}
 
-	acc_mode = op->acc_mode;
-	if (*opened & FILE_CREATED) {
-		fsnotify_create(dir, dentry);
-		acc_mode = MAY_OPEN;
-	}
-
 	if (error) {	/* returned 1, that is */
 		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
 			error = -EIO;
@@ -2740,9 +2735,19 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 			dput(dentry);
 			dentry = file->f_path.dentry;
 		}
-		if (create_error && dentry->d_inode == NULL) {
-			error = create_error;
-			goto out;
+		if (*opened & FILE_CREATED)
+			fsnotify_create(dir, dentry);
+		if (!dentry->d_inode) {
+			WARN_ON(*opened & FILE_CREATED);
+			if (create_error) {
+				error = create_error;
+				goto out;
+			}
+		} else {
+			if (excl && !(*opened & FILE_CREATED)) {
+				error = -EEXIST;
+				goto out;
+			}
 		}
 		goto looked_up;
 	}
@@ -2751,6 +2756,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	 * We didn't have the inode before the open, so check open permission
 	 * here.
 	 */
+	acc_mode = op->acc_mode;
+	if (*opened & FILE_CREATED) {
+		WARN_ON(!(open_flag & O_CREAT));
+		fsnotify_create(dir, dentry);
+		acc_mode = MAY_OPEN;
+	}
 	error = may_open(&file->f_path, acc_mode, open_flag);
 	if (error)
 		fput(file);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b5e80b0af315..38c1768b4142 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
 	  If the NFS client is unchanged from the upstream kernel, this
 	  option should be set to the default "kernel.org".
 
+config NFS_V4_1_MIGRATION
+	bool "NFSv4.1 client support for migration"
+	depends on NFS_V4_1
+	default n
+	help
+	  This option makes the NFS client advertise to NFSv4.1 servers that
+          it can support NFSv4 migration.
+
+          The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+          still experimental.  If you are not an NFSv4 developer, say N here.
+
 config NFS_V4_SECURITY_LABEL
 	bool
 	depends on NFS_V4_2 && SECURITY
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 67cd73213168..073b4cf67ed9 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv)
 		svc_xprt_put(serv->sv_bc_xprt);
 		serv->sv_bc_xprt = NULL;
 	}
-	dprintk("--> %s return %ld\n", __func__,
-		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
+	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
 	return rqstp;
 }
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2dceee4db076..1d09289c8f0e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 
 	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
 	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
@@ -784,8 +786,10 @@ static int nfs_init_server(struct nfs_server *server,
 		goto error;
 
 	server->port = data->nfs_server.port;
+	server->auth_info = data->auth_info;
 
-	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
 	if (error < 0)
 		goto error;
 
@@ -926,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour
 	target->acdirmax = source->acdirmax;
 	target->caps = source->caps;
 	target->options = source->options;
+	target->auth_info = source->auth_info;
 }
 EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
 
@@ -943,7 +948,7 @@ void nfs_server_insert_lists(struct nfs_server *server)
 }
 EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
 
-static void nfs_server_remove_lists(struct nfs_server *server)
+void nfs_server_remove_lists(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_net *nn;
@@ -960,6 +965,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)
 
 	synchronize_rcu();
 }
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
 
 /*
  * Allocate and initialise a server record
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index de434f309af0..9a8676f33350 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1139,7 +1139,13 @@ out_zap_parent:
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
 		nfs_zap_caches(inode);
-		if (dentry->d_flags & DCACHE_DISCONNECTED)
+		/*
+		 * We can't d_drop the root of a disconnected tree:
+		 * its d_hash is on the s_anon list and d_drop() would hide
+		 * it from shrink_dcache_for_unmount(), leading to busy
+		 * inodes on unmount and further oopses.
+		 */
+		if (IS_ROOT(dentry))
 			goto out_valid;
 	}
 	/* If we have submounts, don't unhash ! */
@@ -1381,7 +1387,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i
 
 static int do_open(struct inode *inode, struct file *filp)
 {
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
 
@@ -1392,6 +1398,9 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
 	int err;
 
+	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
 	err = finish_open(file, dentry, do_open, opened);
 	if (err)
 		goto out;
@@ -1455,7 +1464,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 
 	trace_nfs_atomic_open_enter(dir, ctx, open_flags);
 	nfs_block_sillyrename(dentry->d_parent);
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
 	nfs_unblock_sillyrename(dentry->d_parent);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 24d1d1c5fcaf..3ef01f0ba0bc 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
 	/* create a cache index for looking up filehandles */
 	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
 					      &nfs_fscache_server_index_def,
-					      clp);
+					      clp, true);
 	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
 		 clp, clp->fscache);
 }
@@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
 	/* create a cache index for looking up filehandles */
 	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
 					       &nfs_fscache_super_index_def,
-					       nfss);
+					       nfss, true);
 	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
 		 nfss, nfss->fscache);
 	return;
@@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
 /*
  * Initialise the per-inode cache cookie pointer for an NFS inode.
  */
-void nfs_fscache_init_inode_cookie(struct inode *inode)
+void nfs_fscache_init_inode(struct inode *inode)
 {
-	NFS_I(inode)->fscache = NULL;
-	if (S_ISREG(inode->i_mode))
-		set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-}
-
-/*
- * Get the per-inode cache cookie for an NFS inode.
- */
-static void nfs_fscache_enable_inode_cookie(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (nfsi->fscache || !NFS_FSCACHE(inode))
+	nfsi->fscache = NULL;
+	if (!S_ISREG(inode->i_mode))
 		return;
-
-	if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
-		nfsi->fscache = fscache_acquire_cookie(
-			NFS_SB(sb)->fscache,
-			&nfs_fscache_inode_object_def,
-			nfsi);
-
-		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
-			 sb, nfsi, nfsi->fscache);
-	}
+	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+					       &nfs_fscache_inode_object_def,
+					       nfsi, false);
 }
 
 /*
  * Release a per-inode cookie.
  */
-void nfs_fscache_release_inode_cookie(struct inode *inode)
+void nfs_fscache_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
-		 nfsi, nfsi->fscache);
+	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
 
-	fscache_relinquish_cookie(nfsi->fscache, 0);
+	fscache_relinquish_cookie(cookie, false);
 	nfsi->fscache = NULL;
 }
 
-/*
- * Retire a per-inode cookie, destroying the data attached to it.
- */
-void nfs_fscache_zap_inode_cookie(struct inode *inode)
+static bool nfs_fscache_can_enable(void *data)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct inode *inode = data;
 
-	dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
-		 nfsi, nfsi->fscache);
-
-	fscache_relinquish_cookie(nfsi->fscache, 1);
-	nfsi->fscache = NULL;
+	return !inode_is_open_for_write(inode);
 }
 
 /*
- * Turn off the cache with regard to a per-inode cookie if opened for writing,
- * invalidating all the pages in the page cache relating to the associated
- * inode to clear the per-page caching.
- */
-static void nfs_fscache_disable_inode_cookie(struct inode *inode)
-{
-	clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-
-	if (NFS_I(inode)->fscache) {
-		dfprintk(FSCACHE,
-			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
-
-		/* Need to uncache any pages attached to this inode that
-		 * fscache knows about before turning off the cache.
-		 */
-		fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode);
-		nfs_fscache_zap_inode_cookie(inode);
-	}
-}
-
-/*
- * wait_on_bit() sleep function for uninterruptible waiting
- */
-static int nfs_fscache_wait_bit(void *flags)
-{
-	schedule();
-	return 0;
-}
-
-/*
- * Lock against someone else trying to also acquire or relinquish a cookie
- */
-static inline void nfs_fscache_inode_lock(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
-		wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
-			    nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * Unlock cookie management lock
- */
-static inline void nfs_fscache_inode_unlock(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	smp_mb__before_clear_bit();
-	clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
-}
-
-/*
- * Decide if we should enable or disable local caching for this inode.
- * - For now, with NFS, only regular files that are open read-only will be able
- *   to use the cache.
- * - May be invoked multiple times in parallel by parallel nfs_open() functions.
- */
-void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
-{
-	if (NFS_FSCACHE(inode)) {
-		nfs_fscache_inode_lock(inode);
-		if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
-			nfs_fscache_disable_inode_cookie(inode);
-		else
-			nfs_fscache_enable_inode_cookie(inode);
-		nfs_fscache_inode_unlock(inode);
-	}
-}
-EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
-
-/*
- * Replace a per-inode cookie due to revalidation detecting a file having
- * changed on the server.
+ * Enable or disable caching for a file that is being opened as appropriate.
+ * The cookie is allocated when the inode is initialised, but is not enabled at
+ * that time.  Enablement is deferred to file-open time to avoid stat() and
+ * access() thrashing the cache.
+ *
+ * For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ *
+ * We enable the cache for an inode if we open it read-only and it isn't
+ * currently open for writing.  We disable the cache if the inode is open
+ * write-only.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can make use of that.
+ *
+ * Note that this may be invoked multiple times in parallel by parallel
+ * nfs_open() functions.
  */
-void nfs_fscache_reset_inode_cookie(struct inode *inode)
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_server *nfss = NFS_SERVER(inode);
-	NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	nfs_fscache_inode_lock(inode);
-	if (nfsi->fscache) {
-		/* retire the current fscache cache and get a new one */
-		fscache_relinquish_cookie(nfsi->fscache, 1);
-
-		nfsi->fscache = fscache_acquire_cookie(
-			nfss->nfs_client->fscache,
-			&nfs_fscache_inode_object_def,
-			nfsi);
+	if (!fscache_cookie_valid(cookie))
+		return;
 
-		dfprintk(FSCACHE,
-			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
-			 nfss, nfsi, old, nfsi->fscache);
+	if (inode_is_open_for_write(inode)) {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
+		clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+		fscache_disable_cookie(cookie, true);
+		fscache_uncache_all_inode_pages(cookie, inode);
+	} else {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
+		fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+		if (fscache_cookie_enabled(cookie))
+			set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
 	}
-	nfs_fscache_inode_unlock(inode);
 }
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 
 /*
  * Release the caching state associated with a page, if the page isn't busy
@@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
 int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
 	if (PageFsCache(page)) {
-		struct nfs_inode *nfsi = NFS_I(page->mapping->host);
-		struct fscache_cookie *cookie = nfsi->fscache;
+		struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);
 
 		BUG_ON(!cookie);
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
-			 cookie, page, nfsi);
+			 cookie, page, NFS_I(page->mapping->host));
 
 		if (!fscache_maybe_release_page(cookie, page, gfp))
 			return 0;
@@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
  */
 void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct fscache_cookie *cookie = nfsi->fscache;
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
 	BUG_ON(!cookie);
 
 	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
-		 cookie, page, nfsi);
+		 cookie, page, NFS_I(inode));
 
 	fscache_wait_on_page_write(cookie, page);
 
@@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
 
 	dfprintk(FSCACHE,
 		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
-		 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+		 nfs_i_fscache(inode), page, page->index, page->flags, inode);
 
-	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+	ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
 					 page,
 					 nfs_readpage_from_fscache_complete,
 					 ctx,
@@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
 	int ret;
 
 	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
-		 NFS_I(inode)->fscache, npages, inode);
+		 nfs_i_fscache(inode), npages, inode);
 
-	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
 					  mapping, pages, nr_pages,
 					  nfs_readpage_from_fscache_complete,
 					  ctx,
@@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
 
 	dfprintk(FSCACHE,
 		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
-		 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+		 nfs_i_fscache(inode), page, page->index, page->flags, sync);
 
-	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+	ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
 	dfprintk(FSCACHE,
 		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
 		 page, page->index, page->flags, ret);
 
 	if (ret != 0) {
-		fscache_uncache_page(NFS_I(inode)->fscache, page);
+		fscache_uncache_page(nfs_i_fscache(inode), page);
 		nfs_add_fscache_stats(inode,
 				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
 		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4ecb76652eba..d7fe3e799f2f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
-extern void nfs_fscache_init_inode_cookie(struct inode *);
-extern void nfs_fscache_release_inode_cookie(struct inode *);
-extern void nfs_fscache_zap_inode_cookie(struct inode *);
-extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
-extern void nfs_fscache_reset_inode_cookie(struct inode *);
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
 
 extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
 extern int nfs_fscache_release_page(struct page *, gfp_t);
@@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
-static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
-						struct file *filp) {}
-static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+					 struct file *filp) {}
 
 static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879171c4..18ab2da4eeb6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -122,7 +122,7 @@ void nfs_clear_inode(struct inode *inode)
 	WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
 	nfs_zap_acl_cache(inode);
 	nfs_access_zap_cache(inode);
-	nfs_fscache_release_inode_cookie(inode);
+	nfs_fscache_clear_inode(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_clear_inode);
 
@@ -274,12 +274,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 	if (label == NULL)
 		return;
 
-	if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
-		return;
-
-	if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
-		return;
-
 	if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
 		error = security_inode_notifysecctx(inode, label->label,
 				label->len);
@@ -459,7 +453,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->access_cache = RB_ROOT;
 
-		nfs_fscache_init_inode_cookie(inode);
+		nfs_fscache_init_inode(inode);
 
 		unlock_new_inode(inode);
 	} else
@@ -854,7 +848,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 		return PTR_ERR(ctx);
 	nfs_file_set_open_context(filp, ctx);
 	put_nfs_open_context(ctx);
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	return 0;
 }
 
@@ -923,6 +917,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
+	nfs_setsecurity(inode, fattr, label);
+
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
@@ -1209,6 +1205,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
 	 * not on the result */
 	return nfs_fhandle_hash(fh);
 }
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
 
 /*
  * _nfs_display_fhandle - display an NFS file handle on the console
@@ -1253,6 +1250,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
 #endif
 
 /**
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 38da8c2b81ac..bca6a3e3c49c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -88,8 +88,8 @@ struct nfs_parsed_mount_data {
 	unsigned int		namlen;
 	unsigned int		options;
 	unsigned int		bsize;
-	unsigned int		auth_flavor_len;
-	rpc_authflavor_t	auth_flavors[1];
+	struct nfs_auth_info	auth_info;
+	rpc_authflavor_t	selected_flavor;
 	char			*client_address;
 	unsigned int		version;
 	unsigned int		minorversion;
@@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
 				  rpc_authflavor_t);
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
 void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
 int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
 		rpc_authflavor_t);
@@ -174,6 +175,8 @@ extern struct nfs_server *nfs4_create_server(
 					struct nfs_subversion *);
 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
 						      struct nfs_fh *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+					struct sockaddr *sap, size_t salen);
 extern void nfs_free_server(struct nfs_server *server);
 extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
@@ -323,6 +326,7 @@ extern struct file_system_type nfs_xdev_fs_type;
 extern struct file_system_type nfs4_xdev_fs_type;
 extern struct file_system_type nfs4_referral_fs_type;
 #endif
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
 struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,
 			struct nfs_subversion *);
 void nfs_initialise_sb(struct super_block *);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 28842abafab4..3ce79b04522e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -29,6 +29,8 @@ enum nfs4_client_state {
 	NFS4CLNT_SERVER_SCOPE_MISMATCH,
 	NFS4CLNT_PURGE_STATE,
 	NFS4CLNT_BIND_CONN_TO_SESSION,
+	NFS4CLNT_MOVED,
+	NFS4CLNT_LEASE_MOVED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -50,6 +52,7 @@ struct nfs4_minor_version_ops {
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
+	const struct nfs4_mig_recovery_ops *mig_recovery_ops;
 };
 
 #define NFS_SEQID_CONFIRMED 1
@@ -203,6 +206,12 @@ struct nfs4_state_maintenance_ops {
 	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
 };
 
+struct nfs4_mig_recovery_ops {
+	int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
+		struct page *, struct rpc_cred *);
+	int (*fsid_present)(struct inode *, struct rpc_cred *);
+};
+
 extern const struct dentry_operations nfs4_dentry_operations;
 
 /* dir.c */
@@ -213,10 +222,11 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
 extern struct file_system_type nfs4_fs_type;
 
 /* nfs4namespace.c */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
 struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
 			       struct nfs_fh *, struct nfs_fattr *);
+int nfs4_replace_transport(struct nfs_server *server,
+				const struct nfs4_fs_locations *locations);
 
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
@@ -231,6 +241,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
 				  struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
+		struct page *page, struct rpc_cred *);
+extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
@@ -411,6 +424,8 @@ extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index a860ab566d6e..b4a160a405ce 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -197,6 +197,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+	clp->cl_mig_gen = 1;
 	return clp;
 
 error:
@@ -368,6 +369,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	if (clp->cl_minorversion != 0)
 		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
 	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
 	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
 	if (error == -EINVAL)
 		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
@@ -924,7 +926,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 	dprintk("Server FSID: %llx:%llx\n",
 			(unsigned long long) server->fsid.major,
 			(unsigned long long) server->fsid.minor);
-	dprintk("Mount FH: %d\n", mntfh->size);
+	nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
 
 	nfs4_session_set_rwsize(server);
 
@@ -947,9 +949,8 @@ out:
  * Create a version 4 volume record
  */
 static int nfs4_init_server(struct nfs_server *server,
-		const struct nfs_parsed_mount_data *data)
+		struct nfs_parsed_mount_data *data)
 {
-	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
 	struct rpc_timeout timeparms;
 	int error;
 
@@ -961,9 +962,15 @@ static int nfs4_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->options = data->options;
+	server->auth_info = data->auth_info;
 
-	if (data->auth_flavor_len >= 1)
-		pseudoflavor = data->auth_flavors[0];
+	/* Use the first specified auth flavor. If this flavor isn't
+	 * allowed by the server, use the SECINFO path to try the
+	 * other specified flavors */
+	if (data->auth_info.flavor_len >= 1)
+		data->selected_flavor = data->auth_info.flavors[0];
+	else
+		data->selected_flavor = RPC_AUTH_UNIX;
 
 	/* Get a client record */
 	error = nfs4_set_client(server,
@@ -971,7 +978,7 @@ static int nfs4_init_server(struct nfs_server *server,
 			(const struct sockaddr *)&data->nfs_server.address,
 			data->nfs_server.addrlen,
 			data->client_address,
-			pseudoflavor,
+			data->selected_flavor,
 			data->nfs_server.protocol,
 			&timeparms,
 			data->minorversion,
@@ -991,7 +998,8 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	server->port = data->nfs_server.port;
 
-	error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor);
+	error = nfs_init_server_rpcclient(server, &timeparms,
+					  data->selected_flavor);
 
 error:
 	/* Done */
@@ -1018,7 +1026,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
-	auth_probe = mount_info->parsed->auth_flavor_len < 1;
+	auth_probe = mount_info->parsed->auth_info.flavor_len < 1;
 
 	/* set up the general RPC client */
 	error = nfs4_init_server(server, mount_info->parsed);
@@ -1046,6 +1054,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
+	bool auth_probe;
 	int error;
 
 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1078,8 +1087,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
-	error = nfs4_server_common_setup(server, mntfh,
-			!(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
+	auth_probe = parent_server->auth_info.flavor_len < 1;
+
+	error = nfs4_server_common_setup(server, mntfh, auth_probe);
 	if (error < 0)
 		goto error;
 
@@ -1091,3 +1101,111 @@ error:
 	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
 	return ERR_PTR(error);
 }
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+static int nfs_probe_destination(struct nfs_server *server)
+{
+	struct inode *inode = server->super->s_root->d_inode;
+	struct nfs_fattr *fattr;
+	int error;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/* Sanity: the probe won't work if the destination server
+	 * does not recognize the migrated FH. */
+	error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
+
+	nfs_free_fattr(fattr);
+	return error;
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+		       struct sockaddr *sap, size_t salen)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct xprt_create xargs = {
+		.ident		= clp->cl_proto,
+		.net		= &init_net,
+		.dstaddr	= sap,
+		.addrlen	= salen,
+		.servername	= hostname,
+	};
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct sockaddr_storage address;
+	struct sockaddr *localaddr = (struct sockaddr *)&address;
+	int error;
+
+	dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			hostname);
+
+	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = rpc_localaddr(clnt, localaddr, sizeof(address));
+	if (error != 0) {
+		dprintk("<-- %s(): rpc_localaddr returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	error = -EAFNOSUPPORT;
+	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
+		dprintk("<-- %s(): rpc_ntop returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	nfs_server_remove_lists(server);
+	error = nfs4_set_client(server, hostname, sap, salen, buf,
+				clp->cl_rpcclient->cl_auth->au_flavor,
+				clp->cl_proto, clnt->cl_timeout,
+				clp->cl_minorversion, clp->cl_net);
+	nfs_put_client(clp);
+	if (error != 0) {
+		nfs_server_insert_lists(server);
+		dprintk("<-- %s(): nfs4_set_client returned %d\n",
+			__func__, error);
+		goto out;
+	}
+
+	if (server->nfs_client->cl_hostname == NULL)
+		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+	nfs_server_insert_lists(server);
+
+	error = nfs_probe_destination(server);
+	if (error < 0)
+		goto out;
+
+	dprintk("<-- %s() succeeded\n", __func__);
+
+out:
+	return error;
+}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e5b804dd944c..1f01b55692ee 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -19,6 +19,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	struct inode *dir;
 	unsigned openflags = filp->f_flags;
 	struct iattr attr;
+	int opened = 0;
 	int err;
 
 	/*
@@ -55,7 +56,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 		nfs_wb_all(inode);
 	}
 
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		switch (err) {
@@ -74,7 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	nfs_file_set_open_context(filp, ctx);
-	nfs_fscache_set_inode_cookie(inode, filp);
+	nfs_fscache_open_file(inode, filp);
 	err = 0;
 
 out_put_ctx:
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 95604f64cab8..c7c295e556ed 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -185,6 +185,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 	if (status)
 		goto out_put;
 
+	smp_wmb();
 	ds->ds_clp = clp;
 	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
 out:
@@ -801,34 +802,35 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
-
-	if (filelayout_test_devid_unavailable(devid))
-		return NULL;
+	struct nfs4_pnfs_ds *ret = ds;
 
 	if (ds == NULL) {
 		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
 		filelayout_mark_devid_invalid(devid);
-		return NULL;
+		goto out;
 	}
+	smp_rmb();
 	if (ds->ds_clp)
-		return ds;
+		goto out_test_devid;
 
 	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
 		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 		int err;
 
 		err = nfs4_ds_connect(s, ds);
-		if (err) {
+		if (err)
 			nfs4_mark_deviceid_unavailable(devid);
-			ds = NULL;
-		}
 		nfs4_clear_ds_conn_bit(ds);
 	} else {
 		/* Either ds is connected, or ds is NULL */
 		nfs4_wait_ds_connect(ds);
 	}
-	return ds;
+out_test_devid:
+	if (filelayout_test_devid_unavailable(devid))
+		ret = NULL;
+out:
+	return ret;
 }
 
 module_param(dataserver_retrans, uint, 0644);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2288cd3c9278..c08cbf40c59e 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 
 /**
  * nfs_find_best_sec - Find a security mechanism supported locally
+ * @server: NFS server struct
  * @flavors: List of security tuples returned by SECINFO procedure
  *
  * Return the pseudoflavor of the first security mechanism in
@@ -145,7 +146,8 @@ static size_t nfs_parse_server_name(char *string, size_t len,
  * is searched in the order returned from the server, per RFC 3530
  * recommendation.
  */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server,
+					  struct nfs4_secinfo_flavors *flavors)
 {
 	rpc_authflavor_t pseudoflavor;
 	struct nfs4_secinfo4 *secinfo;
@@ -160,12 +162,19 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 		case RPC_AUTH_GSS:
 			pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
 							&secinfo->flavor_info);
-			if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
+			/* make sure pseudoflavor matches sec= mount opt */
+			if (pseudoflavor != RPC_AUTH_MAXFLAVOR &&
+			    nfs_auth_info_match(&server->auth_info,
+						pseudoflavor))
 				return pseudoflavor;
 			break;
 		}
 	}
 
+	/* if there were any sec= options then nothing matched */
+	if (server->auth_info.flavor_len > 0)
+		return -EPERM;
+
 	return RPC_AUTH_UNIX;
 }
 
@@ -187,7 +196,7 @@ static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr
 		goto out;
 	}
 
-	flavor = nfs_find_best_sec(flavors);
+	flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors);
 
 out:
 	put_page(page);
@@ -390,7 +399,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
 
 	if (client->cl_auth->au_flavor != flavor)
 		flavor = client->cl_auth->au_flavor;
-	else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
+	else {
 		rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
 		if ((int)new >= 0)
 			flavor = new;
@@ -400,3 +409,104 @@ out:
 	rpc_shutdown_client(client);
 	return mnt;
 }
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+		char *page, char *page2,
+		const struct nfs4_fs_location *location)
+{
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+	struct sockaddr *sap;
+	unsigned int s;
+	size_t salen;
+	int error;
+
+	sap = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (sap == NULL)
+		return -ENOMEM;
+
+	error = -ENOENT;
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+		char *hostname;
+
+		if (buf->len <= 0 || buf->len > PAGE_SIZE)
+			continue;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+			continue;
+
+		salen = nfs_parse_server_name(buf->data, buf->len,
+						sap, addr_bufsize, server);
+		if (salen == 0)
+			continue;
+		rpc_set_port(sap, NFS_PORT);
+
+		error = -ENOMEM;
+		hostname = kstrndup(buf->data, buf->len, GFP_KERNEL);
+		if (hostname == NULL)
+			break;
+
+		error = nfs4_update_server(server, hostname, sap, salen);
+		kfree(hostname);
+		if (error == 0)
+			break;
+	}
+
+	kfree(sap);
+	return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+			   const struct nfs4_fs_locations *locations)
+{
+	char *page = NULL, *page2 = NULL;
+	int loc, error;
+
+	error = -ENOENT;
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	error = -ENOMEM;
+	page = (char *) __get_free_page(GFP_USER);
+	if (!page)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (!page2)
+		goto out;
+
+	for (loc = 0; loc < locations->nlocations; loc++) {
+		const struct nfs4_fs_location *location =
+						&locations->locations[loc];
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0)
+			continue;
+
+		error = nfs4_try_replacing_one_location(server, page,
+							page2, location);
+		if (error == 0)
+			break;
+	}
+
+out:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+	return error;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 989bb9d3074d..5ab33c0792df 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
 	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
 		return NULL;
 
-	if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
-		return NULL;
-
 	err = security_dentry_init_security(dentry, sattr->ia_mode,
 				&dentry->d_name, (void **)&label->label, &label->len);
 	if (err == 0)
@@ -384,6 +381,14 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			ret = nfs4_schedule_migration_recovery(server);
+			if (ret < 0)
+				break;
+			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 	return nfs4_map_errors(ret);
 wait_on_recovery:
 	ret = nfs4_wait_clnt_recover(clp);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -EIO;
 	if (ret == 0)
 		exception->retry = 1;
 	return ret;
@@ -912,6 +919,7 @@ struct nfs4_opendata {
 	struct iattr attrs;
 	unsigned long timestamp;
 	unsigned int rpc_done : 1;
+	unsigned int file_created : 1;
 	unsigned int is_recover : 1;
 	int rpc_status;
 	int cancelled;
@@ -1317,31 +1325,24 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	int ret;
 
 	if (!data->rpc_done) {
-		ret = data->rpc_status;
-		goto err;
+		if (data->rpc_status) {
+			ret = data->rpc_status;
+			goto err;
+		}
+		/* cached opens have already been processed */
+		goto update;
 	}
 
-	ret = -ESTALE;
-	if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
-	    !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
-	    !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
-		goto err;
-
-	ret = -ENOMEM;
-	state = nfs4_get_open_state(inode, data->owner);
-	if (state == NULL)
-		goto err;
-
 	ret = nfs_refresh_inode(inode, &data->f_attr);
 	if (ret)
 		goto err;
 
-	nfs_setsecurity(inode, &data->f_attr, data->f_label);
-
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
+update:
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			    data->o_arg.fmode);
+	atomic_inc(&state->count);
 
 	return state;
 err:
@@ -1574,6 +1575,12 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			/* Don't recall a delegation if it was lost */
 			nfs4_schedule_lease_recovery(server->nfs_client);
 			return -EAGAIN;
+		case -NFS4ERR_MOVED:
+			nfs4_schedule_migration_recovery(server);
+			return -EAGAIN;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(server->nfs_client);
+			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
@@ -1946,8 +1953,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 
 	nfs_fattr_map_and_free_names(server, &data->f_attr);
 
-	if (o_arg->open_flags & O_CREAT)
+	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
+		if (o_arg->open_flags & O_EXCL)
+			data->file_created = 1;
+		else if (o_res->cinfo.before != o_res->cinfo.after)
+			data->file_created = 1;
+	}
 	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
 		server->caps &= ~NFS_CAP_POSIX_LOCK;
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -2191,7 +2203,8 @@ static int _nfs4_do_open(struct inode *dir,
 			struct nfs_open_context *ctx,
 			int flags,
 			struct iattr *sattr,
-			struct nfs4_label *label)
+			struct nfs4_label *label,
+			int *opened)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
@@ -2261,6 +2274,8 @@ static int _nfs4_do_open(struct inode *dir,
 			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
 		}
 	}
+	if (opendata->file_created)
+		*opened |= FILE_CREATED;
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
 		*ctx_th = opendata->f_attr.mdsthreshold;
@@ -2289,7 +2304,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 					struct nfs_open_context *ctx,
 					int flags,
 					struct iattr *sattr,
-					struct nfs4_label *label)
+					struct nfs4_label *label,
+					int *opened)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_exception exception = { };
@@ -2297,7 +2313,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+		status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);
 		res = ctx->state;
 		trace_nfs4_open_file(ctx, flags, status);
 		if (status == 0)
@@ -2659,7 +2675,8 @@ out:
 }
 
 static struct inode *
-nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
+		int open_flags, struct iattr *attr, int *opened)
 {
 	struct nfs4_state *state;
 	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
@@ -2667,7 +2684,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);
 
 	nfs4_label_release_security(label);
 
@@ -2686,6 +2703,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 		nfs4_close_state(ctx->state, ctx->mode);
 }
 
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL)
+
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_server_caps_arg args = {
@@ -2701,12 +2722,25 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 	if (status == 0) {
+		/* Sanity check the server answers */
+		switch (server->nfs_client->cl_minorversion) {
+		case 0:
+			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+			res.attr_bitmask[2] = 0;
+			break;
+		case 1:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+			break;
+		case 2:
+			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+		}
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
 		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
 				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
 				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
-				NFS_CAP_CTIME|NFS_CAP_MTIME);
+				NFS_CAP_CTIME|NFS_CAP_MTIME|
+				NFS_CAP_SECURITY_LABEL);
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
 			server->caps |= NFS_CAP_ACLS;
 		if (res.has_links != 0)
@@ -2735,14 +2769,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 #endif
 		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
 				sizeof(server->attr_bitmask));
+		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
 
-		if (server->caps & NFS_CAP_SECURITY_LABEL) {
-			server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-			res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-		}
 		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		server->cache_consistency_bitmask[2] = 0;
 		server->acl_bitmask = res.acl_bitmask;
 		server->fh_expire_type = res.fh_expire_type;
 	}
@@ -2853,11 +2885,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	int status = -EPERM;
 	size_t i;
 
-	for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
-		status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
-		if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
-			continue;
-		break;
+	if (server->auth_info.flavor_len > 0) {
+		/* try each flavor specified by user */
+		for (i = 0; i < server->auth_info.flavor_len; i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						server->auth_info.flavors[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
+	} else {
+		/* no flavors specified by user, try default list */
+		for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
+			status = nfs4_lookup_root_sec(server, fhandle, info,
+						      flav_array[i]);
+			if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+				continue;
+			break;
+		}
 	}
 
 	/*
@@ -2899,9 +2944,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 		status = nfs4_lookup_root(server, fhandle, info);
 		if (status != -NFS4ERR_WRONGSEC)
 			break;
-		/* Did user force a 'sec=' mount option? */
-		if (server->flags & NFS_MOUNT_SECFLAVOUR)
-			break;
 	default:
 		status = nfs4_do_find_root_sec(server, fhandle, info);
 	}
@@ -2970,11 +3012,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
 	status = nfs4_proc_fs_locations(client, dir, name, locations, page);
 	if (status != 0)
 		goto out;
-	/* Make sure server returned a different fsid for the referral */
+
+	/*
+	 * If the fsid didn't change, this is a migration event, not a
+	 * referral.  Cause us to drop into the exception handler, which
+	 * will kick off migration recovery.
+	 */
 	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
 		dprintk("%s: server did not return a different fsid for"
 			" a referral at %s\n", __func__, name->name);
-		status = -EIO;
+		status = -NFS4ERR_MOVED;
 		goto out;
 	}
 	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
@@ -3154,9 +3201,6 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			err = -EPERM;
 			if (client != *clnt)
 				goto out;
-			/* No security negotiation if the user specified 'sec=' */
-			if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
-				goto out;
 			client = nfs4_create_sec_client(client, dir, name);
 			if (IS_ERR(client))
 				return PTR_ERR(client);
@@ -3332,6 +3376,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
+	int opened = 0;
 	int status = 0;
 
 	ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3341,7 +3386,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
@@ -4209,7 +4254,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 	unsigned long timestamp = data->timestamp;
 
 	trace_nfs4_renew_async(clp, task->tk_status);
-	if (task->tk_status < 0) {
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LEASE_MOVED:
+		nfs4_schedule_lease_moved_recovery(clp);
+		break;
+	default:
 		/* Unless we're shutting down, schedule state recovery! */
 		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
 			return;
@@ -4563,7 +4614,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
 	struct nfs4_label label = {0, 0, buflen, buf};
 
 	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
-	struct nfs4_getattr_arg args = {
+	struct nfs4_getattr_arg arg = {
 		.fh		= NFS_FH(inode),
 		.bitmask	= bitmask,
 	};
@@ -4574,14 +4625,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
-		.rpc_argp	= &args,
+		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
 	int ret;
 
 	nfs_fattr_init(&fattr);
 
-	ret = rpc_call_sync(server->client, &msg, 0);
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
 	if (ret)
 		return ret;
 	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
@@ -4618,7 +4669,7 @@ static int _nfs4_do_set_security_label(struct inode *inode,
 	struct iattr sattr = {0};
 	struct nfs_server *server = NFS_SERVER(inode);
 	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
-	struct nfs_setattrargs args = {
+	struct nfs_setattrargs arg = {
 		.fh             = NFS_FH(inode),
 		.iap            = &sattr,
 		.server		= server,
@@ -4632,14 +4683,14 @@ static int _nfs4_do_set_security_label(struct inode *inode,
 	};
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
-		.rpc_argp       = &args,
+		.rpc_argp       = &arg,
 		.rpc_resp       = &res,
 	};
 	int status;
 
-	nfs4_stateid_copy(&args.stateid, &zero_stateid);
+	nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 
-	status = rpc_call_sync(server->client, &msg, 0);
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (status)
 		dprintk("%s failed: %d\n", __func__, status);
 
@@ -4723,17 +4774,24 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (state == NULL)
 				break;
 			if (nfs4_schedule_stateid_recovery(server, state) < 0)
-				goto stateid_invalid;
+				goto recovery_failed;
 			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
 			if (state != NULL) {
 				if (nfs4_schedule_stateid_recovery(server, state) < 0)
-					goto stateid_invalid;
+					goto recovery_failed;
 			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
+		case -NFS4ERR_MOVED:
+			if (nfs4_schedule_migration_recovery(server) < 0)
+				goto recovery_failed;
+			goto wait_on_recovery;
+		case -NFS4ERR_LEASE_MOVED:
+			nfs4_schedule_lease_moved_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -4745,29 +4803,28 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
 			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto restart_call;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
-			task->tk_status = 0;
-			return -EAGAIN;
 		case -NFS4ERR_RETRY_UNCACHED_REP:
 		case -NFS4ERR_OLD_STATEID:
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto restart_call;
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
-stateid_invalid:
+recovery_failed:
 	task->tk_status = -EIO;
 	return 0;
 wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		goto recovery_failed;
+restart_call:
 	task->tk_status = 0;
 	return -EAGAIN;
 }
@@ -5094,6 +5151,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 			status = 0;
 	}
 	request->fl_ops->fl_release_private(request);
+	request->fl_ops = NULL;
 out:
 	return status;
 }
@@ -5767,6 +5825,7 @@ struct nfs_release_lockowner_data {
 	struct nfs_release_lockowner_args args;
 	struct nfs4_sequence_args seq_args;
 	struct nfs4_sequence_res seq_res;
+	unsigned long timestamp;
 };
 
 static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
@@ -5774,12 +5833,27 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
 	struct nfs_release_lockowner_data *data = calldata;
 	nfs40_setup_sequence(data->server,
 				&data->seq_args, &data->seq_res, task);
+	data->timestamp = jiffies;
 }
 
 static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_release_lockowner_data *data = calldata;
+	struct nfs_server *server = data->server;
+
 	nfs40_sequence_done(task, &data->seq_res);
+
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(server, data->timestamp);
+		break;
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_LEASE_MOVED:
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+			rpc_restart_call_prepare(task);
+	}
 }
 
 static void nfs4_release_lockowner_release(void *calldata)
@@ -5978,6 +6052,283 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
 	return err;
 }
 
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client.  A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.clientid	= server->nfs_client->cl_clientid,
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status)
+		return status;
+
+	renew_lease(server, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct inode *inode,
+				     struct nfs4_fs_locations *locations,
+				     struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.fh		= NFS_FH(inode),
+		.page		= page,
+		.bitmask	= bitmask,
+		.migration	= 1,		/* skip LOOKUP */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations	= locations,
+		.migration	= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+					&args.seq_args, &res.seq_res);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @inode: inode on FSID that is migrating
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct inode *inode,
+			    struct nfs4_fs_locations *locations,
+			    struct page *page, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->get_locations(inode, locations, page, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client.  A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+		.clientid	= clp->cl_clientid,
+		.renew		= 1,		/* append RENEW */
+	};
+	struct nfs4_fsid_present_res res = {
+		.renew		= 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	unsigned long now = jiffies;
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status)
+		return status;
+
+	do_renew_lease(clp, now);
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery.  The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = server->client;
+	struct nfs4_fsid_present_arg args = {
+		.fh		= NFS_FH(inode),
+	};
+	struct nfs4_fsid_present_res res = {
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status;
+
+	res.fh = nfs_alloc_fhandle();
+	if (res.fh == NULL)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+	nfs4_set_sequence_privileged(&args.seq_args);
+	status = nfs4_call_sync_sequence(clnt, server, &msg,
+						&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status == NFS4_OK &&
+	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		status = -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif	/* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized.  This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ *  NFS4ERR code if some error occurred on the server, or a
+ *  negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+					clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = { };
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	do {
+		status = ops->fsid_present(inode, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	return status;
+}
+
 /**
  * If 'use_integrity' is true and the state managment nfs_client
  * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
@@ -6264,8 +6615,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	struct nfs41_exchange_id_args args = {
 		.verifier = &verifier,
 		.client = clp,
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID |
+			 EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			EXCHGID4_FLAG_BIND_PRINC_STATEID,
+			 EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
 	};
 	struct nfs41_exchange_id_res res = {
 		0
@@ -7564,8 +7921,10 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 {
 	int err;
 	struct page *page;
-	rpc_authflavor_t flavor;
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
 	struct nfs4_secinfo_flavors *flavors;
+	struct nfs4_secinfo4 *secinfo;
+	int i;
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page) {
@@ -7587,9 +7946,34 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	if (err)
 		goto out_freepage;
 
-	flavor = nfs_find_best_sec(flavors);
-	if (err == 0)
-		err = nfs4_lookup_root_sec(server, fhandle, info, flavor);
+	for (i = 0; i < flavors->num_flavors; i++) {
+		secinfo = &flavors->flavors[i];
+
+		switch (secinfo->flavor) {
+		case RPC_AUTH_NULL:
+		case RPC_AUTH_UNIX:
+		case RPC_AUTH_GSS:
+			flavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+					&secinfo->flavor_info);
+			break;
+		default:
+			flavor = RPC_AUTH_MAXFLAVOR;
+			break;
+		}
+
+		if (!nfs_auth_info_match(&server->auth_info, flavor))
+			flavor = RPC_AUTH_MAXFLAVOR;
+
+		if (flavor != RPC_AUTH_MAXFLAVOR) {
+			err = nfs4_lookup_root_sec(server, fhandle,
+						   info, flavor);
+			if (!err)
+				break;
+		}
+	}
+
+	if (flavor == RPC_AUTH_MAXFLAVOR)
+		err = -EPERM;
 
 out_freepage:
 	put_page(page);
@@ -7851,6 +8235,18 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+	.get_locations = _nfs40_proc_get_locations,
+	.fsid_present = _nfs40_proc_fsid_present,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+	.get_locations = _nfs41_proc_get_locations,
+	.fsid_present = _nfs41_proc_fsid_present,
+};
+#endif	/* CONFIG_NFS_V4_1 */
+
 static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.init_caps = NFS_CAP_READDIRPLUS
@@ -7866,6 +8262,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
 	.state_renewal_ops = &nfs40_state_renewal_ops,
+	.mig_recovery_ops = &nfs40_mig_recovery_ops,
 };
 
 #if defined(CONFIG_NFS_V4_1)
@@ -7886,6 +8283,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
+	.mig_recovery_ops = &nfs41_mig_recovery_ops,
 };
 #endif
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cc14cbb78b73..c8e729deb4f7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -239,8 +239,6 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	}
 }
 
-#if defined(CONFIG_NFS_V4_1)
-
 static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
 {
 	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
@@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	return nfs4_drain_slot_tbl(&ses->fc_slot_table);
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
 static int nfs41_setup_state_renewal(struct nfs_client *clp)
 {
 	int status;
@@ -1197,20 +1197,74 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
 
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned.
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+		pr_err("NFS: volatile file handles not supported (server %s)\n",
+				clp->cl_hostname);
+		return -NFS4ERR_IO;
+	}
+
+	if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+		return -NFS4ERR_IO;
+
+	dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+			__func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	set_bit(NFS_MIG_IN_TRANSITION,
+			&((struct nfs_server *)server)->mig_status);
+	set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+	nfs4_schedule_state_manager(clp);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+	dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+		__func__, clp->cl_clientid, clp->cl_hostname);
+
+	set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
 int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
 	int res;
 
 	might_sleep();
 
+	atomic_inc(&clp->cl_count);
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
 			nfs_wait_bit_killable, TASK_KILLABLE);
 	if (res)
-		return res;
-
+		goto out;
 	if (clp->cl_cons_state < 0)
-		return clp->cl_cons_state;
-	return 0;
+		res = clp->cl_cons_state;
+out:
+	nfs_put_client(clp);
+	return res;
 }
 
 int nfs4_client_recover_expired_lease(struct nfs_client *clp)
@@ -1375,8 +1429,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 				goto out;
 			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d. "
-					"Zeroing state\n", __func__, status);
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					 __func__, status);
 			case -ENOMEM:
 			case -NFS4ERR_DENIED:
 			case -NFS4ERR_RECLAIM_BAD:
@@ -1422,7 +1476,7 @@ restart:
 		if (status >= 0) {
 			status = nfs4_reclaim_locks(state, ops);
 			if (status >= 0) {
-				if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) {
+				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
 					spin_lock(&state->state_lock);
 					list_for_each_entry(lock, &state->lock_states, ls_locks) {
 						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
@@ -1439,15 +1493,12 @@ restart:
 		}
 		switch (status) {
 			default:
-				printk(KERN_ERR "NFS: %s: unhandled error %d. "
-					"Zeroing state\n", __func__, status);
+				printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+					__func__, status);
 			case -ENOENT:
 			case -ENOMEM:
 			case -ESTALE:
-				/*
-				 * Open state on this file cannot be recovered
-				 * All we can do is revert to using the zero stateid.
-				 */
+				/* Open state on this file cannot be recovered */
 				nfs4_state_mark_recovery_failed(state, status);
 				break;
 			case -EAGAIN:
@@ -1628,7 +1679,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			nfs4_state_end_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_LEASE_MOVED:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
@@ -1829,6 +1879,168 @@ static int nfs4_purge_lease(struct nfs_client *clp)
 	return 0;
 }
 
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server.  The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_fs_locations *locations = NULL;
+	struct inode *inode;
+	struct page *page;
+	int status, result;
+
+	dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+			(unsigned long long)server->fsid.major,
+			(unsigned long long)server->fsid.minor,
+			clp->cl_hostname);
+
+	result = 0;
+	page = alloc_page(GFP_KERNEL);
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (page == NULL || locations == NULL) {
+		dprintk("<-- %s: no memory\n", __func__);
+		goto out;
+	}
+
+	inode = server->super->s_root->d_inode;
+	result = nfs4_proc_get_locations(inode, locations, page, cred);
+	if (result) {
+		dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+			__func__, result);
+		goto out;
+	}
+
+	result = -NFS4ERR_NXIO;
+	if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+		dprintk("<-- %s: No fs_locations data, migration skipped\n",
+			__func__);
+		goto out;
+	}
+
+	nfs4_begin_drain_session(clp);
+
+	status = nfs4_replace_transport(server, locations);
+	if (status != 0) {
+		dprintk("<-- %s: failed to replace transport: %d\n",
+			__func__, status);
+		goto out;
+	}
+
+	result = 0;
+	dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+	if (page != NULL)
+		__free_page(page);
+	kfree(locations);
+	if (result) {
+		pr_err("NFS: migration recovery failed (server %s)\n",
+				clp->cl_hostname);
+		set_bit(NFS_MIG_FAILED, &server->mig_status);
+	}
+	return result;
+}
+
+/*
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: migration reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+						&server->mig_status))
+			continue;
+
+		rcu_read_unlock();
+		status = nfs4_try_migration(server, cred);
+		if (status < 0) {
+			put_rpccred(cred);
+			return status;
+		}
+		goto restart;
+	}
+	rcu_read_unlock();
+	put_rpccred(cred);
+	return 0;
+}
+
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server.  Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+	const struct nfs4_state_maintenance_ops *ops =
+				clp->cl_mvops->state_renewal_ops;
+	struct nfs_server *server;
+	struct rpc_cred *cred;
+
+	dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+			clp->cl_hostname);
+
+	spin_lock(&clp->cl_lock);
+	cred = ops->get_state_renewal_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	if (cred == NULL)
+		return -NFS4ERR_NOENT;
+
+	clp->cl_mig_gen++;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		struct inode *inode;
+		int status;
+
+		if (server->mig_gen == clp->cl_mig_gen)
+			continue;
+		server->mig_gen = clp->cl_mig_gen;
+
+		rcu_read_unlock();
+
+		inode = server->super->s_root->d_inode;
+		status = nfs4_proc_fsid_present(inode, cred);
+		if (status != -NFS4ERR_MOVED)
+			goto restart;	/* wasn't this one */
+		if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+			goto restart;	/* there are more */
+		goto out;
+	}
+	rcu_read_unlock();
+
+out:
+	put_rpccred(cred);
+	return 0;
+}
+
 /**
  * nfs4_discover_server_trunking - Detect server IP address trunking
  *
@@ -2017,9 +2229,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 		nfs41_handle_server_reboot(clp);
 	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
 			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
-			    SEQ4_STATUS_ADMIN_STATE_REVOKED |
-			    SEQ4_STATUS_LEASE_MOVED))
+			    SEQ4_STATUS_ADMIN_STATE_REVOKED))
 		nfs41_handle_state_revoked(clp);
+	if (flags & SEQ4_STATUS_LEASE_MOVED)
+		nfs4_schedule_lease_moved_recovery(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
 	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
@@ -2157,7 +2370,20 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_check_lease(clp);
 			if (status < 0)
 				goto out_error;
-			continue;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+			section = "migration";
+			status = nfs4_handle_migration(clp);
+			if (status < 0)
+				goto out_error;
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+			section = "lease moved";
+			status = nfs4_handle_lease_moved(clp);
+			if (status < 0)
+				goto out_error;
 		}
 
 		/* First recover reboot state... */
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index e26acdd1a645..65ab0a0ca1c4 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -261,9 +261,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 
 	res = nfs_follow_remote_path(root_mnt, export_path);
 
-	dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
-			IS_ERR(res) ? PTR_ERR(res) : 0,
-			IS_ERR(res) ? " [error]" : "");
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n",
+		 PTR_ERR_OR_ZERO(res),
+		 IS_ERR(res) ? " [error]" : "");
 	return res;
 }
 
@@ -319,9 +319,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
 	data->mnt_path = export_path;
 
 	res = nfs_follow_remote_path(root_mnt, export_path);
-	dprintk("<-- nfs4_referral_mount() = %ld%s\n",
-			IS_ERR(res) ? PTR_ERR(res) : 0,
-			IS_ERR(res) ? " [error]" : "");
+	dprintk("<-- nfs4_referral_mount() = %d%s\n",
+		PTR_ERR_OR_ZERO(res),
+		IS_ERR(res) ? " [error]" : "");
 	return res;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 79210d23f607..5be2868c02f1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int);
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
 #define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
-#define encode_readdir_space 24
-#define encode_readdir_bitmask_sz 3
 #else
 #define	nfs4_label_maxsz	0
-#define encode_readdir_space 20
-#define encode_readdir_bitmask_sz 2
 #endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
@@ -595,11 +591,13 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_getattr_maxsz)
+				encode_getattr_maxsz + \
+				encode_renew_maxsz)
 #define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_getattr_maxsz)
+				decode_getattr_maxsz + \
+				decode_renew_maxsz)
 #define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -736,13 +734,15 @@ static int nfs4_stat_to_errno(int);
 				 encode_sequence_maxsz + \
 				 encode_putfh_maxsz + \
 				 encode_lookup_maxsz + \
-				 encode_fs_locations_maxsz)
+				 encode_fs_locations_maxsz + \
+				 encode_renew_maxsz)
 #define NFS4_dec_fs_locations_sz \
 				(compound_decode_hdr_maxsz + \
 				 decode_sequence_maxsz + \
 				 decode_putfh_maxsz + \
 				 decode_lookup_maxsz + \
-				 decode_fs_locations_maxsz)
+				 decode_fs_locations_maxsz + \
+				 decode_renew_maxsz)
 #define NFS4_enc_secinfo_sz 	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -751,6 +751,18 @@ static int nfs4_stat_to_errno(int);
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_sequence_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getfh_maxsz + \
+				 encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_sequence_maxsz + \
+				 decode_putfh_maxsz + \
+				 decode_getfh_maxsz + \
+				 decode_renew_maxsz)
 #if defined(CONFIG_NFS_V4_1)
 #define NFS4_enc_bind_conn_to_session_sz \
 				(compound_encode_hdr_maxsz + \
@@ -1565,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	};
 	uint32_t dircount = readdir->count >> 1;
 	__be32 *p, verf[2];
+	uint32_t attrlen = 0;
+	unsigned int i;
 
 	if (readdir->plus) {
 		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1573,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
 			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
 			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
 		dircount >>= 1;
 	}
 	/* Use mounted_on_fileid only if the server supports it */
 	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
 		attrs[0] |= FATTR4_WORD0_FILEID;
+	for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+		attrs[i] &= readdir->bitmask[i];
+		if (attrs[i] != 0)
+			attrlen = i+1;
+	}
 
 	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
 	encode_uint64(xdr, readdir->cookie);
 	encode_nfs4_verifier(xdr, &readdir->verifier);
-	p = reserve_space(xdr, encode_readdir_space);
+	p = reserve_space(xdr, 12 + (attrlen << 2));
 	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
-	*p++ = cpu_to_be32(encode_readdir_bitmask_sz);
-	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p   = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
-	if (encode_readdir_bitmask_sz > 2) {
-		if (hdr->minorversion > 1)
-			attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
-		p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
-	}
+	*p++ = cpu_to_be32(attrlen);
+	for (i = 0; i < attrlen; i++)
+		*p++ = cpu_to_be32(attrs[i]);
 	memcpy(verf, readdir->verifier.data, sizeof(verf));
 
 	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
@@ -2687,11 +2702,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
-	encode_putfh(xdr, args->dir_fh, &hdr);
-	encode_lookup(xdr, args->name, &hdr);
-	replen = hdr.replen;	/* get the attribute into args->page */
-	encode_fs_locations(xdr, args->bitmask, &hdr);
+	if (args->migration) {
+		encode_putfh(xdr, args->fh, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+		if (args->renew)
+			encode_renew(xdr, args->clientid, &hdr);
+	} else {
+		encode_putfh(xdr, args->dir_fh, &hdr);
+		encode_lookup(xdr, args->name, &hdr);
+		replen = hdr.replen;
+		encode_fs_locations(xdr, args->bitmask, &hdr);
+	}
 
+	/* Set up reply kvec to capture returned fs_locations array. */
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
 			0, PAGE_SIZE);
 	encode_nops(&hdr);
@@ -2715,6 +2739,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode FSID_PRESENT request
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_fsid_present_arg *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_getfh(xdr, &hdr);
+	if (args->renew)
+		encode_renew(xdr, args->clientid, &hdr);
+	encode_nops(&hdr);
+}
+
 #if defined(CONFIG_NFS_V4_1)
 /*
  * BIND_CONN_TO_SESSION request
@@ -6824,13 +6868,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_lookup(xdr);
-	if (status)
-		goto out;
-	xdr_enter_page(xdr, PAGE_SIZE);
-	status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
+	if (res->migration) {
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
+					 NULL, res->fs_locations,
+					 NULL, res->fs_locations->server);
+		if (status)
+			goto out;
+		if (res->renew)
+			status = decode_renew(xdr);
+	} else {
+		status = decode_lookup(xdr);
+		if (status)
+			goto out;
+		xdr_enter_page(xdr, PAGE_SIZE);
+		status = decode_getfattr_generic(xdr,
+					&res->fs_locations->fattr,
 					 NULL, res->fs_locations,
 					 NULL, res->fs_locations->server);
+	}
 out:
 	return status;
 }
@@ -6859,6 +6916,34 @@ out:
 	return status;
 }
 
+/*
+ * Decode FSID_PRESENT response
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_fsid_present_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_getfh(xdr, res->fh);
+	if (status)
+		goto out;
+	if (res->renew)
+		status = decode_renew(xdr);
+out:
+	return status;
+}
+
 #if defined(CONFIG_NFS_V4_1)
 /*
  * Decode BIND_CONN_TO_SESSION response
@@ -7373,6 +7458,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),
 	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),
 	PROC(SECINFO,		enc_secinfo,		dec_secinfo),
+	PROC(FSID_PRESENT,	enc_fsid_present,	dec_fsid_present),
 #if defined(CONFIG_NFS_V4_1)
 	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
 	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a03b9c6f9489..317d6fc2160e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
 	static const struct {
 		rpc_authflavor_t flavour;
 		const char *str;
-	} sec_flavours[] = {
+	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
 		{ RPC_AUTH_NULL, "null" },
 		{ RPC_AUTH_UNIX, "sys" },
 		{ RPC_AUTH_GSS_KRB5, "krb5" },
@@ -923,8 +924,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 		data->mount_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-		data->auth_flavors[0]	= RPC_AUTH_MAXFLAVOR;
-		data->auth_flavor_len	= 0;
+		data->selected_flavor	= RPC_AUTH_MAXFLAVOR;
 		data->minorversion	= 0;
 		data->need_mount	= true;
 		data->net		= current->nsproxy->net_ns;
@@ -1019,12 +1019,51 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 	}
 }
 
-static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data,
-		rpc_authflavor_t pseudoflavor)
+/*
+ * Add 'flavor' to 'auth_info' if not already present.
+ * Returns true if 'flavor' ends up in the list, false otherwise
+ */
+static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
+			      rpc_authflavor_t flavor)
+{
+	unsigned int i;
+	unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
+				       sizeof(auth_info->flavors[0]));
+
+	/* make sure this flavor isn't already in the list */
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (flavor == auth_info->flavors[i])
+			return true;
+	}
+
+	if (auth_info->flavor_len + 1 >= max_flavor_len) {
+		dfprintk(MOUNT, "NFS: too many sec= flavors\n");
+		return false;
+	}
+
+	auth_info->flavors[auth_info->flavor_len++] = flavor;
+	return true;
+}
+
+/*
+ * Return true if 'match' is in auth_info or auth_info is empty.
+ * Return false otherwise.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+			 rpc_authflavor_t match)
 {
-	data->auth_flavors[0] = pseudoflavor;
-	data->auth_flavor_len = 1;
+	int i;
+
+	if (!auth_info->flavor_len)
+		return true;
+
+	for (i = 0; i < auth_info->flavor_len; i++) {
+		if (auth_info->flavors[i] == match)
+			return true;
+	}
+	return false;
 }
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
 
 /*
  * Parse the value of the 'sec=' option.
@@ -1034,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value,
 {
 	substring_t args[MAX_OPT_ARGS];
 	rpc_authflavor_t pseudoflavor;
+	char *p;
 
 	dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
 
-	switch (match_token(value, nfs_secflavor_tokens, args)) {
-	case Opt_sec_none:
-		pseudoflavor = RPC_AUTH_NULL;
-		break;
-	case Opt_sec_sys:
-		pseudoflavor = RPC_AUTH_UNIX;
-		break;
-	case Opt_sec_krb5:
-		pseudoflavor = RPC_AUTH_GSS_KRB5;
-		break;
-	case Opt_sec_krb5i:
-		pseudoflavor = RPC_AUTH_GSS_KRB5I;
-		break;
-	case Opt_sec_krb5p:
-		pseudoflavor = RPC_AUTH_GSS_KRB5P;
-		break;
-	case Opt_sec_lkey:
-		pseudoflavor = RPC_AUTH_GSS_LKEY;
-		break;
-	case Opt_sec_lkeyi:
-		pseudoflavor = RPC_AUTH_GSS_LKEYI;
-		break;
-	case Opt_sec_lkeyp:
-		pseudoflavor = RPC_AUTH_GSS_LKEYP;
-		break;
-	case Opt_sec_spkm:
-		pseudoflavor = RPC_AUTH_GSS_SPKM;
-		break;
-	case Opt_sec_spkmi:
-		pseudoflavor = RPC_AUTH_GSS_SPKMI;
-		break;
-	case Opt_sec_spkmp:
-		pseudoflavor = RPC_AUTH_GSS_SPKMP;
-		break;
-	default:
-		return 0;
+	while ((p = strsep(&value, ":")) != NULL) {
+		switch (match_token(p, nfs_secflavor_tokens, args)) {
+		case Opt_sec_none:
+			pseudoflavor = RPC_AUTH_NULL;
+			break;
+		case Opt_sec_sys:
+			pseudoflavor = RPC_AUTH_UNIX;
+			break;
+		case Opt_sec_krb5:
+			pseudoflavor = RPC_AUTH_GSS_KRB5;
+			break;
+		case Opt_sec_krb5i:
+			pseudoflavor = RPC_AUTH_GSS_KRB5I;
+			break;
+		case Opt_sec_krb5p:
+			pseudoflavor = RPC_AUTH_GSS_KRB5P;
+			break;
+		case Opt_sec_lkey:
+			pseudoflavor = RPC_AUTH_GSS_LKEY;
+			break;
+		case Opt_sec_lkeyi:
+			pseudoflavor = RPC_AUTH_GSS_LKEYI;
+			break;
+		case Opt_sec_lkeyp:
+			pseudoflavor = RPC_AUTH_GSS_LKEYP;
+			break;
+		case Opt_sec_spkm:
+			pseudoflavor = RPC_AUTH_GSS_SPKM;
+			break;
+		case Opt_sec_spkmi:
+			pseudoflavor = RPC_AUTH_GSS_SPKMI;
+			break;
+		case Opt_sec_spkmp:
+			pseudoflavor = RPC_AUTH_GSS_SPKMP;
+			break;
+		default:
+			dfprintk(MOUNT,
+				 "NFS: sec= option '%s' not recognized\n", p);
+			return 0;
+		}
+
+		if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor))
+			return 0;
 	}
 
-	mnt->flags |= NFS_MOUNT_SECFLAVOUR;
-	nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);
 	return 1;
 }
 
@@ -1623,12 +1668,14 @@ out_security_failure:
 }
 
 /*
- * Ensure that the specified authtype in args->auth_flavors[0] is supported by
- * the server. Returns 0 if it's ok, and -EACCES if not.
+ * Ensure that a specified authtype in args->auth_info is supported by
+ * the server. Returns 0 and sets args->selected_flavor if it's ok, and
+ * -EACCES if not.
  */
-static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 			rpc_authflavor_t *server_authlist, unsigned int count)
 {
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
 	unsigned int i;
 
 	/*
@@ -1640,17 +1687,20 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
 	 * can be used.
 	 */
 	for (i = 0; i < count; i++) {
-		if (args->auth_flavors[0] == server_authlist[i] ||
-		    server_authlist[i] == RPC_AUTH_NULL)
+		flavor = server_authlist[i];
+
+		if (nfs_auth_info_match(&args->auth_info, flavor) ||
+		    flavor == RPC_AUTH_NULL)
 			goto out;
 	}
 
-	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
-		args->auth_flavors[0]);
+	dfprintk(MOUNT,
+		 "NFS: specified auth flavors not supported by server\n");
 	return -EACCES;
 
 out:
-	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+	args->selected_flavor = flavor;
+	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
 	return 0;
 }
 
@@ -1738,9 +1788,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 	 * Was a sec= authflavor specified in the options? First, verify
 	 * whether the server supports it, and then just try to use it if so.
 	 */
-	if (args->auth_flavor_len > 0) {
-		status = nfs_verify_authflavor(args, authlist, authlist_len);
-		dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+	if (args->auth_info.flavor_len > 0) {
+		status = nfs_verify_authflavors(args, authlist, authlist_len);
+		dfprintk(MOUNT, "NFS: using auth flavor %u\n",
+			 args->selected_flavor);
 		if (status)
 			return ERR_PTR(status);
 		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
@@ -1769,7 +1820,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 			/* Fallthrough */
 		}
 		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
-		nfs_set_auth_parsed_mount_data(args, flavor);
+		args->selected_flavor = flavor;
 		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 		if (!IS_ERR(server))
 			return server;
@@ -1785,7 +1836,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
 
 	/* Last chance! Try AUTH_UNIX */
 	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
-	nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+	args->selected_flavor = RPC_AUTH_UNIX;
 	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 }
 
@@ -1972,9 +2023,9 @@ static int nfs23_validate_mount_data(void *options,
 		args->bsize		= data->bsize;
 
 		if (data->flags & NFS_MOUNT_SECFLAVOUR)
-			nfs_set_auth_parsed_mount_data(args, data->pseudoflavor);
+			args->selected_flavor = data->pseudoflavor;
 		else
-			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+			args->selected_flavor = RPC_AUTH_UNIX;
 		if (!args->nfs_server.hostname)
 			goto out_nomem;
 
@@ -2108,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options,
 
 	nfs_set_port(sap, &args->nfs_server.port, port);
 
-	if (args->auth_flavor_len > 1)
-		goto out_bad_auth;
-
 	return nfs_parse_devname(dev_name,
 				   &args->nfs_server.hostname,
 				   max_namelen,
@@ -2130,10 +2178,6 @@ out_invalid_transport_udp:
 out_no_address:
 	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
 	return -EINVAL;
-
-out_bad_auth:
-	dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n");
-	return -EINVAL;
 }
 
 static int
@@ -2143,8 +2187,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	if (data->flags != nfss->flags ||
 	    data->rsize != nfss->rsize ||
 	    data->wsize != nfss->wsize ||
+	    data->version != nfss->nfs_client->rpc_ops->version ||
+	    data->minorversion != nfss->nfs_client->cl_minorversion ||
 	    data->retrans != nfss->client->cl_timeout->to_retries ||
-	    data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+	    data->selected_flavor != nfss->client->cl_auth->au_flavor ||
 	    data->acregmin != nfss->acregmin / HZ ||
 	    data->acregmax != nfss->acregmax / HZ ||
 	    data->acdirmin != nfss->acdirmin / HZ ||
@@ -2189,7 +2235,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->rsize = nfss->rsize;
 	data->wsize = nfss->wsize;
 	data->retrans = nfss->client->cl_timeout->to_retries;
-	nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor);
+	data->selected_flavor = nfss->client->cl_auth->au_flavor;
+	data->auth_info = nfss->auth_info;
 	data->acregmin = nfss->acregmin / HZ;
 	data->acregmax = nfss->acregmax / HZ;
 	data->acdirmin = nfss->acdirmin / HZ;
@@ -2197,12 +2244,14 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
 	data->nfs_server.port = nfss->port;
 	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+	data->version = nfsvers;
+	data->minorversion = nfss->nfs_client->cl_minorversion;
 	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
 		data->nfs_server.addrlen);
 
 	/* overwrite those values with any that were specified */
-	error = nfs_parse_mount_options((char *)options, data);
-	if (error < 0)
+	error = -EINVAL;
+	if (!nfs_parse_mount_options((char *)options, data))
 		goto out;
 
 	/*
@@ -2332,7 +2381,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
-	if (b->flags & NFS_MOUNT_SECFLAVOUR &&
+	if (b->auth_info.flavor_len > 0 &&
 	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
@@ -2530,6 +2579,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 			mntroot = ERR_PTR(error);
 			goto error_splat_bdi;
 		}
+		server->super = s;
 	}
 
 	if (!s->s_root) {
@@ -2713,9 +2763,9 @@ static int nfs4_validate_mount_data(void *options,
 					   data->auth_flavours,
 					   sizeof(pseudoflavor)))
 				return -EFAULT;
-			nfs_set_auth_parsed_mount_data(args, pseudoflavor);
+			args->selected_flavor = pseudoflavor;
 		} else
-			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+			args->selected_flavor = RPC_AUTH_UNIX;
 
 		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
 		if (IS_ERR(c))
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index bb939edd4c99..0c29b1bb3936 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -493,7 +493,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	unsigned long long fileid;
 	struct dentry *sdentry;
 	struct rpc_task *task;
-	int            error = -EIO;
+	int            error = -EBUSY;
 
 	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -503,7 +503,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	/*
 	 * We don't allow a dentry to be silly-renamed twice.
 	 */
-	error = -EBUSY;
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto out;
 
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0ba679866e50..da276640f776 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -94,6 +94,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 	clear_buffer_nilfs_volatile(bh);
 	clear_buffer_nilfs_checked(bh);
 	clear_buffer_nilfs_redirected(bh);
+	clear_buffer_async_write(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -429,6 +430,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
 					"discard block %llu, size %zu",
 					(u64)bh->b_blocknr, bh->b_size);
 			}
+			clear_buffer_async_write(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
 			clear_buffer_nilfs_checked(bh);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index bd88a7461063..9f6b486b6c01 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -665,7 +665,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
 
 		bh = head = page_buffers(page);
 		do {
-			if (!buffer_dirty(bh))
+			if (!buffer_dirty(bh) || buffer_async_write(bh))
 				continue;
 			get_bh(bh);
 			list_add_tail(&bh->b_assoc_buffers, listp);
@@ -699,7 +699,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			bh = head = page_buffers(pvec.pages[i]);
 			do {
-				if (buffer_dirty(bh)) {
+				if (buffer_dirty(bh) &&
+						!buffer_async_write(bh)) {
 					get_bh(bh);
 					list_add_tail(&bh->b_assoc_buffers,
 						      listp);
@@ -1579,6 +1580,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
 
 		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
 				    b_assoc_buffers) {
+			set_buffer_async_write(bh);
 			if (bh->b_page != bd_page) {
 				if (bd_page) {
 					lock_page(bd_page);
@@ -1592,6 +1594,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
+			set_buffer_async_write(bh);
 			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					lock_page(bd_page);
@@ -1677,6 +1680,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
 	list_for_each_entry(segbuf, logs, sb_list) {
 		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
 				    b_assoc_buffers) {
+			clear_buffer_async_write(bh);
 			if (bh->b_page != bd_page) {
 				if (bd_page)
 					end_page_writeback(bd_page);
@@ -1686,6 +1690,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
+			clear_buffer_async_write(bh);
 			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
@@ -1755,6 +1760,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 				    b_assoc_buffers) {
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
+			clear_buffer_async_write(bh);
 			if (bh->b_page != bd_page) {
 				if (bd_page)
 					end_page_writeback(bd_page);
@@ -1776,6 +1782,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 				    b_assoc_buffers) {
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
+			clear_buffer_async_write(bh);
 			clear_buffer_delay(bh);
 			clear_buffer_nilfs_volatile(bh);
 			clear_buffer_nilfs_redirected(bh);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index ef999729e274..0d3a97d2d5f6 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -70,9 +70,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 	 */
 	if (inode == NULL) {
 		unsigned long gen = (unsigned long) dentry->d_fsdata;
-		unsigned long pgen =
-			OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
-
+		unsigned long pgen;
+		spin_lock(&dentry->d_lock);
+		pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+		spin_unlock(&dentry->d_lock);
 		trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
 						       dentry->d_name.name,
 						       pgen, gen);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 121da2dc3be8..d4e81e4a9b04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1924,7 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
 	int tmp, hangup_needed = 0;
 	struct ocfs2_super *osb = NULL;
-	char nodestr[8];
+	char nodestr[12];
 
 	trace_ocfs2_dismount_volume(sb);
 
diff --git a/fs/open.c b/fs/open.c
index 2a731b0d08bc..d420331ca32a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -744,14 +744,24 @@ cleanup_file:
 
 /**
  * finish_open - finish opening a file
- * @od: opaque open data
+ * @file: file pointer
  * @dentry: pointer to dentry
  * @open: open callback
+ * @opened: state of open
  *
  * This can be used to finish opening a file passed to i_op->atomic_open().
  *
  * If the open callback is set to NULL, then the standard f_op->open()
  * filesystem callback is substituted.
+ *
+ * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
+ * the return value of d_splice_alias(), then the caller needs to perform dput()
+ * on it after finish_open().
+ *
+ * On successful return @file is a fully instantiated open file.  After this, if
+ * an error occurs in ->atomic_open(), it needs to clean up with fput().
+ *
+ * Returns zero on success or -errno if the open failed.
  */
 int finish_open(struct file *file, struct dentry *dentry,
 		int (*open)(struct inode *, struct file *),
@@ -772,11 +782,16 @@ EXPORT_SYMBOL(finish_open);
 /**
  * finish_no_open - finish ->atomic_open() without opening the file
  *
- * @od: opaque open data
+ * @file: file pointer
  * @dentry: dentry or NULL (as returned from ->lookup())
  *
  * This can be used to set the result of a successful lookup in ->atomic_open().
- * The filesystem's atomic_open() method shall return NULL after calling this.
+ *
+ * NB: unlike finish_open() this function does consume the dentry reference and
+ * the caller need not dput() it.
+ *
+ * Returns "1" which must be the return value of ->atomic_open() after having
+ * called this function.
  */
 int finish_no_open(struct file *file, struct dentry *dentry)
 {
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cbd0f1b324b9..1bd2077187fd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
+		"Ngid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
 		"TracerPid:\t%d\n"
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
 		task_tgid_nr_ns(p, ns),
+		task_numa_group_id(p),
 		pid_nr_ns(pid, ns),
 		ppid, tpid,
 		from_kuid_munged(user_ns, cred->uid),
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9f8ef9b7674d..8eaa1ba793fc 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -288,10 +288,14 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
-	int rv = -EIO;
-	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+	unsigned long rv = -EIO;
+	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL;
 	if (use_pde(pde)) {
-		get_unmapped_area = pde->proc_fops->get_unmapped_area;
+#ifdef CONFIG_MMU
+		get_unmapped_area = current->mm->get_unmapped_area;
+#endif
+		if (pde->proc_fops->get_unmapped_area)
+			get_unmapped_area = pde->proc_fops->get_unmapped_area;
 		if (get_unmapped_area)
 			rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
 		unuse_pde(pde);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7366e9d63cee..390bdab01c3c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		frame = pte_pfn(pte);
 		flags = PM_PRESENT;
 		page = vm_normal_page(vma, addr, pte);
+		if (pte_soft_dirty(pte))
+			flags2 |= __PM_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
 		swp_entry_t entry;
 		if (pte_swp_soft_dirty(pte))
@@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
+	if ((vma->vm_flags & VM_SOFTDIRTY))
 		flags2 |= __PM_SOFT_DIRTY;
 
 	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 4ffb7ab5e397..b8e93a40a5d3 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -168,7 +168,7 @@ static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
 	int err, ret;
 
 	ret = -EIO;
-	err = zlib_inflateInit(&stream);
+	err = zlib_inflateInit2(&stream, WINDOW_BITS);
 	if (err != Z_OK)
 		goto error;
 
@@ -195,8 +195,29 @@ error:
 static void allocate_buf_for_compression(void)
 {
 	size_t size;
+	size_t cmpr;
+
+	switch (psinfo->bufsize) {
+	/* buffer range for efivars */
+	case 1000 ... 2000:
+		cmpr = 56;
+		break;
+	case 2001 ... 3000:
+		cmpr = 54;
+		break;
+	case 3001 ... 3999:
+		cmpr = 52;
+		break;
+	/* buffer range for nvram, erst */
+	case 4000 ... 10000:
+		cmpr = 45;
+		break;
+	default:
+		cmpr = 60;
+		break;
+	}
 
-	big_oops_buf_sz = (psinfo->bufsize * 100) / 45;
+	big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr;
 	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
 	if (big_oops_buf) {
 		size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL),
@@ -295,10 +316,6 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 				compressed = true;
 				total_len = zipped_len;
 			} else {
-				pr_err("pstore: compression failed for Part %d"
-					" returned %d\n", part, zipped_len);
-				pr_err("pstore: Capture uncompressed"
-					" oops/panic report of Part %d\n", part);
 				compressed = false;
 				total_len = copy_kmsg_to_buffer(hsize, len);
 			}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 73feacc49b2e..fd777032c2ba 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1163,21 +1163,6 @@ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
 	return NULL;
 }
 
-static int newer_jl_done(struct reiserfs_journal_cnode *cn)
-{
-	struct super_block *sb = cn->sb;
-	b_blocknr_t blocknr = cn->blocknr;
-
-	cn = cn->hprev;
-	while (cn) {
-		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist &&
-		    atomic_read(&cn->jlist->j_commit_left) != 0)
-				    return 0;
-		cn = cn->hprev;
-	}
-	return 1;
-}
-
 static void remove_journal_hash(struct super_block *,
 				struct reiserfs_journal_cnode **,
 				struct reiserfs_journal_list *, unsigned long,
@@ -1353,7 +1338,6 @@ static int flush_journal_list(struct super_block *s,
 		reiserfs_warning(s, "clm-2048", "called with wcount %d",
 				 atomic_read(&journal->j_wcount));
 	}
-	BUG_ON(jl->j_trans_id == 0);
 
 	/* if flushall == 0, the lock is already held */
 	if (flushall) {
@@ -1593,31 +1577,6 @@ static int flush_journal_list(struct super_block *s,
 	return err;
 }
 
-static int test_transaction(struct super_block *s,
-                            struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal_cnode *cn;
-
-	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
-		return 1;
-
-	cn = jl->j_realblock;
-	while (cn) {
-		/* if the blocknr == 0, this has been cleared from the hash,
-		 ** skip it
-		 */
-		if (cn->blocknr == 0) {
-			goto next;
-		}
-		if (cn->bh && !newer_jl_done(cn))
-			return 0;
-	      next:
-		cn = cn->next;
-		cond_resched();
-	}
-	return 0;
-}
-
 static int write_one_transaction(struct super_block *s,
 				 struct reiserfs_journal_list *jl,
 				 struct buffer_chunk *chunk)
@@ -1805,6 +1764,8 @@ static int flush_used_journal_lists(struct super_block *s,
 			break;
 		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
 	}
+	get_journal_list(jl);
+	get_journal_list(flush_jl);
 	/* try to find a group of blocks we can flush across all the
 	 ** transactions, but only bother if we've actually spanned
 	 ** across multiple lists
@@ -1813,6 +1774,8 @@ static int flush_used_journal_lists(struct super_block *s,
 		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
 	}
 	flush_journal_list(s, flush_jl, 1);
+	put_journal_list(s, flush_jl);
+	put_journal_list(s, jl);
 	return 0;
 }
 
@@ -3868,27 +3831,6 @@ int reiserfs_prepare_for_journal(struct super_block *sb,
 	return 1;
 }
 
-static void flush_old_journal_lists(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	struct reiserfs_journal_list *jl;
-	struct list_head *entry;
-	time_t now = get_seconds();
-
-	while (!list_empty(&journal->j_journal_list)) {
-		entry = journal->j_journal_list.next;
-		jl = JOURNAL_LIST_ENTRY(entry);
-		/* this check should always be run, to send old lists to disk */
-		if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) &&
-		    atomic_read(&jl->j_commit_left) == 0 &&
-		    test_transaction(s, jl)) {
-			flush_used_journal_lists(s, jl);
-		} else {
-			break;
-		}
-	}
-}
-
 /*
 ** long and ugly.  If flush, will not return until all commit
 ** blocks and all real buffers in the trans are on disk.
@@ -4232,7 +4174,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 			}
 		}
 	}
-	flush_old_journal_lists(sb);
 
 	journal->j_current_jl->j_list_bitmap =
 	    get_list_bitmap(sb, journal->j_current_jl);
diff --git a/fs/select.c b/fs/select.c
index 35d4adc749d9..dfd5cb18c012 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -238,8 +238,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	set_current_state(state);
 	if (!pwq->triggered)
-		rc = freezable_schedule_hrtimeout_range(expires, slack,
-							HRTIMER_MODE_ABS);
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
 	__set_current_state(TASK_RUNNING);
 
 	/*
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3135c2525c76..a290157265ef 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -328,6 +328,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
 				m->read_pos = offset;
 				retval = file->f_pos = offset;
 			}
+		} else {
+			file->f_pos = offset;
 		}
 	}
 	file->f_version = m->version;
diff --git a/fs/statfs.c b/fs/statfs.c
index c219e733f553..083dc0ac9140 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -94,7 +94,7 @@ retry:
 
 int fd_statfs(int fd, struct kstatfs *st)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_raw(fd);
 	int error = -EBADF;
 	if (f.file) {
 		error = vfs_statfs(&f.file->f_path, st);
diff --git a/fs/super.c b/fs/super.c
index 3a96c9783a8b..0225c20f8770 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -264,6 +264,8 @@ out_free_sb:
  */
 static inline void destroy_super(struct super_block *s)
 {
+	list_lru_destroy(&s->s_dentry_lru);
+	list_lru_destroy(&s->s_inode_lru);
 #ifdef CONFIG_SMP
 	free_percpu(s->s_files);
 #endif
@@ -323,8 +325,6 @@ void deactivate_locked_super(struct super_block *s)
 
 		/* caches are now gone, we can safely kill the shrinker now */
 		unregister_shrinker(&s->s_shrink);
-		list_lru_destroy(&s->s_dentry_lru);
-		list_lru_destroy(&s->s_inode_lru);
 
 		put_filesystem(fs);
 		put_super(s);
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 7a1ceb946b80..8876ac183373 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,5 +2,4 @@
 # Makefile for the sysfs virtual filesystem
 #
 
-obj-y		:= inode.o file.o dir.o symlink.o mount.o bin.o \
-		   group.o
+obj-y		:= inode.o file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
deleted file mode 100644
index c590cabd57bb..000000000000
--- a/fs/sysfs/bin.c
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * fs/sysfs/bin.c - sysfs binary file implementation
- *
- * Copyright (c) 2003 Patrick Mochel
- * Copyright (c) 2003 Matthew Wilcox
- * Copyright (c) 2004 Silicon Graphics, Inc.
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/kobject.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-#include <linux/uaccess.h>
-
-#include "sysfs.h"
-
-/*
- * There's one bin_buffer for each open file.
- *
- * filp->private_data points to bin_buffer and
- * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s
- * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock
- */
-static DEFINE_MUTEX(sysfs_bin_lock);
-
-struct bin_buffer {
-	struct mutex			mutex;
-	void				*buffer;
-	int				mmapped;
-	const struct vm_operations_struct *vm_ops;
-	struct file			*file;
-	struct hlist_node		list;
-};
-
-static int
-fill_read(struct file *file, char *buffer, loff_t off, size_t count)
-{
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	int rc;
-
-	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
-
-	rc = -EIO;
-	if (attr->read)
-		rc = attr->read(file, kobj, attr, buffer, off, count);
-
-	sysfs_put_active(attr_sd);
-
-	return rc;
-}
-
-static ssize_t
-read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
-{
-	struct bin_buffer *bb = file->private_data;
-	int size = file_inode(file)->i_size;
-	loff_t offs = *off;
-	int count = min_t(size_t, bytes, PAGE_SIZE);
-	char *temp;
-
-	if (!bytes)
-		return 0;
-
-	if (size) {
-		if (offs > size)
-			return 0;
-		if (offs + count > size)
-			count = size - offs;
-	}
-
-	temp = kmalloc(count, GFP_KERNEL);
-	if (!temp)
-		return -ENOMEM;
-
-	mutex_lock(&bb->mutex);
-
-	count = fill_read(file, bb->buffer, offs, count);
-	if (count < 0) {
-		mutex_unlock(&bb->mutex);
-		goto out_free;
-	}
-
-	memcpy(temp, bb->buffer, count);
-
-	mutex_unlock(&bb->mutex);
-
-	if (copy_to_user(userbuf, temp, count)) {
-		count = -EFAULT;
-		goto out_free;
-	}
-
-	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
-
-	*off = offs + count;
-
- out_free:
-	kfree(temp);
-	return count;
-}
-
-static int
-flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
-{
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	int rc;
-
-	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
-
-	rc = -EIO;
-	if (attr->write)
-		rc = attr->write(file, kobj, attr, buffer, offset, count);
-
-	sysfs_put_active(attr_sd);
-
-	return rc;
-}
-
-static ssize_t write(struct file *file, const char __user *userbuf,
-		     size_t bytes, loff_t *off)
-{
-	struct bin_buffer *bb = file->private_data;
-	int size = file_inode(file)->i_size;
-	loff_t offs = *off;
-	int count = min_t(size_t, bytes, PAGE_SIZE);
-	char *temp;
-
-	if (!bytes)
-		return 0;
-
-	if (size) {
-		if (offs > size)
-			return 0;
-		if (offs + count > size)
-			count = size - offs;
-	}
-
-	temp = memdup_user(userbuf, count);
-	if (IS_ERR(temp))
-		return PTR_ERR(temp);
-
-	mutex_lock(&bb->mutex);
-
-	memcpy(bb->buffer, temp, count);
-
-	count = flush_write(file, bb->buffer, offs, count);
-	mutex_unlock(&bb->mutex);
-
-	if (count > 0)
-		*off = offs + count;
-
-	kfree(temp);
-	return count;
-}
-
-static void bin_vma_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
-	if (!bb->vm_ops)
-		return;
-
-	if (!sysfs_get_active(attr_sd))
-		return;
-
-	if (bb->vm_ops->open)
-		bb->vm_ops->open(vma);
-
-	sysfs_put_active(attr_sd);
-}
-
-static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(attr_sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = VM_FAULT_SIGBUS;
-	if (bb->vm_ops->fault)
-		ret = bb->vm_ops->fault(vma, vmf);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(attr_sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = 0;
-	if (bb->vm_ops->page_mkwrite)
-		ret = bb->vm_ops->page_mkwrite(vma, vmf);
-	else
-		file_update_time(file);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-static int bin_access(struct vm_area_struct *vma, unsigned long addr,
-		  void *buf, int len, int write)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return -EINVAL;
-
-	if (!sysfs_get_active(attr_sd))
-		return -EINVAL;
-
-	ret = -EINVAL;
-	if (bb->vm_ops->access)
-		ret = bb->vm_ops->access(vma, addr, buf, len, write);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-#ifdef CONFIG_NUMA
-static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(attr_sd))
-		return -EINVAL;
-
-	ret = 0;
-	if (bb->vm_ops->set_policy)
-		ret = bb->vm_ops->set_policy(vma, new);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-
-static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
-					unsigned long addr)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct mempolicy *pol;
-
-	if (!bb->vm_ops)
-		return vma->vm_policy;
-
-	if (!sysfs_get_active(attr_sd))
-		return vma->vm_policy;
-
-	pol = vma->vm_policy;
-	if (bb->vm_ops->get_policy)
-		pol = bb->vm_ops->get_policy(vma, addr);
-
-	sysfs_put_active(attr_sd);
-	return pol;
-}
-
-static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
-			const nodemask_t *to, unsigned long flags)
-{
-	struct file *file = vma->vm_file;
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	int ret;
-
-	if (!bb->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(attr_sd))
-		return 0;
-
-	ret = 0;
-	if (bb->vm_ops->migrate)
-		ret = bb->vm_ops->migrate(vma, from, to, flags);
-
-	sysfs_put_active(attr_sd);
-	return ret;
-}
-#endif
-
-static const struct vm_operations_struct bin_vm_ops = {
-	.open		= bin_vma_open,
-	.fault		= bin_fault,
-	.page_mkwrite	= bin_page_mkwrite,
-	.access		= bin_access,
-#ifdef CONFIG_NUMA
-	.set_policy	= bin_set_policy,
-	.get_policy	= bin_get_policy,
-	.migrate	= bin_migrate,
-#endif
-};
-
-static int mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct bin_buffer *bb = file->private_data;
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	int rc;
-
-	mutex_lock(&bb->mutex);
-
-	/* need attr_sd for attr, its parent for kobj */
-	rc = -ENODEV;
-	if (!sysfs_get_active(attr_sd))
-		goto out_unlock;
-
-	rc = -EINVAL;
-	if (!attr->mmap)
-		goto out_put;
-
-	rc = attr->mmap(file, kobj, attr, vma);
-	if (rc)
-		goto out_put;
-
-	/*
-	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
-	 * to satisfy versions of X which crash if the mmap fails: that
-	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
-	 */
-	if (vma->vm_file != file)
-		goto out_put;
-
-	rc = -EINVAL;
-	if (bb->mmapped && bb->vm_ops != vma->vm_ops)
-		goto out_put;
-
-	/*
-	 * It is not possible to successfully wrap close.
-	 * So error if someone is trying to use close.
-	 */
-	rc = -EINVAL;
-	if (vma->vm_ops && vma->vm_ops->close)
-		goto out_put;
-
-	rc = 0;
-	bb->mmapped = 1;
-	bb->vm_ops = vma->vm_ops;
-	vma->vm_ops = &bin_vm_ops;
-out_put:
-	sysfs_put_active(attr_sd);
-out_unlock:
-	mutex_unlock(&bb->mutex);
-
-	return rc;
-}
-
-static int open(struct inode *inode, struct file *file)
-{
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
-	struct bin_buffer *bb = NULL;
-	int error;
-
-	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
-
-	error = -EACCES;
-	if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap))
-		goto err_out;
-	if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap))
-		goto err_out;
-
-	error = -ENOMEM;
-	bb = kzalloc(sizeof(*bb), GFP_KERNEL);
-	if (!bb)
-		goto err_out;
-
-	bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!bb->buffer)
-		goto err_out;
-
-	mutex_init(&bb->mutex);
-	bb->file = file;
-	file->private_data = bb;
-
-	mutex_lock(&sysfs_bin_lock);
-	hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
-	mutex_unlock(&sysfs_bin_lock);
-
-	/* open succeeded, put active references */
-	sysfs_put_active(attr_sd);
-	return 0;
-
- err_out:
-	sysfs_put_active(attr_sd);
-	kfree(bb);
-	return error;
-}
-
-static int release(struct inode *inode, struct file *file)
-{
-	struct bin_buffer *bb = file->private_data;
-
-	mutex_lock(&sysfs_bin_lock);
-	hlist_del(&bb->list);
-	mutex_unlock(&sysfs_bin_lock);
-
-	kfree(bb->buffer);
-	kfree(bb);
-	return 0;
-}
-
-const struct file_operations bin_fops = {
-	.read		= read,
-	.write		= write,
-	.mmap		= mmap,
-	.llseek		= generic_file_llseek,
-	.open		= open,
-	.release	= release,
-};
-
-
-void unmap_bin_file(struct sysfs_dirent *attr_sd)
-{
-	struct bin_buffer *bb;
-
-	if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
-		return;
-
-	mutex_lock(&sysfs_bin_lock);
-
-	hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) {
-		struct inode *inode = file_inode(bb->file);
-
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
-	}
-
-	mutex_unlock(&sysfs_bin_lock);
-}
-
-/**
- *	sysfs_create_bin_file - create binary file for object.
- *	@kobj:	object.
- *	@attr:	attribute descriptor.
- */
-int sysfs_create_bin_file(struct kobject *kobj,
-			  const struct bin_attribute *attr)
-{
-	BUG_ON(!kobj || !kobj->sd || !attr);
-
-	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
-}
-EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
-
-/**
- *	sysfs_remove_bin_file - remove binary file for object.
- *	@kobj:	object.
- *	@attr:	attribute descriptor.
- */
-void sysfs_remove_bin_file(struct kobject *kobj,
-			   const struct bin_attribute *attr)
-{
-	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
-}
-EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4d83cedb9fcb..5e73d6626e50 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -26,21 +26,21 @@
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
-DEFINE_SPINLOCK(sysfs_assoc_lock);
+DEFINE_SPINLOCK(sysfs_symlink_target_lock);
 
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb);
+#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
 static DEFINE_IDA(sysfs_ino_ida);
 
 /**
  *	sysfs_name_hash
- *	@ns:   Namespace tag to hash
  *	@name: Null terminated string to hash
+ *	@ns:   Namespace tag to hash
  *
  *	Returns 31 bit hash of ns + name (so it fits in an off_t )
  */
-static unsigned int sysfs_name_hash(const void *ns, const char *name)
+static unsigned int sysfs_name_hash(const char *name, const void *ns)
 {
 	unsigned long hash = init_name_hash();
 	unsigned int len = strlen(name);
@@ -56,8 +56,8 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name)
 	return hash;
 }
 
-static int sysfs_name_compare(unsigned int hash, const void *ns,
-	const char *name, const struct sysfs_dirent *sd)
+static int sysfs_name_compare(unsigned int hash, const char *name,
+			      const void *ns, const struct sysfs_dirent *sd)
 {
 	if (hash != sd->s_hash)
 		return hash - sd->s_hash;
@@ -69,7 +69,7 @@ static int sysfs_name_compare(unsigned int hash, const void *ns,
 static int sysfs_sd_compare(const struct sysfs_dirent *left,
 			    const struct sysfs_dirent *right)
 {
-	return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name,
+	return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
 				  right);
 }
 
@@ -132,24 +132,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
 }
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static bool ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return sysfs_type(sd) == SYSFS_KOBJ_ATTR &&
-			sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-static inline bool ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return true;
-}
-
-#endif
-
 /**
  *	sysfs_get_active - get an active reference to sysfs_dirent
  *	@sd: sysfs_dirent to get an active reference to
@@ -168,7 +150,7 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 	if (!atomic_inc_unless_negative(&sd->s_active))
 		return NULL;
 
-	if (likely(!ignore_lockdep(sd)))
+	if (likely(!sysfs_ignore_lockdep(sd)))
 		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
 	return sd;
 }
@@ -187,7 +169,7 @@ void sysfs_put_active(struct sysfs_dirent *sd)
 	if (unlikely(!sd))
 		return;
 
-	if (likely(!ignore_lockdep(sd)))
+	if (likely(!sysfs_ignore_lockdep(sd)))
 		rwsem_release(&sd->dep_map, 1, _RET_IP_);
 	v = atomic_dec_return(&sd->s_active);
 	if (likely(v != SD_DEACTIVATED_BIAS))
@@ -400,22 +382,19 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 /**
  *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
  *	@acxt: pointer to sysfs_addrm_cxt to be used
- *	@parent_sd: parent sysfs_dirent
  *
- *	This function is called when the caller is about to add or
- *	remove sysfs_dirent under @parent_sd.  This function acquires
- *	sysfs_mutex.  @acxt is used to keep and pass context to
- *	other addrm functions.
+ *	This function is called when the caller is about to add or remove
+ *	sysfs_dirent.  This function acquires sysfs_mutex.  @acxt is used
+ *	to keep and pass context to other addrm functions.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).  sysfs_mutex is locked on
  *	return.
  */
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
-		       struct sysfs_dirent *parent_sd)
+void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
+	__acquires(sysfs_mutex)
 {
 	memset(acxt, 0, sizeof(*acxt));
-	acxt->parent_sd = parent_sd;
 
 	mutex_lock(&sysfs_mutex);
 }
@@ -424,10 +403,11 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *	__sysfs_add_one - add sysfs_dirent to parent without warning
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
+ *	@parent_sd: the parent sysfs_dirent to add @sd to
  *
- *	Get @acxt->parent_sd and set sd->s_parent to it and increment
- *	nlink of parent inode if @sd is a directory and link into the
- *	children list of the parent.
+ *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
+ *	the parent inode if @sd is a directory and link into the children
+ *	list of the parent.
  *
  *	This function should be called between calls to
  *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -440,27 +420,28 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *	0 on success, -EEXIST if entry with the given name already
  *	exists.
  */
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		    struct sysfs_dirent *parent_sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
 
-	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
+	if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid",
-			acxt->parent_sd->s_name, sd->s_name);
+			sysfs_ns_type(parent_sd) ? "required" : "invalid",
+			parent_sd->s_name, sd->s_name);
 		return -EINVAL;
 	}
 
-	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
-	sd->s_parent = sysfs_get(acxt->parent_sd);
+	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
+	sd->s_parent = sysfs_get(parent_sd);
 
 	ret = sysfs_link_sibling(sd);
 	if (ret)
 		return ret;
 
 	/* Update timestamps on the parent */
-	ps_iattr = acxt->parent_sd->s_iattr;
+	ps_iattr = parent_sd->s_iattr;
 	if (ps_iattr) {
 		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
 		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
@@ -490,14 +471,32 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
 	return path;
 }
 
+void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
+{
+	char *path;
+
+	path = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (path) {
+		sysfs_pathname(parent, path);
+		strlcat(path, "/", PATH_MAX);
+		strlcat(path, name, PATH_MAX);
+	}
+
+	WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n",
+	     path ? path : name);
+
+	kfree(path);
+}
+
 /**
  *	sysfs_add_one - add sysfs_dirent to parent
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
+ *	@parent_sd: the parent sysfs_dirent to add @sd to
  *
- *	Get @acxt->parent_sd and set sd->s_parent to it and increment
- *	nlink of parent inode if @sd is a directory and link into the
- *	children list of the parent.
+ *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
+ *	the parent inode if @sd is a directory and link into the children
+ *	list of the parent.
  *
  *	This function should be called between calls to
  *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -510,23 +509,15 @@ static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
  *	0 on success, -EEXIST if entry with the given name already
  *	exists.
  */
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		  struct sysfs_dirent *parent_sd)
 {
 	int ret;
 
-	ret = __sysfs_add_one(acxt, sd);
-	if (ret == -EEXIST) {
-		char *path = kzalloc(PATH_MAX, GFP_KERNEL);
-		WARN(1, KERN_WARNING
-		     "sysfs: cannot create duplicate filename '%s'\n",
-		     (path == NULL) ? sd->s_name
-				    : (sysfs_pathname(acxt->parent_sd, path),
-				       strlcat(path, "/", PATH_MAX),
-				       strlcat(path, sd->s_name, PATH_MAX),
-				       path));
-		kfree(path);
-	}
+	ret = __sysfs_add_one(acxt, sd, parent_sd);
 
+	if (ret == -EEXIST)
+		sysfs_warn_dup(parent_sd, sd->s_name);
 	return ret;
 }
 
@@ -545,16 +536,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  *	LOCKING:
  *	Determined by sysfs_addrm_start().
  */
-void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
+			     struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
+	/*
+	 * Removal can be called multiple times on the same node.  Only the
+	 * first invocation is effective and puts the base ref.
+	 */
+	if (sd->s_flags & SYSFS_FLAG_REMOVED)
+		return;
 
 	sysfs_unlink_sibling(sd);
 
 	/* Update timestamps on the parent */
-	ps_iattr = acxt->parent_sd->s_iattr;
+	ps_iattr = sd->s_parent->s_iattr;
 	if (ps_iattr) {
 		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
 		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
@@ -577,6 +574,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  *	sysfs_mutex is released.
  */
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
+	__releases(sysfs_mutex)
 {
 	/* release resources acquired by sysfs_addrm_start() */
 	mutex_unlock(&sysfs_mutex);
@@ -588,7 +586,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 		acxt->removed = sd->u.removed_list;
 
 		sysfs_deactivate(sd);
-		unmap_bin_file(sd);
+		sysfs_unmap_bin_file(sd);
 		sysfs_put(sd);
 	}
 }
@@ -597,6 +595,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  *	sysfs_find_dirent - find sysfs_dirent with the given name
  *	@parent_sd: sysfs_dirent to search under
  *	@name: name to look for
+ *	@ns: the namespace tag to use
  *
  *	Look for sysfs_dirent with name @name under @parent_sd.
  *
@@ -607,8 +606,8 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  *	Pointer to sysfs_dirent if found, NULL if not.
  */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const void *ns,
-				       const unsigned char *name)
+				       const unsigned char *name,
+				       const void *ns)
 {
 	struct rb_node *node = parent_sd->s_dir.children.rb_node;
 	unsigned int hash;
@@ -620,13 +619,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 		return NULL;
 	}
 
-	hash = sysfs_name_hash(ns, name);
+	hash = sysfs_name_hash(name, ns);
 	while (node) {
 		struct sysfs_dirent *sd;
 		int result;
 
 		sd = to_sysfs_dirent(node);
-		result = sysfs_name_compare(hash, ns, name, sd);
+		result = sysfs_name_compare(hash, name, ns, sd);
 		if (result < 0)
 			node = node->rb_left;
 		else if (result > 0)
@@ -638,9 +637,10 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 }
 
 /**
- *	sysfs_get_dirent - find and get sysfs_dirent with the given name
+ *	sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
  *	@parent_sd: sysfs_dirent to search under
  *	@name: name to look for
+ *	@ns: the namespace tag to use
  *
  *	Look for sysfs_dirent with name @name under @parent_sd and get
  *	it if found.
@@ -651,24 +651,25 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
  *	RETURNS:
  *	Pointer to sysfs_dirent if found, NULL if not.
  */
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-				      const void *ns,
-				      const unsigned char *name)
+struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
+					 const unsigned char *name,
+					 const void *ns)
 {
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, ns, name);
+	sd = sysfs_find_dirent(parent_sd, name, ns);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
 	return sd;
 }
-EXPORT_SYMBOL_GPL(sysfs_get_dirent);
+EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-	enum kobj_ns_type type, const void *ns, const char *name,
-	struct sysfs_dirent **p_sd)
+		      enum kobj_ns_type type,
+		      const char *name, const void *ns,
+		      struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -685,8 +686,8 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
-	sysfs_addrm_start(&acxt, parent_sd);
-	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_start(&acxt);
+	rc = sysfs_add_one(&acxt, sd, parent_sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (rc == 0)
@@ -701,7 +702,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
 	return create_dir(kobj, kobj->sd,
-			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+			  KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
 }
 
 /**
@@ -730,14 +731,14 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
 }
 
 /**
- *	sysfs_create_dir - create a directory for an object.
- *	@kobj:		object we're creating directory for.
+ * sysfs_create_dir_ns - create a directory for an object with a namespace tag
+ * @kobj: object we're creating directory for
+ * @ns: the namespace tag to use
  */
-int sysfs_create_dir(struct kobject *kobj)
+int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
 	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
-	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -750,11 +751,9 @@ int sysfs_create_dir(struct kobject *kobj)
 	if (!parent_sd)
 		return -ENOENT;
 
-	if (sysfs_ns_type(parent_sd))
-		ns = kobj->ktype->namespace(kobj);
 	type = sysfs_read_ns_type(kobj);
 
-	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
+	error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -776,7 +775,7 @@ static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	type = sysfs_ns_type(parent_sd);
 	ns = sysfs_info(dir->i_sb)->ns[type];
 
-	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
+	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
 
 	/* no such entry */
 	if (!sd) {
@@ -807,41 +806,128 @@ const struct inode_operations sysfs_dir_inode_operations = {
 	.setxattr	= sysfs_setxattr,
 };
 
-static void remove_dir(struct sysfs_dirent *sd)
+static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
 {
-	struct sysfs_addrm_cxt acxt;
+	struct sysfs_dirent *last;
 
-	sysfs_addrm_start(&acxt, sd->s_parent);
-	sysfs_remove_one(&acxt, sd);
-	sysfs_addrm_finish(&acxt);
+	while (true) {
+		struct rb_node *rbn;
+
+		last = pos;
+
+		if (sysfs_type(pos) != SYSFS_DIR)
+			break;
+
+		rbn = rb_first(&pos->s_dir.children);
+		if (!rbn)
+			break;
+
+		pos = to_sysfs_dirent(rbn);
+	}
+
+	return last;
 }
 
-void sysfs_remove_subdir(struct sysfs_dirent *sd)
+/**
+ * sysfs_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sysfs_dirent whose descendants to walk
+ *
+ * Find the next descendant to visit for post-order traversal of @root's
+ * descendants.  @root is included in the iteration and the last node to be
+ * visited.
+ */
+static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
+						       struct sysfs_dirent *root)
 {
-	remove_dir(sd);
+	struct rb_node *rbn;
+
+	lockdep_assert_held(&sysfs_mutex);
+
+	/* if first iteration, visit leftmost descendant which may be root */
+	if (!pos)
+		return sysfs_leftmost_descendant(root);
+
+	/* if we visited @root, we're done */
+	if (pos == root)
+		return NULL;
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	rbn = rb_next(&pos->s_rb);
+	if (rbn)
+		return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
+
+	/* no sibling left, visit parent */
+	return pos->s_parent;
 }
 
+static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
+			   struct sysfs_dirent *sd)
+{
+	struct sysfs_dirent *pos, *next;
+
+	if (!sd)
+		return;
 
-static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
+	pr_debug("sysfs %s: removing\n", sd->s_name);
+
+	next = NULL;
+	do {
+		pos = next;
+		next = sysfs_next_descendant_post(pos, sd);
+		if (pos)
+			sysfs_remove_one(acxt, pos);
+	} while (next);
+}
+
+/**
+ * sysfs_remove - remove a sysfs_dirent recursively
+ * @sd: the sysfs_dirent to remove
+ *
+ * Remove @sd along with all its subdirectories and files.
+ */
+void sysfs_remove(struct sysfs_dirent *sd)
 {
 	struct sysfs_addrm_cxt acxt;
-	struct rb_node *pos;
 
-	if (!dir_sd)
-		return;
+	sysfs_addrm_start(&acxt);
+	__sysfs_remove(&acxt, sd);
+	sysfs_addrm_finish(&acxt);
+}
 
-	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
-	sysfs_addrm_start(&acxt, dir_sd);
-	pos = rb_first(&dir_sd->s_dir.children);
-	while (pos) {
-		struct sysfs_dirent *sd = to_sysfs_dirent(pos);
-		pos = rb_next(pos);
-		if (sysfs_type(sd) != SYSFS_DIR)
-			sysfs_remove_one(&acxt, sd);
+/**
+ * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
+ * @dir_sd: parent of the target
+ * @name: name of the sysfs_dirent to remove
+ * @ns: namespace tag of the sysfs_dirent to remove
+ *
+ * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
+ * it.  Returns 0 on success, -ENOENT if such entry doesn't exist.
+ */
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
+			  const void *ns)
+{
+	struct sysfs_addrm_cxt acxt;
+	struct sysfs_dirent *sd;
+
+	if (!dir_sd) {
+		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
+			name);
+		return -ENOENT;
 	}
+
+	sysfs_addrm_start(&acxt);
+
+	sd = sysfs_find_dirent(dir_sd, name, ns);
+	if (sd)
+		__sysfs_remove(&acxt, sd);
+
 	sysfs_addrm_finish(&acxt);
 
-	remove_dir(dir_sd);
+	if (sd)
+		return 0;
+	else
+		return -ENOENT;
 }
 
 /**
@@ -852,21 +938,34 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
  *	the directory before we remove the directory, and we've inlined
  *	what used to be sysfs_rmdir() below, instead of calling separately.
  */
-
 void sysfs_remove_dir(struct kobject *kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 
-	spin_lock(&sysfs_assoc_lock);
+	/*
+	 * In general, kboject owner is responsible for ensuring removal
+	 * doesn't race with other operations and sysfs doesn't provide any
+	 * protection; however, when @kobj is used as a symlink target, the
+	 * symlinking entity usually doesn't own @kobj and thus has no
+	 * control over removal.  @kobj->sd may be removed anytime and
+	 * symlink code may end up dereferencing an already freed sd.
+	 *
+	 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation
+	 * against symlink operations so that symlink code can safely
+	 * dereference @kobj->sd.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
 	kobj->sd = NULL;
-	spin_unlock(&sysfs_assoc_lock);
+	spin_unlock(&sysfs_symlink_target_lock);
 
-	__sysfs_remove_dir(sd);
+	if (sd) {
+		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
+		sysfs_remove(sd);
+	}
 }
 
-int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const void *new_ns,
-	const char *new_name)
+int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
+		 const char *new_name, const void *new_ns)
 {
 	int error;
 
@@ -878,7 +977,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -899,7 +998,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
 	sd->s_ns = new_ns;
-	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
+	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
 	sd->s_parent = new_parent_sd;
 	sysfs_link_sibling(sd);
 
@@ -909,30 +1008,25 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	return error;
 }
 
-int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
+int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
+			const void *new_ns)
 {
 	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
-	const void *new_ns = NULL;
-
-	if (sysfs_ns_type(parent_sd))
-		new_ns = kobj->ktype->namespace(kobj);
 
-	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
+	return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns);
 }
 
-int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
+int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
+		      const void *new_ns)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
-	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
-	if (sysfs_ns_type(sd->s_parent))
-		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 15ef5eb13663..79b5da2acbe1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -21,70 +21,114 @@
 #include <linux/mutex.h>
 #include <linux/limits.h>
 #include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/mm.h>
 
 #include "sysfs.h"
 
 /*
- * There's one sysfs_buffer for each open file and one
- * sysfs_open_dirent for each sysfs_dirent with one or more open
- * files.
+ * There's one sysfs_open_file for each open file and one sysfs_open_dirent
+ * for each sysfs_dirent with one or more open files.
  *
- * filp->private_data points to sysfs_buffer and
- * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open
- * is protected by sysfs_open_dirent_lock.
+ * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open is
+ * protected by sysfs_open_dirent_lock.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * sysfs_open_file.  sysfs_open_files are chained at
+ * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
  */
 static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
+static DEFINE_MUTEX(sysfs_open_file_mutex);
 
 struct sysfs_open_dirent {
 	atomic_t		refcnt;
 	atomic_t		event;
 	wait_queue_head_t	poll;
-	struct list_head	buffers; /* goes through sysfs_buffer.list */
+	struct list_head	files; /* goes through sysfs_open_file.list */
 };
 
-struct sysfs_buffer {
-	size_t			count;
-	loff_t			pos;
-	char			*page;
-	const struct sysfs_ops	*ops;
+struct sysfs_open_file {
+	struct sysfs_dirent	*sd;
+	struct file		*file;
 	struct mutex		mutex;
-	int			needs_read_fill;
 	int			event;
 	struct list_head	list;
+
+	bool			mmapped;
+	const struct vm_operations_struct *vm_ops;
 };
 
-/**
- *	fill_read_buffer - allocate and fill buffer from object.
- *	@dentry:	dentry pointer.
- *	@buffer:	data buffer for file.
- *
- *	Allocate @buffer->page, if it hasn't been already, then call the
- *	kobject's show() method to fill the buffer with this attribute's
- *	data.
- *	This is called only once, on the file's first read unless an error
- *	is returned.
+static bool sysfs_is_bin(struct sysfs_dirent *sd)
+{
+	return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
+}
+
+static struct sysfs_open_file *sysfs_of(struct file *file)
+{
+	return ((struct seq_file *)file->private_data)->private;
+}
+
+/*
+ * Determine ktype->sysfs_ops for the given sysfs_dirent.  This function
+ * must be called while holding an active reference.
  */
-static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
+static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops = buffer->ops;
-	int ret = 0;
+	struct kobject *kobj = sd->s_parent->s_dir.kobj;
+
+	if (!sysfs_ignore_lockdep(sd))
+		lockdep_assert_held(sd);
+	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
+}
+
+/*
+ * Reads on sysfs are handled through seq_file, which takes care of hairy
+ * details like buffering and seeking.  The following function pipes
+ * sysfs_ops->show() result through seq_file.
+ */
+static int sysfs_seq_show(struct seq_file *sf, void *v)
+{
+	struct sysfs_open_file *of = sf->private;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
+	const struct sysfs_ops *ops;
+	char *buf;
 	ssize_t count;
 
-	if (!buffer->page)
-		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!buffer->page)
-		return -ENOMEM;
+	/* acquire buffer and ensure that it's >= PAGE_SIZE */
+	count = seq_get_buf(sf, &buf);
+	if (count < PAGE_SIZE) {
+		seq_commit(sf, -1);
+		return 0;
+	}
 
-	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
+	/*
+	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
+	 * nests outside active ref and is just to ensure that the ops
+	 * aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!sysfs_get_active(of->sd)) {
+		mutex_unlock(&of->mutex);
 		return -ENODEV;
+	}
 
-	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
-	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
+	of->event = atomic_read(&of->sd->s_attr.open->event);
 
-	sysfs_put_active(attr_sd);
+	/*
+	 * Lookup @ops and invoke show().  Control may reach here via seq
+	 * file lseek even if @ops->show() isn't implemented.
+	 */
+	ops = sysfs_file_ops(of->sd);
+	if (ops->show)
+		count = ops->show(kobj, of->sd->s_attr.attr, buf);
+	else
+		count = 0;
+
+	sysfs_put_active(of->sd);
+	mutex_unlock(&of->mutex);
+
+	if (count < 0)
+		return count;
 
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
@@ -96,155 +140,389 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
 		/* Try to struggle along */
 		count = PAGE_SIZE - 1;
 	}
-	if (count >= 0) {
-		buffer->needs_read_fill = 0;
-		buffer->count = count;
-	} else {
-		ret = count;
-	}
-	return ret;
+	seq_commit(sf, count);
+	return 0;
 }
 
-/**
- *	sysfs_read_file - read an attribute.
- *	@file:	file pointer.
- *	@buf:	buffer to fill.
- *	@count:	number of bytes to read.
- *	@ppos:	starting offset in file.
- *
- *	Userspace wants to read an attribute file. The attribute descriptor
- *	is in the file's ->d_fsdata. The target object is in the directory's
- *	->d_fsdata.
- *
- *	We call fill_read_buffer() to allocate and fill the buffer from the
- *	object's show() method exactly once (if the read is happening from
- *	the beginning of the file). That should fill the entire buffer with
- *	all the data the object has to offer for that attribute.
- *	We then call flush_read_buffer() to copy the buffer to userspace
- *	in the increments specified.
+/*
+ * Read method for bin files.  As reading a bin file can have side-effects,
+ * the exact offset and bytes specified in read(2) call should be passed to
+ * the read callback making it difficult to use seq_file.  Implement
+ * simplistic custom buffering for bin files.
  */
-
-static ssize_t
-sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
+			      size_t bytes, loff_t *off)
 {
-	struct sysfs_buffer *buffer = file->private_data;
-	ssize_t retval = 0;
+	struct sysfs_open_file *of = sysfs_of(file);
+	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
+	loff_t size = file_inode(file)->i_size;
+	int count = min_t(size_t, bytes, PAGE_SIZE);
+	loff_t offs = *off;
+	char *buf;
+
+	if (!bytes)
+		return 0;
 
-	mutex_lock(&buffer->mutex);
-	if (buffer->needs_read_fill || *ppos == 0) {
-		retval = fill_read_buffer(file->f_path.dentry, buffer);
-		if (retval)
-			goto out;
+	if (size) {
+		if (offs > size)
+			return 0;
+		if (offs + count > size)
+			count = size - offs;
 	}
-	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
-		 __func__, count, *ppos, buffer->page);
-	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
-					 buffer->count);
-out:
-	mutex_unlock(&buffer->mutex);
-	return retval;
-}
 
-/**
- *	fill_write_buffer - copy buffer from userspace.
- *	@buffer:	data buffer for file.
- *	@buf:		data from user.
- *	@count:		number of bytes in @userbuf.
- *
- *	Allocate @buffer->page if it hasn't been already, then
- *	copy the user-supplied buffer into it.
- */
-static int fill_write_buffer(struct sysfs_buffer *buffer,
-			     const char __user *buf, size_t count)
-{
-	int error;
-
-	if (!buffer->page)
-		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
-	if (!buffer->page)
+	buf = kmalloc(count, GFP_KERNEL);
+	if (!buf)
 		return -ENOMEM;
 
-	if (count >= PAGE_SIZE)
-		count = PAGE_SIZE - 1;
-	error = copy_from_user(buffer->page, buf, count);
-	buffer->needs_read_fill = 1;
-	/* if buf is assumed to contain a string, terminate it by \0,
-	   so e.g. sscanf() can scan the string easily */
-	buffer->page[count] = 0;
-	return error ? -EFAULT : count;
-}
+	/* need of->sd for battr, its parent for kobj */
+	mutex_lock(&of->mutex);
+	if (!sysfs_get_active(of->sd)) {
+		count = -ENODEV;
+		mutex_unlock(&of->mutex);
+		goto out_free;
+	}
+
+	if (battr->read)
+		count = battr->read(file, kobj, battr, buf, offs, count);
+	else
+		count = -EIO;
 
+	sysfs_put_active(of->sd);
+	mutex_unlock(&of->mutex);
+
+	if (count < 0)
+		goto out_free;
+
+	if (copy_to_user(userbuf, buf, count)) {
+		count = -EFAULT;
+		goto out_free;
+	}
+
+	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
+
+	*off = offs + count;
+
+ out_free:
+	kfree(buf);
+	return count;
+}
 
 /**
- *	flush_write_buffer - push buffer to kobject.
- *	@dentry:	dentry to the attribute
- *	@buffer:	data buffer for file.
- *	@count:		number of bytes
+ * flush_write_buffer - push buffer to kobject
+ * @of: open file
+ * @buf: data buffer for file
+ * @off: file offset to write to
+ * @count: number of bytes
  *
- *	Get the correct pointers for the kobject and the attribute we're
- *	dealing with, then call the store() method for the attribute,
- *	passing the buffer that we acquired in fill_write_buffer().
+ * Get the correct pointers for the kobject and the attribute we're dealing
+ * with, then call the store() method for it with @buf.
  */
-static int flush_write_buffer(struct dentry *dentry,
-			      struct sysfs_buffer *buffer, size_t count)
+static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
+			      size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops = buffer->ops;
-	int rc;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
+	int rc = 0;
 
-	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
+	/*
+	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
+	 * nests outside active ref and is just to ensure that the ops
+	 * aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!sysfs_get_active(of->sd)) {
+		mutex_unlock(&of->mutex);
 		return -ENODEV;
+	}
 
-	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
+	if (sysfs_is_bin(of->sd)) {
+		struct bin_attribute *battr = of->sd->s_attr.bin_attr;
 
-	sysfs_put_active(attr_sd);
+		rc = -EIO;
+		if (battr->write)
+			rc = battr->write(of->file, kobj, battr, buf, off,
+					  count);
+	} else {
+		const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
+
+		rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
+	}
+
+	sysfs_put_active(of->sd);
+	mutex_unlock(&of->mutex);
 
 	return rc;
 }
 
-
 /**
- *	sysfs_write_file - write an attribute.
- *	@file:	file pointer
- *	@buf:	data to write
- *	@count:	number of bytes
- *	@ppos:	starting offset
+ * sysfs_write_file - write an attribute
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Copy data in from userland and pass it to the matching
+ * sysfs_ops->store() by invoking flush_write_buffer().
  *
- *	Similar to sysfs_read_file(), though working in the opposite direction.
- *	We allocate and fill the data from the user in fill_write_buffer(),
- *	then push it to the kobject in flush_write_buffer().
- *	There is no easy way for us to know if userspace is only doing a partial
- *	write, so we don't support them. We expect the entire buffer to come
- *	on the first write.
- *	Hint: if you're writing a value, first read the file, modify only the
- *	the value you're changing, then write entire buffer back.
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write.  Hint: if you're writing a value, first read the file,
+ * modify only the the value you're changing, then write entire buffer
+ * back.
  */
-static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
+static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
-	struct sysfs_buffer *buffer = file->private_data;
-	ssize_t len;
+	struct sysfs_open_file *of = sysfs_of(file);
+	ssize_t len = min_t(size_t, count, PAGE_SIZE);
+	loff_t size = file_inode(file)->i_size;
+	char *buf;
+
+	if (sysfs_is_bin(of->sd) && size) {
+		if (size <= *ppos)
+			return 0;
+		len = min_t(ssize_t, len, size - *ppos);
+	}
 
-	mutex_lock(&buffer->mutex);
-	len = fill_write_buffer(buffer, buf, count);
-	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, buffer, len);
+	if (!len)
+		return 0;
+
+	buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, user_buf, len)) {
+		len = -EFAULT;
+		goto out_free;
+	}
+	buf[len] = '\0';	/* guarantee string termination */
+
+	len = flush_write_buffer(of, buf, *ppos, len);
 	if (len > 0)
 		*ppos += len;
-	mutex_unlock(&buffer->mutex);
+out_free:
+	kfree(buf);
 	return len;
 }
 
+static void sysfs_bin_vma_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+
+	if (!of->vm_ops)
+		return;
+
+	if (!sysfs_get_active(of->sd))
+		return;
+
+	if (of->vm_ops->open)
+		of->vm_ops->open(vma);
+
+	sysfs_put_active(of->sd);
+}
+
+static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!sysfs_get_active(of->sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = VM_FAULT_SIGBUS;
+	if (of->vm_ops->fault)
+		ret = of->vm_ops->fault(vma, vmf);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma,
+				  struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!sysfs_get_active(of->sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = 0;
+	if (of->vm_ops->page_mkwrite)
+		ret = of->vm_ops->page_mkwrite(vma, vmf);
+	else
+		file_update_time(file);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
+			    void *buf, int len, int write)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return -EINVAL;
+
+	if (!sysfs_get_active(of->sd))
+		return -EINVAL;
+
+	ret = -EINVAL;
+	if (of->vm_ops->access)
+		ret = of->vm_ops->access(vma, addr, buf, len, write);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int sysfs_bin_set_policy(struct vm_area_struct *vma,
+				struct mempolicy *new)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!sysfs_get_active(of->sd))
+		return -EINVAL;
+
+	ret = 0;
+	if (of->vm_ops->set_policy)
+		ret = of->vm_ops->set_policy(vma, new);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+
+static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	struct mempolicy *pol;
+
+	if (!of->vm_ops)
+		return vma->vm_policy;
+
+	if (!sysfs_get_active(of->sd))
+		return vma->vm_policy;
+
+	pol = vma->vm_policy;
+	if (of->vm_ops->get_policy)
+		pol = of->vm_ops->get_policy(vma, addr);
+
+	sysfs_put_active(of->sd);
+	return pol;
+}
+
+static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
+			     const nodemask_t *to, unsigned long flags)
+{
+	struct file *file = vma->vm_file;
+	struct sysfs_open_file *of = sysfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!sysfs_get_active(of->sd))
+		return 0;
+
+	ret = 0;
+	if (of->vm_ops->migrate)
+		ret = of->vm_ops->migrate(vma, from, to, flags);
+
+	sysfs_put_active(of->sd);
+	return ret;
+}
+#endif
+
+static const struct vm_operations_struct sysfs_bin_vm_ops = {
+	.open		= sysfs_bin_vma_open,
+	.fault		= sysfs_bin_fault,
+	.page_mkwrite	= sysfs_bin_page_mkwrite,
+	.access		= sysfs_bin_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= sysfs_bin_set_policy,
+	.get_policy	= sysfs_bin_get_policy,
+	.migrate	= sysfs_bin_migrate,
+#endif
+};
+
+static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sysfs_open_file *of = sysfs_of(file);
+	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
+	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
+	int rc;
+
+	mutex_lock(&of->mutex);
+
+	/* need of->sd for battr, its parent for kobj */
+	rc = -ENODEV;
+	if (!sysfs_get_active(of->sd))
+		goto out_unlock;
+
+	if (!battr->mmap)
+		goto out_put;
+
+	rc = battr->mmap(file, kobj, battr, vma);
+	if (rc)
+		goto out_put;
+
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
+
+	rc = -EINVAL;
+	if (of->mmapped && of->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
+	rc = 0;
+	of->mmapped = 1;
+	of->vm_ops = vma->vm_ops;
+	vma->vm_ops = &sysfs_bin_vm_ops;
+out_put:
+	sysfs_put_active(of->sd);
+out_unlock:
+	mutex_unlock(&of->mutex);
+
+	return rc;
+}
+
 /**
  *	sysfs_get_open_dirent - get or create sysfs_open_dirent
  *	@sd: target sysfs_dirent
- *	@buffer: sysfs_buffer for this instance of open
+ *	@of: sysfs_open_file for this instance of open
  *
  *	If @sd->s_attr.open exists, increment its reference count;
- *	otherwise, create one.  @buffer is chained to the buffers
- *	list.
+ *	otherwise, create one.  @of is chained to the files list.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).
@@ -253,11 +531,12 @@ static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
  *	0 on success, -errno on failure.
  */
 static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
-				 struct sysfs_buffer *buffer)
+				 struct sysfs_open_file *of)
 {
 	struct sysfs_open_dirent *od, *new_od = NULL;
 
  retry:
+	mutex_lock(&sysfs_open_file_mutex);
 	spin_lock_irq(&sysfs_open_dirent_lock);
 
 	if (!sd->s_attr.open && new_od) {
@@ -268,10 +547,11 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	od = sd->s_attr.open;
 	if (od) {
 		atomic_inc(&od->refcnt);
-		list_add_tail(&buffer->list, &od->buffers);
+		list_add_tail(&of->list, &od->files);
 	}
 
 	spin_unlock_irq(&sysfs_open_dirent_lock);
+	mutex_unlock(&sysfs_open_file_mutex);
 
 	if (od) {
 		kfree(new_od);
@@ -286,36 +566,40 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	atomic_set(&new_od->refcnt, 0);
 	atomic_set(&new_od->event, 1);
 	init_waitqueue_head(&new_od->poll);
-	INIT_LIST_HEAD(&new_od->buffers);
+	INIT_LIST_HEAD(&new_od->files);
 	goto retry;
 }
 
 /**
  *	sysfs_put_open_dirent - put sysfs_open_dirent
  *	@sd: target sysfs_dirent
- *	@buffer: associated sysfs_buffer
+ *	@of: associated sysfs_open_file
  *
- *	Put @sd->s_attr.open and unlink @buffer from the buffers list.
- *	If reference count reaches zero, disassociate and free it.
+ *	Put @sd->s_attr.open and unlink @of from the files list.  If
+ *	reference count reaches zero, disassociate and free it.
  *
  *	LOCKING:
  *	None.
  */
 static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
-				  struct sysfs_buffer *buffer)
+				  struct sysfs_open_file *of)
 {
 	struct sysfs_open_dirent *od = sd->s_attr.open;
 	unsigned long flags;
 
+	mutex_lock(&sysfs_open_file_mutex);
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
-	list_del(&buffer->list);
+	if (of)
+		list_del(&of->list);
+
 	if (atomic_dec_and_test(&od->refcnt))
 		sd->s_attr.open = NULL;
 	else
 		od = NULL;
 
 	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
+	mutex_unlock(&sysfs_open_file_mutex);
 
 	kfree(od);
 }
@@ -324,67 +608,81 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_buffer *buffer;
-	const struct sysfs_ops *ops;
+	struct sysfs_open_file *of;
+	bool has_read, has_write;
 	int error = -EACCES;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
-	/* every kobject with an attribute needs a ktype assigned */
-	if (kobj->ktype && kobj->ktype->sysfs_ops)
-		ops = kobj->ktype->sysfs_ops;
-	else {
-		WARN(1, KERN_ERR
-		     "missing sysfs attribute operations for kobject: %s\n",
-		     kobject_name(kobj));
-		goto err_out;
-	}
+	if (sysfs_is_bin(attr_sd)) {
+		struct bin_attribute *battr = attr_sd->s_attr.bin_attr;
 
-	/* File needs write support.
-	 * The inode's perms must say it's ok,
-	 * and we must have a store method.
-	 */
-	if (file->f_mode & FMODE_WRITE) {
-		if (!(inode->i_mode & S_IWUGO) || !ops->store)
-			goto err_out;
-	}
+		has_read = battr->read || battr->mmap;
+		has_write = battr->write || battr->mmap;
+	} else {
+		const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
 
-	/* File needs read support.
-	 * The inode's perms must say it's ok, and we there
-	 * must be a show method for it.
-	 */
-	if (file->f_mode & FMODE_READ) {
-		if (!(inode->i_mode & S_IRUGO) || !ops->show)
+		/* every kobject with an attribute needs a ktype assigned */
+		if (WARN(!ops, KERN_ERR
+			 "missing sysfs attribute operations for kobject: %s\n",
+			 kobject_name(kobj)))
 			goto err_out;
+
+		has_read = ops->show;
+		has_write = ops->store;
 	}
 
-	/* No error? Great, allocate a buffer for the file, and store it
-	 * it in file->private_data for easy access.
-	 */
+	/* check perms and supported operations */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (!(inode->i_mode & S_IWUGO) || !has_write))
+		goto err_out;
+
+	if ((file->f_mode & FMODE_READ) &&
+	    (!(inode->i_mode & S_IRUGO) || !has_read))
+		goto err_out;
+
+	/* allocate a sysfs_open_file for the file */
 	error = -ENOMEM;
-	buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
-	if (!buffer)
+	of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
+	if (!of)
 		goto err_out;
 
-	mutex_init(&buffer->mutex);
-	buffer->needs_read_fill = 1;
-	buffer->ops = ops;
-	file->private_data = buffer;
+	mutex_init(&of->mutex);
+	of->sd = attr_sd;
+	of->file = file;
 
-	/* make sure we have open dirent struct */
-	error = sysfs_get_open_dirent(attr_sd, buffer);
+	/*
+	 * Always instantiate seq_file even if read access doesn't use
+	 * seq_file or is not requested.  This unifies private data access
+	 * and readable regular files are the vast majority anyway.
+	 */
+	if (sysfs_is_bin(attr_sd))
+		error = single_open(file, NULL, of);
+	else
+		error = single_open(file, sysfs_seq_show, of);
 	if (error)
 		goto err_free;
 
+	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
+	if (file->f_mode & FMODE_WRITE)
+		file->f_mode |= FMODE_PWRITE;
+
+	/* make sure we have open dirent struct */
+	error = sysfs_get_open_dirent(attr_sd, of);
+	if (error)
+		goto err_close;
+
 	/* open succeeded, put active references */
 	sysfs_put_active(attr_sd);
 	return 0;
 
- err_free:
-	kfree(buffer);
- err_out:
+err_close:
+	single_release(inode, file);
+err_free:
+	kfree(of);
+err_out:
 	sysfs_put_active(attr_sd);
 	return error;
 }
@@ -392,17 +690,41 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 static int sysfs_release(struct inode *inode, struct file *filp)
 {
 	struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_buffer *buffer = filp->private_data;
+	struct sysfs_open_file *of = sysfs_of(filp);
 
-	sysfs_put_open_dirent(sd, buffer);
-
-	if (buffer->page)
-		free_page((unsigned long)buffer->page);
-	kfree(buffer);
+	sysfs_put_open_dirent(sd, of);
+	single_release(inode, filp);
+	kfree(of);
 
 	return 0;
 }
 
+void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
+{
+	struct sysfs_open_dirent *od;
+	struct sysfs_open_file *of;
+
+	if (!sysfs_is_bin(sd))
+		return;
+
+	spin_lock_irq(&sysfs_open_dirent_lock);
+	od = sd->s_attr.open;
+	if (od)
+		atomic_inc(&od->refcnt);
+	spin_unlock_irq(&sysfs_open_dirent_lock);
+	if (!od)
+		return;
+
+	mutex_lock(&sysfs_open_file_mutex);
+	list_for_each_entry(of, &od->files, list) {
+		struct inode *inode = file_inode(of->file);
+		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+	}
+	mutex_unlock(&sysfs_open_file_mutex);
+
+	sysfs_put_open_dirent(sd, NULL);
+}
+
 /* Sysfs attribute files are pollable.  The idea is that you read
  * the content and then you use 'poll' or 'select' to wait for
  * the content to change.  When the content changes (assuming the
@@ -418,7 +740,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
  */
 static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 {
-	struct sysfs_buffer *buffer = filp->private_data;
+	struct sysfs_open_file *of = sysfs_of(filp);
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
@@ -430,13 +752,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 
 	sysfs_put_active(attr_sd);
 
-	if (buffer->event != atomic_read(&od->event))
+	if (of->event != atomic_read(&od->event))
 		goto trigger;
 
 	return DEFAULT_POLLMASK;
 
  trigger:
-	buffer->needs_read_fill = 1;
 	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
 }
 
@@ -466,9 +787,9 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);
 
 	if (sd && dir)
-		sd = sysfs_find_dirent(sd, NULL, dir);
+		sd = sysfs_find_dirent(sd, dir, NULL);
 	if (sd && attr)
-		sd = sysfs_find_dirent(sd, NULL, attr);
+		sd = sysfs_find_dirent(sd, attr, NULL);
 	if (sd)
 		sysfs_notify_dirent(sd);
 
@@ -477,7 +798,7 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 EXPORT_SYMBOL_GPL(sysfs_notify);
 
 const struct file_operations sysfs_file_operations = {
-	.read		= sysfs_read_file,
+	.read		= seq_read,
 	.write		= sysfs_write_file,
 	.llseek		= generic_file_llseek,
 	.open		= sysfs_open_file,
@@ -485,58 +806,25 @@ const struct file_operations sysfs_file_operations = {
 	.poll		= sysfs_poll,
 };
 
-static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
-			 const void **pns)
-{
-	struct sysfs_dirent *dir_sd = kobj->sd;
-	const struct sysfs_ops *ops;
-	const void *ns = NULL;
-	int err;
-
-	if (!dir_sd) {
-		WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
-			kobject_name(kobj));
-		return -ENOENT;
-	}
-
-	err = 0;
-	if (!sysfs_ns_type(dir_sd))
-		goto out;
-
-	err = -EINVAL;
-	if (!kobj->ktype)
-		goto out;
-	ops = kobj->ktype->sysfs_ops;
-	if (!ops)
-		goto out;
-	if (!ops->namespace)
-		goto out;
-
-	err = 0;
-	ns = ops->namespace(kobj, attr);
-out:
-	if (err) {
-		WARN(1, KERN_ERR
-		     "missing sysfs namespace attribute operation for kobject: %s\n",
-		     kobject_name(kobj));
-	}
-	*pns = ns;
-	return err;
-}
+const struct file_operations sysfs_bin_operations = {
+	.read		= sysfs_bin_read,
+	.write		= sysfs_write_file,
+	.llseek		= generic_file_llseek,
+	.mmap		= sysfs_bin_mmap,
+	.open		= sysfs_open_file,
+	.release	= sysfs_release,
+	.poll		= sysfs_poll,
+};
 
-int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
-			const struct attribute *attr, int type, umode_t amode)
+int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
+			   const struct attribute *attr, int type,
+			   umode_t amode, const void *ns)
 {
 	umode_t mode = (amode & S_IALLUGO) | S_IFREG;
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
-	const void *ns;
 	int rc;
 
-	rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
-	if (rc)
-		return rc;
-
 	sd = sysfs_new_dirent(attr->name, mode, type);
 	if (!sd)
 		return -ENOMEM;
@@ -545,8 +833,8 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 	sd->s_attr.attr = (void *)attr;
 	sysfs_dirent_init_lockdep(sd);
 
-	sysfs_addrm_start(&acxt, dir_sd);
-	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_start(&acxt);
+	rc = sysfs_add_one(&acxt, sd, dir_sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (rc)
@@ -559,23 +847,25 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 		   int type)
 {
-	return sysfs_add_file_mode(dir_sd, attr, type, attr->mode);
+	return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL);
 }
 
-
 /**
- *	sysfs_create_file - create an attribute file for an object.
- *	@kobj:	object we're creating for.
- *	@attr:	attribute descriptor.
+ * sysfs_create_file_ns - create an attribute file for an object with custom ns
+ * @kobj: object we're creating for
+ * @attr: attribute descriptor
+ * @ns: namespace the new file should belong to
  */
-int sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
+int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
+			 const void *ns)
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
+	return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR,
+				      attr->mode, ns);
 
 }
-EXPORT_SYMBOL_GPL(sysfs_create_file);
+EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
 
 int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
 {
@@ -604,7 +894,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	int error;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 
@@ -630,17 +920,12 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 {
 	struct sysfs_dirent *sd;
 	struct iattr newattrs;
-	const void *ns;
 	int rc;
 
-	rc = sysfs_attr_ns(kobj, attr, &ns);
-	if (rc)
-		return rc;
-
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
 	if (!sd)
 		goto out;
 
@@ -655,22 +940,21 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 /**
- *	sysfs_remove_file - remove an object attribute.
- *	@kobj:	object we're acting for.
- *	@attr:	attribute descriptor.
+ * sysfs_remove_file_ns - remove an object attribute with a custom ns tag
+ * @kobj: object we're acting for
+ * @attr: attribute descriptor
+ * @ns: namespace tag of the file to remove
  *
- *	Hash the attribute name and kill the victim.
+ * Hash the attribute name and namespace tag and kill the victim.
  */
-void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
+void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
+			  const void *ns)
 {
-	const void *ns;
-
-	if (sysfs_attr_ns(kobj, attr, &ns))
-		return;
+	struct sysfs_dirent *dir_sd = kobj->sd;
 
-	sysfs_hash_and_remove(kobj->sd, ns, attr->name);
+	sysfs_hash_and_remove(dir_sd, attr->name, ns);
 }
-EXPORT_SYMBOL_GPL(sysfs_remove_file);
+EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
 
 void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
 {
@@ -692,16 +976,42 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, NULL, attr->name);
+		sysfs_hash_and_remove(dir_sd, attr->name, NULL);
 		sysfs_put(dir_sd);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
 
+/**
+ *	sysfs_create_bin_file - create binary file for object.
+ *	@kobj:	object.
+ *	@attr:	attribute descriptor.
+ */
+int sysfs_create_bin_file(struct kobject *kobj,
+			  const struct bin_attribute *attr)
+{
+	BUG_ON(!kobj || !kobj->sd || !attr);
+
+	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
+}
+EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
+
+/**
+ *	sysfs_remove_bin_file - remove binary file for object.
+ *	@kobj:	object.
+ *	@attr:	attribute descriptor.
+ */
+void sysfs_remove_bin_file(struct kobject *kobj,
+			   const struct bin_attribute *attr)
+{
+	sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
+
 struct sysfs_schedule_callback_struct {
 	struct list_head	workq_list;
 	struct kobject		*kobj;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 5f92cd2f61c1..1898a10e38ce 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -26,7 +26,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 
 	if (grp->attrs)
 		for (attr = grp->attrs; *attr; attr++)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
 			sysfs_remove_bin_file(kobj, *bin_attr);
@@ -49,16 +49,17 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 * re-adding (if required) the file.
 			 */
 			if (update)
-				sysfs_hash_and_remove(dir_sd, NULL,
-						      (*attr)->name);
+				sysfs_hash_and_remove(dir_sd, (*attr)->name,
+						      NULL);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
 				if (!mode)
 					continue;
 			}
-			error = sysfs_add_file_mode(dir_sd, *attr,
-						    SYSFS_KOBJ_ATTR,
-						    (*attr)->mode | mode);
+			error = sysfs_add_file_mode_ns(dir_sd, *attr,
+						       SYSFS_KOBJ_ATTR,
+						       (*attr)->mode | mode,
+						       NULL);
 			if (unlikely(error))
 				break;
 		}
@@ -110,7 +111,7 @@ static int internal_create_group(struct kobject *kobj, int update,
 	error = create_files(sd, kobj, grp, update);
 	if (error) {
 		if (grp->name)
-			sysfs_remove_subdir(sd);
+			sysfs_remove(sd);
 	}
 	sysfs_put(sd);
 	return error;
@@ -206,7 +207,7 @@ void sysfs_remove_group(struct kobject *kobj,
 	struct sysfs_dirent *sd;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
+		sd = sysfs_get_dirent(dir_sd, grp->name);
 		if (!sd) {
 			WARN(!sd, KERN_WARNING
 			     "sysfs group %p not found for kobject '%s'\n",
@@ -218,7 +219,7 @@ void sysfs_remove_group(struct kobject *kobj,
 
 	remove_files(sd, kobj, grp);
 	if (grp->name)
-		sysfs_remove_subdir(sd);
+		sysfs_remove(sd);
 
 	sysfs_put(sd);
 }
@@ -261,7 +262,7 @@ int sysfs_merge_group(struct kobject *kobj,
 	struct attribute *const *attr;
 	int i;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
 	if (!dir_sd)
 		return -ENOENT;
 
@@ -269,7 +270,7 @@ int sysfs_merge_group(struct kobject *kobj,
 		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
 	if (error) {
 		while (--i >= 0)
-			sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
+			sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL);
 	}
 	sysfs_put(dir_sd);
 
@@ -288,10 +289,10 @@ void sysfs_unmerge_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 	struct attribute *const *attr;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
 	if (dir_sd) {
 		for (attr = grp->attrs; *attr; ++attr)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
 		sysfs_put(dir_sd);
 	}
 }
@@ -310,7 +311,7 @@ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
 	struct sysfs_dirent *dir_sd;
 	int error = 0;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
 	if (!dir_sd)
 		return -ENOENT;
 
@@ -332,9 +333,9 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 {
 	struct sysfs_dirent *dir_sd;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, NULL, link_name);
+		sysfs_hash_and_remove(dir_sd, link_name, NULL);
 		sysfs_put(dir_sd);
 	}
 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 963f910c8034..1750f790af3b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -258,9 +258,9 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 		inode->i_fop = &sysfs_file_operations;
 		break;
 	case SYSFS_KOBJ_BIN_ATTR:
-		bin_attr = sd->s_bin_attr.bin_attr;
+		bin_attr = sd->s_attr.bin_attr;
 		inode->i_size = bin_attr->size;
-		inode->i_fop = &bin_fops;
+		inode->i_fop = &sysfs_bin_operations;
 		break;
 	case SYSFS_KOBJ_LINK:
 		inode->i_op = &sysfs_symlink_inode_operations;
@@ -314,32 +314,6 @@ void sysfs_evict_inode(struct inode *inode)
 	sysfs_put(sd);
 }
 
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
-			  const char *name)
-{
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-
-	if (!dir_sd) {
-		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
-			name);
-		return -ENOENT;
-	}
-
-	sysfs_addrm_start(&acxt, dir_sd);
-
-	sd = sysfs_find_dirent(dir_sd, ns, name);
-	if (sd)
-		sysfs_remove_one(&acxt, sd);
-
-	sysfs_addrm_finish(&acxt);
-
-	if (sd)
-		return 0;
-	else
-		return -ENOENT;
-}
-
 int sysfs_permission(struct inode *inode, int mask)
 {
 	struct sysfs_dirent *sd;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 2dd4507d9edd..3ae3f1bf1a09 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -33,13 +33,15 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 
 	BUG_ON(!name || !parent_sd);
 
-	/* target->sd can go away beneath us but is protected with
-	 * sysfs_assoc_lock.  Fetch target_sd from it.
+	/*
+	 * We don't own @target and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock.  See
+	 * sysfs_remove_dir() for details.
 	 */
-	spin_lock(&sysfs_assoc_lock);
+	spin_lock(&sysfs_symlink_target_lock);
 	if (target->sd)
 		target_sd = sysfs_get(target->sd);
-	spin_unlock(&sysfs_assoc_lock);
+	spin_unlock(&sysfs_symlink_target_lock);
 
 	error = -ENOENT;
 	if (!target_sd)
@@ -52,18 +54,18 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 
 	ns_type = sysfs_ns_type(parent_sd);
 	if (ns_type)
-		sd->s_ns = target->ktype->namespace(target);
+		sd->s_ns = target_sd->s_ns;
 	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
-	sysfs_addrm_start(&acxt, parent_sd);
+	sysfs_addrm_start(&acxt);
 	/* Symlinks must be between directories with the same ns_type */
 	if (!ns_type ||
 	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
 		if (warn)
-			error = sysfs_add_one(&acxt, sd);
+			error = sysfs_add_one(&acxt, sd, parent_sd);
 		else
-			error = __sysfs_add_one(&acxt, sd);
+			error = __sysfs_add_one(&acxt, sd, parent_sd);
 	} else {
 		error = -EINVAL;
 		WARN(1, KERN_WARNING
@@ -155,11 +157,17 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 			const char *name)
 {
 	const void *ns = NULL;
-	spin_lock(&sysfs_assoc_lock);
+
+	/*
+	 * We don't own @target and it may be removed at any time.
+	 * Synchronize using sysfs_symlink_target_lock.  See
+	 * sysfs_remove_dir() for details.
+	 */
+	spin_lock(&sysfs_symlink_target_lock);
 	if (targ->sd && sysfs_ns_type(kobj->sd))
 		ns = targ->sd->s_ns;
-	spin_unlock(&sysfs_assoc_lock);
-	sysfs_hash_and_remove(kobj->sd, ns, name);
+	spin_unlock(&sysfs_symlink_target_lock);
+	sysfs_hash_and_remove(kobj->sd, name, ns);
 }
 
 /**
@@ -176,24 +184,25 @@ void sysfs_remove_link(struct kobject *kobj, const char *name)
 	else
 		parent_sd = kobj->sd;
 
-	sysfs_hash_and_remove(parent_sd, NULL, name);
+	sysfs_hash_and_remove(parent_sd, name, NULL);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
 
 /**
- *	sysfs_rename_link - rename symlink in object's directory.
+ *	sysfs_rename_link_ns - rename symlink in object's directory.
  *	@kobj:	object we're acting for.
  *	@targ:	object we're pointing to.
  *	@old:	previous name of the symlink.
  *	@new:	new name of the symlink.
+ *	@new_ns: new namespace of the symlink.
  *
  *	A helper function for the common rename symlink idiom.
  */
-int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
-			const char *old, const char *new)
+int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
+			 const char *old, const char *new, const void *new_ns)
 {
 	struct sysfs_dirent *parent_sd, *sd = NULL;
-	const void *old_ns = NULL, *new_ns = NULL;
+	const void *old_ns = NULL;
 	int result;
 
 	if (!kobj)
@@ -205,7 +214,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
 		old_ns = targ->sd->s_ns;
 
 	result = -ENOENT;
-	sd = sysfs_get_dirent(parent_sd, old_ns, old);
+	sd = sysfs_get_dirent_ns(parent_sd, old, old_ns);
 	if (!sd)
 		goto out;
 
@@ -215,16 +224,13 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
 	if (sd->s_symlink.target_sd->s_dir.kobj != targ)
 		goto out;
 
-	if (sysfs_ns_type(parent_sd))
-		new_ns = targ->ktype->namespace(targ);
-
-	result = sysfs_rename(sd, parent_sd, new_ns, new);
+	result = sysfs_rename(sd, parent_sd, new, new_ns);
 
 out:
 	sysfs_put(sd);
 	return result;
 }
-EXPORT_SYMBOL_GPL(sysfs_rename_link);
+EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
 
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
 				 struct sysfs_dirent *target_sd, char *path)
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index b6deca3e301d..0af09fbfb3f6 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -29,15 +29,13 @@ struct sysfs_elem_symlink {
 };
 
 struct sysfs_elem_attr {
-	struct attribute	*attr;
+	union {
+		struct attribute	*attr;
+		struct bin_attribute	*bin_attr;
+	};
 	struct sysfs_open_dirent *open;
 };
 
-struct sysfs_elem_bin_attr {
-	struct bin_attribute	*bin_attr;
-	struct hlist_head	buffers;
-};
-
 struct sysfs_inode_attrs {
 	struct iattr	ia_iattr;
 	void		*ia_secdata;
@@ -74,7 +72,6 @@ struct sysfs_dirent {
 		struct sysfs_elem_dir		s_dir;
 		struct sysfs_elem_symlink	s_symlink;
 		struct sysfs_elem_attr		s_attr;
-		struct sysfs_elem_bin_attr	s_bin_attr;
 	};
 
 	unsigned short		s_flags;
@@ -115,6 +112,7 @@ static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+
 #define sysfs_dirent_init_lockdep(sd)				\
 do {								\
 	struct attribute *attr = sd->s_attr.attr;		\
@@ -124,15 +122,31 @@ do {								\
 								\
 	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
 } while (0)
+
+/* Test for attributes that want to ignore lockdep for read-locking */
+static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
+{
+	int type = sysfs_type(sd);
+
+	return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
+		sd->s_attr.attr->ignore_lockdep;
+}
+
 #else
+
 #define sysfs_dirent_init_lockdep(sd) do {} while (0)
+
+static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
+{
+	return true;
+}
+
 #endif
 
 /*
  * Context structure to be used while adding/removing nodes.
  */
 struct sysfs_addrm_cxt {
-	struct sysfs_dirent	*parent_sd;
 	struct sysfs_dirent	*removed;
 };
 
@@ -156,38 +170,37 @@ extern struct kmem_cache *sysfs_dir_cachep;
  * dir.c
  */
 extern struct mutex sysfs_mutex;
-extern spinlock_t sysfs_assoc_lock;
+extern spinlock_t sysfs_symlink_target_lock;
 extern const struct dentry_operations sysfs_dentry_ops;
 
 extern const struct file_operations sysfs_dir_operations;
 extern const struct inode_operations sysfs_dir_inode_operations;
 
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
 void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
-		       struct sysfs_dirent *parent_sd);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
-void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
+void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
+void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		    struct sysfs_dirent *parent_sd);
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
+		  struct sysfs_dirent *parent_sd);
+void sysfs_remove(struct sysfs_dirent *sd);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
+			  const void *ns);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const void *ns,
-				       const unsigned char *name);
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-				      const void *ns,
-				      const unsigned char *name);
+				       const unsigned char *name,
+				       const void *ns);
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
 
 void release_sysfs_dirent(struct sysfs_dirent *sd);
 
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd);
-void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
 int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const void *ns, const char *new_name);
+		 const char *new_name, const void *new_ns);
 
 static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
@@ -218,25 +231,21 @@ int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
-			  const char *name);
 int sysfs_inode_init(void);
 
 /*
  * file.c
  */
 extern const struct file_operations sysfs_file_operations;
+extern const struct file_operations sysfs_bin_operations;
 
 int sysfs_add_file(struct sysfs_dirent *dir_sd,
 		   const struct attribute *attr, int type);
 
-int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
-			const struct attribute *attr, int type, umode_t amode);
-/*
- * bin.c
- */
-extern const struct file_operations bin_fops;
-void unmap_bin_file(struct sysfs_dirent *attr_sd);
+int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
+			   const struct attribute *attr, int type,
+			   umode_t amode, const void *ns);
+void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
 
 /*
  * symlink.c
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index d0c6a007ce83..eda10959714f 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -487,6 +487,7 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_sb = sb;
 	sbi->s_block_base = 0;
 	sbi->s_type = FSTYPE_V7;
+	mutex_init(&sbi->s_lock);
 	sb->s_fs_info = sbi;
 	
 	sb_set_blocksize(sb, 512);
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 7e5aae4bf46f..6eaf5edf1ea1 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -30,18 +30,17 @@ void udf_free_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
 
-	mutex_lock(&sbi->s_alloc_mutex);
-	if (sbi->s_lvid_bh) {
-		struct logicalVolIntegrityDescImpUse *lvidiu =
-							udf_sb_lvidiu(sbi);
+	if (lvidiu) {
+		mutex_lock(&sbi->s_alloc_mutex);
 		if (S_ISDIR(inode->i_mode))
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
 		udf_updated_lvid(sb);
+		mutex_unlock(&sbi->s_alloc_mutex);
 	}
-	mutex_unlock(&sbi->s_alloc_mutex);
 
 	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
@@ -55,6 +54,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
+	struct logicalVolIntegrityDescImpUse *lvidiu;
 
 	inode = new_inode(sb);
 
@@ -92,12 +92,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 		return NULL;
 	}
 
-	if (sbi->s_lvid_bh) {
-		struct logicalVolIntegrityDescImpUse *lvidiu;
-
+	lvidiu = udf_sb_lvidiu(sb);
+	if (lvidiu) {
 		iinfo->i_unique = lvid_get_unique_id(sb);
 		mutex_lock(&sbi->s_alloc_mutex);
-		lvidiu = udf_sb_lvidiu(sbi);
 		if (S_ISDIR(mode))
 			le32_add_cpu(&lvidiu->numDirs, 1);
 		else
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 839a2bad7f45..91219385691d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -94,13 +94,25 @@ static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct dentry *, struct kstatfs *);
 static int udf_show_options(struct seq_file *, struct dentry *);
 
-struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
+struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
 {
-	struct logicalVolIntegrityDesc *lvid =
-		(struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
-	__u32 number_of_partitions = le32_to_cpu(lvid->numOfPartitions);
-	__u32 offset = number_of_partitions * 2 *
-				sizeof(uint32_t)/sizeof(uint8_t);
+	struct logicalVolIntegrityDesc *lvid;
+	unsigned int partnum;
+	unsigned int offset;
+
+	if (!UDF_SB(sb)->s_lvid_bh)
+		return NULL;
+	lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
+	partnum = le32_to_cpu(lvid->numOfPartitions);
+	if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) -
+	     offsetof(struct logicalVolIntegrityDesc, impUse)) /
+	     (2 * sizeof(uint32_t)) < partnum) {
+		udf_err(sb, "Logical volume integrity descriptor corrupted "
+			"(numOfPartitions = %u)!\n", partnum);
+		return NULL;
+	}
+	/* The offset is to skip freeSpaceTable and sizeTable arrays */
+	offset = partnum * 2 * sizeof(uint32_t);
 	return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]);
 }
 
@@ -629,9 +641,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	struct udf_options uopt;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int error = 0;
+	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
 
-	if (sbi->s_lvid_bh) {
-		int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
+	if (lvidiu) {
+		int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
 		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
 			return -EACCES;
 	}
@@ -1905,11 +1918,12 @@ static void udf_open_lvid(struct super_block *sb)
 
 	if (!bh)
 		return;
-
-	mutex_lock(&sbi->s_alloc_mutex);
 	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
-	lvidiu = udf_sb_lvidiu(sbi);
+	lvidiu = udf_sb_lvidiu(sb);
+	if (!lvidiu)
+		return;
 
+	mutex_lock(&sbi->s_alloc_mutex);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
 	udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
@@ -1937,10 +1951,12 @@ static void udf_close_lvid(struct super_block *sb)
 
 	if (!bh)
 		return;
+	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+	lvidiu = udf_sb_lvidiu(sb);
+	if (!lvidiu)
+		return;
 
 	mutex_lock(&sbi->s_alloc_mutex);
-	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
-	lvidiu = udf_sb_lvidiu(sbi);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
 	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
@@ -2093,15 +2109,19 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
 	if (sbi->s_lvid_bh) {
 		struct logicalVolIntegrityDescImpUse *lvidiu =
-							udf_sb_lvidiu(sbi);
-		uint16_t minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev);
-		uint16_t minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev);
-		/* uint16_t maxUDFWriteRev =
-				le16_to_cpu(lvidiu->maxUDFWriteRev); */
+							udf_sb_lvidiu(sb);
+		uint16_t minUDFReadRev;
+		uint16_t minUDFWriteRev;
 
+		if (!lvidiu) {
+			ret = -EINVAL;
+			goto error_out;
+		}
+		minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev);
+		minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev);
 		if (minUDFReadRev > UDF_MAX_READ_VERSION) {
 			udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
-				le16_to_cpu(lvidiu->minUDFReadRev),
+				minUDFReadRev,
 				UDF_MAX_READ_VERSION);
 			ret = -EINVAL;
 			goto error_out;
@@ -2265,11 +2285,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct logicalVolIntegrityDescImpUse *lvidiu;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-	if (sbi->s_lvid_bh != NULL)
-		lvidiu = udf_sb_lvidiu(sbi);
-	else
-		lvidiu = NULL;
-
+	lvidiu = udf_sb_lvidiu(sb);
 	buf->f_type = UDF_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index ed401e94aa8c..1f32c7bd9f57 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -162,7 +162,7 @@ static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi);
+struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb);
 
 int udf_compute_nr_groups(struct super_block *sb, u32 partition);
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 88c5ea75ebf6..f1d85cfc0a54 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -628,6 +628,7 @@ xfs_buf_item_unlock(
 		else if (aborted) {
 			ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
 			if (lip->li_flags & XFS_LI_IN_AIL) {
+				spin_lock(&lip->li_ailp->xa_lock);
 				xfs_trans_ail_delete(lip->li_ailp, lip,
 						     SHUTDOWN_LOG_IO_ERROR);
 			}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 069537c845e5..20bf8e8002d6 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1224,6 +1224,7 @@ xfs_da3_node_toosmall(
 	/* start with smaller blk num */
 	forward = nodehdr.forw < nodehdr.back;
 	for (i = 0; i < 2; forward = !forward, i++) {
+		struct xfs_da3_icnode_hdr thdr;
 		if (forward)
 			blkno = nodehdr.forw;
 		else
@@ -1236,10 +1237,10 @@ xfs_da3_node_toosmall(
 			return(error);
 
 		node = bp->b_addr;
-		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		xfs_da3_node_hdr_from_disk(&thdr, node);
 		xfs_trans_brelse(state->args->trans, bp);
 
-		if (count - nodehdr.count >= 0)
+		if (count - thdr.count >= 0)
 			break;	/* fits with at least 25% to spare */
 	}
 	if (i >= 2) {
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 0957aa98b6c0..12dad188939d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1158,7 +1158,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for .
 	 */
-	dep = xfs_dir3_data_dot_entry_p(hdr);
+	dep = xfs_dir3_data_dot_entry_p(mp, hdr);
 	dep->inumber = cpu_to_be64(dp->i_ino);
 	dep->namelen = 1;
 	dep->name[0] = '.';
@@ -1172,7 +1172,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for ..
 	 */
-	dep = xfs_dir3_data_dotdot_entry_p(hdr);
+	dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
 	dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
@@ -1183,7 +1183,7 @@ xfs_dir2_sf_to_block(
 	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
 	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 				(char *)dep - (char *)hdr));
-	offset = xfs_dir3_data_first_offset(hdr);
+	offset = xfs_dir3_data_first_offset(mp);
 	/*
 	 * Loop over existing entries, stuff them in.
 	 */
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index a0961a61ac1a..9cf67381adf6 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -497,69 +497,58 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
 /*
  * Offsets of . and .. in data space (always block 0)
  *
- * The macros are used for shortform directories as they have no headers to read
- * the magic number out of. Shortform directories need to know the size of the
- * data block header because the sfe embeds the block offset of the entry into
- * it so that it doesn't change when format conversion occurs. Bad Things Happen
- * if we don't follow this rule.
- *
  * XXX: there is scope for significant optimisation of the logic here. Right
  * now we are checking for "dir3 format" over and over again. Ideally we should
  * only do it once for each operation.
  */
-#define	XFS_DIR3_DATA_DOT_OFFSET(mp)	\
-	xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
-#define	XFS_DIR3_DATA_DOTDOT_OFFSET(mp)	\
-	(XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1))
-#define	XFS_DIR3_DATA_FIRST_OFFSET(mp)		\
-	(XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2))
-
 static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dot_offset(struct xfs_mount *mp)
 {
-	return xfs_dir3_data_entry_offset(hdr);
+	return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
 }
 
 static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
 {
-	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-	return xfs_dir3_data_dot_offset(hdr) +
-		__xfs_dir3_data_entsize(dir3, 1);
+	return xfs_dir3_data_dot_offset(mp) +
+		xfs_dir3_data_entsize(mp, 1);
 }
 
 static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_first_offset(struct xfs_mount *mp)
 {
-	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
-		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
-	return xfs_dir3_data_dotdot_offset(hdr) +
-		__xfs_dir3_data_entsize(dir3, 2);
+	return xfs_dir3_data_dotdot_offset(mp) +
+		xfs_dir3_data_entsize(mp, 2);
 }
 
 /*
  * location of . and .. in data space (always block 0)
  */
 static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dot_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dot_offset(hdr));
+		((char *)hdr + xfs_dir3_data_dot_offset(mp));
 }
 
 static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dotdot_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_dotdot_offset(hdr));
+		((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
 }
 
 static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_first_entry_p(
+	struct xfs_mount	*mp,
+	struct xfs_dir2_data_hdr *hdr)
 {
 	return (struct xfs_dir2_data_entry *)
-		((char *)hdr + xfs_dir3_data_first_offset(hdr));
+		((char *)hdr + xfs_dir3_data_first_offset(mp));
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8993ec17452c..8f84153e98a8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -119,9 +119,9 @@ xfs_dir2_sf_getdents(
 	 * mp->m_dirdatablk.
 	 */
 	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-					     XFS_DIR3_DATA_DOT_OFFSET(mp));
+					     xfs_dir3_data_dot_offset(mp));
 	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
+						xfs_dir3_data_dotdot_offset(mp));
 
 	/*
 	 * Put . entry unless we're starting past it.
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index bb6e2848f473..3ef6d402084c 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -557,7 +557,7 @@ xfs_dir2_sf_addname_hard(
 	 * to insert the new entry.
 	 * If it's going to end up at the end then oldsfep will point there.
 	 */
-	for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp),
+	for (offset = xfs_dir3_data_first_offset(mp),
 	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
 	      add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
 	      eof = (char *)oldsfep == &buf[old_isize];
@@ -640,7 +640,7 @@ xfs_dir2_sf_addname_pick(
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	size = xfs_dir3_data_entsize(mp, args->namelen);
-	offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+	offset = xfs_dir3_data_first_offset(mp);
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	holefit = 0;
 	/*
@@ -713,7 +713,7 @@ xfs_dir2_sf_check(
 	mp = dp->i_mount;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+	offset = xfs_dir3_data_first_offset(mp);
 	ino = xfs_dir2_sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 71520e6e5d65..1ee776d477c3 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -64,7 +64,8 @@ int xfs_dqerror_mod = 33;
 struct kmem_zone		*xfs_qm_dqtrxzone;
 static struct kmem_zone		*xfs_qm_dqzone;
 
-static struct lock_class_key xfs_dquot_other_class;
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
 
 /*
  * This is called to free all the memory associated with a dquot
@@ -703,8 +704,20 @@ xfs_qm_dqread(
 	 * Make sure group quotas have a different lock class than user
 	 * quotas.
 	 */
-	if (!(type & XFS_DQ_USER))
-		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+	switch (type) {
+	case XFS_DQ_USER:
+		/* uses the default lock class */
+		break;
+	case XFS_DQ_GROUP:
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+		break;
+	case XFS_DQ_PROJ:
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
 
 	XFS_STATS_INC(xs_qm_dquot);
 
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 1edb5cc3e5f4..18272c766a50 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -515,7 +515,7 @@ typedef struct xfs_swapext
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
-#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
+#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_fs_eofblocks)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 193206ba4358..474807a401c8 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -119,11 +119,6 @@ xfs_inode_free(
 		ip->i_itemp = NULL;
 	}
 
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-
 	/*
 	 * Because we use RCU freeing we need to ensure the inode always
 	 * appears to be reclaimed with an invalid inode number when in the
@@ -135,6 +130,10 @@ xfs_inode_free(
 	ip->i_ino = 0;
 	spin_unlock(&ip->i_flags_lock);
 
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!xfs_isiflocked(ip));
+
 	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 }
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index dabda9521b4b..39797490a1f1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1585,6 +1585,7 @@ xlog_recover_add_to_trans(
 		"bad number of regions (%d) in inode log format",
 				  in_f->ilf_size);
 			ASSERT(0);
+			kmem_free(ptr);
 			return XFS_ERROR(EIO);
 		}
 
@@ -1970,6 +1971,13 @@ xlog_recover_do_inode_buffer(
  * magic number.  If we don't recognise the magic number in the buffer, then
  * return a LSN of -1 so that the caller knows it was an unrecognised block and
  * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
  */
 static xfs_lsn_t
 xlog_recover_get_buf_lsn(
@@ -1980,6 +1988,8 @@ xlog_recover_get_buf_lsn(
 	__uint16_t		magic16;
 	__uint16_t		magicda;
 	void			*blk = bp->b_addr;
+	uuid_t			*uuid;
+	xfs_lsn_t		lsn = -1;
 
 	/* v4 filesystems always recover immediately */
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -1992,43 +2002,79 @@ xlog_recover_get_buf_lsn(
 	case XFS_ABTB_MAGIC:
 	case XFS_ABTC_MAGIC:
 	case XFS_IBT_CRC_MAGIC:
-	case XFS_IBT_MAGIC:
-		return be64_to_cpu(
-				((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
+	case XFS_IBT_MAGIC: {
+		struct xfs_btree_block *btb = blk;
+
+		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+		uuid = &btb->bb_u.s.bb_uuid;
+		break;
+	}
 	case XFS_BMAP_CRC_MAGIC:
-	case XFS_BMAP_MAGIC:
-		return be64_to_cpu(
-				((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
+	case XFS_BMAP_MAGIC: {
+		struct xfs_btree_block *btb = blk;
+
+		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+		uuid = &btb->bb_u.l.bb_uuid;
+		break;
+	}
 	case XFS_AGF_MAGIC:
-		return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+		uuid = &((struct xfs_agf *)blk)->agf_uuid;
+		break;
 	case XFS_AGFL_MAGIC:
-		return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+		break;
 	case XFS_AGI_MAGIC:
-		return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+		uuid = &((struct xfs_agi *)blk)->agi_uuid;
+		break;
 	case XFS_SYMLINK_MAGIC:
-		return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+		break;
 	case XFS_DIR3_BLOCK_MAGIC:
 	case XFS_DIR3_DATA_MAGIC:
 	case XFS_DIR3_FREE_MAGIC:
-		return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+		break;
 	case XFS_ATTR3_RMT_MAGIC:
-		return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+		lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+		uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+		break;
 	case XFS_SB_MAGIC:
-		return be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+		uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+		break;
 	default:
 		break;
 	}
 
+	if (lsn != (xfs_lsn_t)-1) {
+		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+			goto recover_immediately;
+		return lsn;
+	}
+
 	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
 	switch (magicda) {
 	case XFS_DIR3_LEAF1_MAGIC:
 	case XFS_DIR3_LEAFN_MAGIC:
 	case XFS_DA3_NODE_MAGIC:
-		return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+		break;
 	default:
 		break;
 	}
 
+	if (lsn != (xfs_lsn_t)-1) {
+		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+			goto recover_immediately;
+		return lsn;
+	}
+
 	/*
 	 * We do individual object checks on dquot and inode buffers as they
 	 * have their own individual LSN records. Also, we could have a stale