Linux-libre 4.2-gnu

author: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-09-08 01:01:14 -0300
committer: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-09-08 01:01:14 -0300
commit: e5fd91f1ef340da553f7a79da9540c3db711c937 (patch)
tree: b11842027dc6641da63f4bcc524f8678263304a3 /fs
parent: 2a9b0348e685a63d97486f6749622b61e9e3292f (diff)
490 files changed, 17340 insertions, 41977 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 620d93489..8aa56bb6e 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -320,31 +320,21 @@ fail_option_alloc:
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		  const char *dev_name, char *data)
 {
-	int retval = -EINVAL;
 	struct p9_fid *fid;
-	int rc;
+	int rc = -ENOMEM;
 
 	v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
 	if (!v9ses->uname)
-		return ERR_PTR(-ENOMEM);
+		goto err_names;
 
 	v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
-	if (!v9ses->aname) {
-		kfree(v9ses->uname);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!v9ses->aname)
+		goto err_names;
 	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
-	if (rc) {
-		kfree(v9ses->aname);
-		kfree(v9ses->uname);
-		return ERR_PTR(rc);
-	}
-
-	spin_lock(&v9fs_sessionlist_lock);
-	list_add(&v9ses->slist, &v9fs_sessionlist);
-	spin_unlock(&v9fs_sessionlist_lock);
+	if (rc)
+		goto err_names;
 
 	v9ses->uid = INVALID_UID;
 	v9ses->dfltuid = V9FS_DEFUID;
@@ -352,10 +342,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
-		retval = PTR_ERR(v9ses->clnt);
-		v9ses->clnt = NULL;
+		rc = PTR_ERR(v9ses->clnt);
 		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
-		goto error;
+		goto err_bdi;
 	}
 
 	v9ses->flags = V9FS_ACCESS_USER;
@@ -368,10 +357,8 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 
 	rc = v9fs_parse_options(v9ses, data);
-	if (rc < 0) {
-		retval = rc;
-		goto error;
-	}
+	if (rc < 0)
+		goto err_clnt;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -405,10 +392,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
 							v9ses->aname);
 	if (IS_ERR(fid)) {
-		retval = PTR_ERR(fid);
-		fid = NULL;
+		rc = PTR_ERR(fid);
 		p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
-		goto error;
+		goto err_clnt;
 	}
 
 	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
@@ -420,12 +406,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	/* register the session for caching */
 	v9fs_cache_session_get_cookie(v9ses);
 #endif
+	spin_lock(&v9fs_sessionlist_lock);
+	list_add(&v9ses->slist, &v9fs_sessionlist);
+	spin_unlock(&v9fs_sessionlist_lock);
 
 	return fid;
 
-error:
+err_clnt:
+	p9_client_destroy(v9ses->clnt);
+err_bdi:
 	bdi_destroy(&v9ses->bdi);
-	return ERR_PTR(retval);
+err_names:
+	kfree(v9ses->uname);
+	kfree(v9ses->aname);
+	return ERR_PTR(rc);
 }
 
 /**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index fb9ffcb43..0923f2cf3 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -149,8 +149,6 @@ extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
 extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
 extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry);
-extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			void *p);
 extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
 					 struct p9_fid *fid,
 					 struct super_block *sb, int new);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 53f1e8a21..b1dc51888 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1223,100 +1223,43 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
 }
 
 /**
- * v9fs_readlink - read a symlink's location (internal version)
+ * v9fs_vfs_follow_link - follow a symlink path
  * @dentry: dentry for symlink
- * @buffer: buffer to load symlink location into
- * @buflen: length of buffer
- *
+ * @cookie: place to pass the data to put_link()
  */
 
-static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
 {
-	int retval;
-
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
+	struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+	struct p9_fid *fid = v9fs_fid_lookup(dentry);
 	struct p9_wstat *st;
+	char *res;
+
+	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
-	p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
-	retval = -EPERM;
-	v9ses = v9fs_dentry2v9ses(dentry);
-	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
-		return PTR_ERR(fid);
+		return ERR_CAST(fid);
 
 	if (!v9fs_proto_dotu(v9ses))
-		return -EBADF;
+		return ERR_PTR(-EBADF);
 
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
-		return PTR_ERR(st);
+		return ERR_CAST(st);
 
 	if (!(st->mode & P9_DMSYMLINK)) {
-		retval = -EINVAL;
-		goto done;
+		p9stat_free(st);
+		kfree(st);
+		return ERR_PTR(-EINVAL);
 	}
+	res = st->extension;
+	st->extension = NULL;
+	if (strlen(res) >= PATH_MAX)
+		res[PATH_MAX - 1] = '\0';
 
-	/* copy extension buffer into buffer */
-	retval = min(strlen(st->extension)+1, (size_t)buflen);
-	memcpy(buffer, st->extension, retval);
-
-	p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
-		 dentry, st->extension, buflen, buffer);
-
-done:
 	p9stat_free(st);
 	kfree(st);
-	return retval;
-}
-
-/**
- * v9fs_vfs_follow_link - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-
-static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	int len = 0;
-	char *link = __getname();
-
-	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
-
-	if (!link)
-		link = ERR_PTR(-ENOMEM);
-	else {
-		len = v9fs_readlink(dentry, link, PATH_MAX);
-
-		if (len < 0) {
-			__putname(link);
-			link = ERR_PTR(len);
-		} else
-			link[min(len, PATH_MAX-1)] = 0;
-	}
-	nd_set_link(nd, link);
-
-	return NULL;
-}
-
-/**
- * v9fs_vfs_put_link - release a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- * @p: unused
- *
- */
-
-void
-v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-	char *s = nd_get_link(nd);
-
-	p9_debug(P9_DEBUG_VFS, " %pd %s\n",
-		 dentry, IS_ERR(s) ? "<error>" : s);
-	if (!IS_ERR(s))
-		__putname(s);
+	return *cookie = res;
 }
 
 /**
@@ -1369,6 +1312,8 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
 }
 
+#define U32_MAX_DIGITS 10
+
 /**
  * v9fs_vfs_link - create a hardlink
  * @old_dentry: dentry for file to link to
@@ -1382,7 +1327,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	      struct dentry *dentry)
 {
 	int retval;
-	char *name;
+	char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */
 	struct p9_fid *oldfid;
 
 	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
@@ -1392,20 +1337,12 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (IS_ERR(oldfid))
 		return PTR_ERR(oldfid);
 
-	name = __getname();
-	if (unlikely(!name)) {
-		retval = -ENOMEM;
-		goto clunk_fid;
-	}
-
 	sprintf(name, "%d\n", oldfid->fid);
 	retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
-	__putname(name);
 	if (!retval) {
 		v9fs_refresh_inode(oldfid, d_inode(old_dentry));
 		v9fs_invalidate_inode_attr(dir);
 	}
-clunk_fid:
 	p9_client_clunk(oldfid);
 	return retval;
 }
@@ -1424,7 +1361,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	int retval;
-	char *name;
+	char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1];
 	u32 perm;
 
 	p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
@@ -1434,26 +1371,16 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	name = __getname();
-	if (!name)
-		return -ENOMEM;
 	/* build extension */
 	if (S_ISBLK(mode))
 		sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISCHR(mode))
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
-	else if (S_ISFIFO(mode))
-		*name = 0;
-	else if (S_ISSOCK(mode))
+	else
 		*name = 0;
-	else {
-		__putname(name);
-		return -EINVAL;
-	}
 
 	perm = unixmode2p9mode(v9ses, mode);
 	retval = v9fs_vfs_mkspecial(dir, dentry, perm, name);
-	__putname(name);
 
 	return retval;
 }
@@ -1529,7 +1456,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
-	.put_link = v9fs_vfs_put_link,
+	.put_link = kfree_put_link,
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4d3ecfb55..e8aa57dc8 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -904,41 +904,24 @@ error:
 /**
  * v9fs_vfs_follow_link_dotl - follow a symlink path
  * @dentry: dentry for symlink
- * @nd: nameidata
- *
+ * @cookie: place to pass the data to put_link()
  */
 
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+static const char *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
 {
-	int retval;
-	struct p9_fid *fid;
-	char *link = __getname();
+	struct p9_fid *fid = v9fs_fid_lookup(dentry);
 	char *target;
+	int retval;
 
 	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
-	if (!link) {
-		link = ERR_PTR(-ENOMEM);
-		goto ndset;
-	}
-	fid = v9fs_fid_lookup(dentry);
-	if (IS_ERR(fid)) {
-		__putname(link);
-		link = ERR_CAST(fid);
-		goto ndset;
-	}
+	if (IS_ERR(fid))
+		return ERR_CAST(fid);
 	retval = p9_client_readlink(fid, &target);
-	if (!retval) {
-		strcpy(link, target);
-		kfree(target);
-		goto ndset;
-	}
-	__putname(link);
-	link = ERR_PTR(retval);
-ndset:
-	nd_set_link(nd, link);
-	return NULL;
+	if (retval)
+		return ERR_PTR(retval);
+	return *cookie = target;
 }
 
 int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -1005,7 +988,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
 const struct inode_operations v9fs_symlink_inode_operations_dotl = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link_dotl,
-	.put_link = v9fs_vfs_put_link,
+	.put_link = kfree_put_link,
 	.getattr = v9fs_vfs_getattr_dotl,
 	.setattr = v9fs_vfs_setattr_dotl,
 	.setxattr = generic_setxattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index e99a338a4..bf495cede 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -130,11 +130,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	fid = v9fs_session_init(v9ses, dev_name, data);
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
-		/*
-		 * we need to call session_close to tear down some
-		 * of the data structure setup by session_init
-		 */
-		goto close_session;
+		goto free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
@@ -195,8 +191,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 
 clunk_fid:
 	p9_client_clunk(fid);
-close_session:
 	v9fs_session_close(v9ses);
+free_session:
 	kfree(v9ses);
 	return ERR_PTR(retval);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index b1083f6b6..011f43365 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -218,7 +218,6 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/aufs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 8c2df123d..cb20e4bf2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -115,7 +115,6 @@ obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_NILFS2_FS)		+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
-obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_CACHEFILES)	+= cachefiles/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_TRACING)		+= tracefs/
@@ -127,4 +126,3 @@ obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
-obj-$(CONFIG_AUFS_FS)           += aufs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a19c31d3f..4d4a0df83 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep;
 static struct inode *adfs_alloc_inode(struct super_block *sb)
 {
 	struct adfs_inode_info *ei;
-	ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+	ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index cffe8370f..c69a87eaf 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -64,7 +64,7 @@ struct affs_inode_info {
 /* short cut to get to the affs specific inode data */
 static inline struct affs_inode_info *AFFS_I(struct inode *inode)
 {
-	return list_entry(inode, struct affs_inode_info, vfs_inode);
+	return container_of(inode, struct affs_inode_info, vfs_inode);
 }
 
 /*
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index a8f463c02..5fa92bc79 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry)
 {
 	struct inode *dir, *inode = d_inode(dentry);
 	struct super_block *sb = inode->i_sb;
-	struct buffer_head *bh = NULL, *link_bh = NULL;
+	struct buffer_head *bh, *link_bh = NULL;
 	u32 link_ino, ino;
 	int retval;
 
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a022f4acc..173495005 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 {
 	struct super_block *sb = dir->i_sb;
 	struct buffer_head *inode_bh = NULL;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	u32 block = 0;
 	int retval;
 
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index f39b71c39..ea5b69a18 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
 	struct inode *inode = page->mapping->host;
 	char *link = kmap(page);
 	struct slink_front *lf;
-	int err;
 	int			 i, j;
 	char			 c;
 	char			 lc;
 
 	pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
 
-	err = -EIO;
 	bh = affs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto fail;
@@ -66,7 +64,7 @@ fail:
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
-	return err;
+	return -EIO;
 }
 
 const struct address_space_operations affs_symlink_aops = {
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 3a57a1b0f..b50642870 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -85,7 +85,7 @@ int afs_open_socket(void)
 		return -ENOMEM;
 	}
 
-	ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+	ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
 	if (ret < 0) {
 		destroy_workqueue(afs_async_calls);
 		_leave(" = %d [socket]", ret);
diff --git a/fs/aufs/Kconfig b/fs/aufs/Kconfig
deleted file mode 100644
index 63560ceda..000000000
--- a/fs/aufs/Kconfig
+++ /dev/null
@@ -1,185 +0,0 @@
-config AUFS_FS
-	tristate "Aufs (Advanced multi layered unification filesystem) support"
-	help
-	Aufs is a stackable unification filesystem such as Unionfs,
-	which unifies several directories and provides a merged single
-	directory.
-	In the early days, aufs was entirely re-designed and
-	re-implemented Unionfs Version 1.x series. Introducing many
-	original ideas, approaches and improvements, it becomes totally
-	different from Unionfs while keeping the basic features.
-
-if AUFS_FS
-choice
-	prompt "Maximum number of branches"
-	default AUFS_BRANCH_MAX_127
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_127
-	bool "127"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_511
-	bool "511"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_1023
-	bool "1023"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_32767
-	bool "32767"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-endchoice
-
-config AUFS_SBILIST
-	bool
-	depends on AUFS_MAGIC_SYSRQ || PROC_FS
-	default y
-	help
-	Automatic configuration for internal use.
-	When aufs supports Magic SysRq or /proc, enabled automatically.
-
-config AUFS_HNOTIFY
-	bool "Detect direct branch access (bypassing aufs)"
-	help
-	If you want to modify files on branches directly, eg. bypassing aufs,
-	and want aufs to detect the changes of them fully, then enable this
-	option and use 'udba=notify' mount option.
-	Currently there is only one available configuration, "fsnotify".
-	It will have a negative impact to the performance.
-	See detail in aufs.5.
-
-choice
-	prompt "method" if AUFS_HNOTIFY
-	default AUFS_HFSNOTIFY
-config AUFS_HFSNOTIFY
-	bool "fsnotify"
-	select FSNOTIFY
-endchoice
-
-config AUFS_EXPORT
-	bool "NFS-exportable aufs"
-	depends on EXPORTFS
-	help
-	If you want to export your mounted aufs via NFS, then enable this
-	option. There are several requirements for this configuration.
-	See detail in aufs.5.
-
-config AUFS_INO_T_64
-	bool
-	depends on AUFS_EXPORT
-	depends on 64BIT && !(ALPHA || S390)
-	default y
-	help
-	Automatic configuration for internal use.
-	/* typedef unsigned long/int __kernel_ino_t */
-	/* alpha and s390x are int */
-
-config AUFS_XATTR
-	bool "support for XATTR/EA (including Security Labels)"
-	help
-	If your branch fs supports XATTR/EA and you want to make them
-	available in aufs too, then enable this opsion and specify the
-	branch attributes for EA.
-	See detail in aufs.5.
-
-config AUFS_FHSM
-	bool "File-based Hierarchical Storage Management"
-	help
-	Hierarchical Storage Management (or HSM) is a well-known feature
-	in the storage world. Aufs provides this feature as file-based.
-	with multiple branches.
-	These multiple branches are prioritized, ie. the topmost one
-	should be the fastest drive and be used heavily.
-
-config AUFS_RDU
-	bool "Readdir in userspace"
-	help
-	Aufs has two methods to provide a merged view for a directory,
-	by a user-space library and by kernel-space natively. The latter
-	is always enabled but sometimes large and slow.
-	If you enable this option, install the library in aufs2-util
-	package, and set some environment variables for your readdir(3),
-	then the work will be handled in user-space which generally
-	shows better performance in most cases.
-	See detail in aufs.5.
-
-config AUFS_SHWH
-	bool "Show whiteouts"
-	help
-	If you want to make the whiteouts in aufs visible, then enable
-	this option and specify 'shwh' mount option. Although it may
-	sounds like philosophy or something, but in technically it
-	simply shows the name of whiteout with keeping its behaviour.
-
-config AUFS_BR_RAMFS
-	bool "Ramfs (initramfs/rootfs) as an aufs branch"
-	help
-	If you want to use ramfs as an aufs branch fs, then enable this
-	option. Generally tmpfs is recommended.
-	Aufs prohibited them to be a branch fs by default, because
-	initramfs becomes unusable after switch_root or something
-	generally. If you sets initramfs as an aufs branch and boot your
-	system by switch_root, you will meet a problem easily since the
-	files in initramfs may be inaccessible.
-	Unless you are going to use ramfs as an aufs branch fs without
-	switch_root or something, leave it N.
-
-config AUFS_BR_FUSE
-	bool "Fuse fs as an aufs branch"
-	depends on FUSE_FS
-	select AUFS_POLL
-	help
-	If you want to use fuse-based userspace filesystem as an aufs
-	branch fs, then enable this option.
-	It implements the internal poll(2) operation which is
-	implemented by fuse only (curretnly).
-
-config AUFS_POLL
-	bool
-	help
-	Automatic configuration for internal use.
-
-config AUFS_BR_HFSPLUS
-	bool "Hfsplus as an aufs branch"
-	depends on HFSPLUS_FS
-	default y
-	help
-	If you want to use hfsplus fs as an aufs branch fs, then enable
-	this option. This option introduces a small overhead at
-	copying-up a file on hfsplus.
-
-config AUFS_BDEV_LOOP
-	bool
-	depends on BLK_DEV_LOOP
-	default y
-	help
-	Automatic configuration for internal use.
-	Convert =[ym] into =y.
-
-config AUFS_DEBUG
-	bool "Debug aufs"
-	help
-	Enable this to compile aufs internal debug code.
-	It will have a negative impact to the performance.
-
-config AUFS_MAGIC_SYSRQ
-	bool
-	depends on AUFS_DEBUG && MAGIC_SYSRQ
-	default y
-	help
-	Automatic configuration for internal use.
-	When aufs supports Magic SysRq, enabled automatically.
-endif
diff --git a/fs/aufs/Makefile b/fs/aufs/Makefile
deleted file mode 100644
index c7a501e37..000000000
--- a/fs/aufs/Makefile
+++ /dev/null
@@ -1,44 +0,0 @@
-
-include ${src}/magic.mk
-ifeq (${CONFIG_AUFS_FS},m)
-include ${src}/conf.mk
-endif
--include ${src}/priv_def.mk
-
-# cf. include/linux/kernel.h
-# enable pr_debug
-ccflags-y += -DDEBUG
-# sparse requires the full pathname
-ifdef M
-ccflags-y += -include ${M}/../../include/uapi/linux/aufs_type.h
-else
-ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h
-endif
-
-obj-$(CONFIG_AUFS_FS) += aufs.o
-aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \
-	wkq.o vfsub.o dcsub.o \
-	cpup.o whout.o wbr_policy.o \
-	dinfo.o dentry.o \
-	dynop.o \
-	finfo.o file.o f_op.o \
-	dir.o vdir.o \
-	iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \
-	mvdown.o ioctl.o
-
-# all are boolean
-aufs-$(CONFIG_PROC_FS) += procfs.o plink.o
-aufs-$(CONFIG_SYSFS) += sysfs.o
-aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o
-aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o
-aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o
-aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o
-aufs-$(CONFIG_AUFS_EXPORT) += export.o
-aufs-$(CONFIG_AUFS_XATTR) += xattr.o
-aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
-aufs-$(CONFIG_AUFS_FHSM) += fhsm.o
-aufs-$(CONFIG_AUFS_POLL) += poll.o
-aufs-$(CONFIG_AUFS_RDU) += rdu.o
-aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o
-aufs-$(CONFIG_AUFS_DEBUG) += debug.o
-aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o
diff --git a/fs/aufs/aufs.h b/fs/aufs/aufs.h
deleted file mode 100644
index d82e9743f..000000000
--- a/fs/aufs/aufs.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * all header files
- */
-
-#ifndef __AUFS_H__
-#define __AUFS_H__
-
-#ifdef __KERNEL__
-
-#define AuStub(type, name, body, ...) \
-	static inline type name(__VA_ARGS__) { body; }
-
-#define AuStubVoid(name, ...) \
-	AuStub(void, name, , __VA_ARGS__)
-#define AuStubInt0(name, ...) \
-	AuStub(int, name, return 0, __VA_ARGS__)
-
-#include "debug.h"
-
-#include "branch.h"
-#include "cpup.h"
-#include "dcsub.h"
-#include "dbgaufs.h"
-#include "dentry.h"
-#include "dir.h"
-#include "dynop.h"
-#include "file.h"
-#include "fstype.h"
-#include "inode.h"
-#include "loop.h"
-#include "module.h"
-#include "opts.h"
-#include "rwsem.h"
-#include "spl.h"
-#include "super.h"
-#include "sysaufs.h"
-#include "vfsub.h"
-#include "whout.h"
-#include "wkq.h"
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_H__ */
diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c
deleted file mode 100644
index 2ec496438..000000000
--- a/fs/aufs/branch.c
+++ /dev/null
@@ -1,1414 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * branch management
- */
-
-#include <linux/compat.h>
-#include <linux/statfs.h>
-#include "aufs.h"
-
-/*
- * free a single branch
- */
-static void au_br_do_free(struct au_branch *br)
-{
-	int i;
-	struct au_wbr *wbr;
-	struct au_dykey **key;
-
-	au_hnotify_fin_br(br);
-
-	if (br->br_xino.xi_file)
-		fput(br->br_xino.xi_file);
-	mutex_destroy(&br->br_xino.xi_nondir_mtx);
-
-	AuDebugOn(atomic_read(&br->br_count));
-
-	wbr = br->br_wbr;
-	if (wbr) {
-		for (i = 0; i < AuBrWh_Last; i++)
-			dput(wbr->wbr_wh[i]);
-		AuDebugOn(atomic_read(&wbr->wbr_wh_running));
-		AuRwDestroy(&wbr->wbr_wh_rwsem);
-	}
-
-	if (br->br_fhsm) {
-		au_br_fhsm_fin(br->br_fhsm);
-		kfree(br->br_fhsm);
-	}
-
-	key = br->br_dykey;
-	for (i = 0; i < AuBrDynOp; i++, key++)
-		if (*key)
-			au_dy_put(*key);
-		else
-			break;
-
-	/* recursive lock, s_umount of branch's */
-	lockdep_off();
-	path_put(&br->br_path);
-	lockdep_on();
-	kfree(wbr);
-	kfree(br);
-}
-
-/*
- * frees all branches
- */
-void au_br_free(struct au_sbinfo *sbinfo)
-{
-	aufs_bindex_t bmax;
-	struct au_branch **br;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	bmax = sbinfo->si_bend + 1;
-	br = sbinfo->si_branch;
-	while (bmax--)
-		au_br_do_free(*br++);
-}
-
-/*
- * find the index of a branch which is specified by @br_id.
- */
-int au_br_index(struct super_block *sb, aufs_bindex_t br_id)
-{
-	aufs_bindex_t bindex, bend;
-
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++)
-		if (au_sbr_id(sb, bindex) == br_id)
-			return bindex;
-	return -1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * add a branch
- */
-
-static int test_overlap(struct super_block *sb, struct dentry *h_adding,
-			struct dentry *h_root)
-{
-	if (unlikely(h_adding == h_root
-		     || au_test_loopback_overlap(sb, h_adding)))
-		return 1;
-	if (h_adding->d_sb != h_root->d_sb)
-		return 0;
-	return au_test_subdir(h_adding, h_root)
-		|| au_test_subdir(h_root, h_adding);
-}
-
-/*
- * returns a newly allocated branch. @new_nbranch is a number of branches
- * after adding a branch.
- */
-static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch,
-				     int perm)
-{
-	struct au_branch *add_branch;
-	struct dentry *root;
-	struct inode *inode;
-	int err;
-
-	err = -ENOMEM;
-	root = sb->s_root;
-	add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS);
-	if (unlikely(!add_branch))
-		goto out;
-
-	err = au_hnotify_init_br(add_branch, perm);
-	if (unlikely(err))
-		goto out_br;
-
-	add_branch->br_wbr = NULL;
-	if (au_br_writable(perm)) {
-		/* may be freed separately at changing the branch permission */
-		add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr),
-					     GFP_NOFS);
-		if (unlikely(!add_branch->br_wbr))
-			goto out_hnotify;
-	}
-
-	add_branch->br_fhsm = NULL;
-	if (au_br_fhsm(perm)) {
-		err = au_fhsm_br_alloc(add_branch);
-		if (unlikely(err))
-			goto out_wbr;
-	}
-
-	err = au_sbr_realloc(au_sbi(sb), new_nbranch);
-	if (!err)
-		err = au_di_realloc(au_di(root), new_nbranch);
-	if (!err) {
-		inode = d_inode(root);
-		err = au_ii_realloc(au_ii(inode), new_nbranch);
-	}
-	if (!err)
-		return add_branch; /* success */
-
-out_wbr:
-	kfree(add_branch->br_wbr);
-out_hnotify:
-	au_hnotify_fin_br(add_branch);
-out_br:
-	kfree(add_branch);
-out:
-	return ERR_PTR(err);
-}
-
-/*
- * test if the branch permission is legal or not.
- */
-static int test_br(struct inode *inode, int brperm, char *path)
-{
-	int err;
-
-	err = (au_br_writable(brperm) && IS_RDONLY(inode));
-	if (!err)
-		goto out;
-
-	err = -EINVAL;
-	pr_err("write permission for readonly mount or inode, %s\n", path);
-
-out:
-	return err;
-}
-
-/*
- * returns:
- * 0: success, the caller will add it
- * plus: success, it is already unified, the caller should ignore it
- * minus: error
- */
-static int test_add(struct super_block *sb, struct au_opt_add *add, int remount)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct dentry *root, *h_dentry;
-	struct inode *inode, *h_inode;
-
-	root = sb->s_root;
-	bend = au_sbend(sb);
-	if (unlikely(bend >= 0
-		     && au_find_dbindex(root, add->path.dentry) >= 0)) {
-		err = 1;
-		if (!remount) {
-			err = -EINVAL;
-			pr_err("%s duplicated\n", add->pathname);
-		}
-		goto out;
-	}
-
-	err = -ENOSPC; /* -E2BIG; */
-	if (unlikely(AUFS_BRANCH_MAX <= add->bindex
-		     || AUFS_BRANCH_MAX - 1 <= bend)) {
-		pr_err("number of branches exceeded %s\n", add->pathname);
-		goto out;
-	}
-
-	err = -EDOM;
-	if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) {
-		pr_err("bad index %d\n", add->bindex);
-		goto out;
-	}
-
-	inode = d_inode(add->path.dentry);
-	err = -ENOENT;
-	if (unlikely(!inode->i_nlink)) {
-		pr_err("no existence %s\n", add->pathname);
-		goto out;
-	}
-
-	err = -EINVAL;
-	if (unlikely(inode->i_sb == sb)) {
-		pr_err("%s must be outside\n", add->pathname);
-		goto out;
-	}
-
-	if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) {
-		pr_err("unsupported filesystem, %s (%s)\n",
-		       add->pathname, au_sbtype(inode->i_sb));
-		goto out;
-	}
-
-	if (unlikely(inode->i_sb->s_stack_depth)) {
-		pr_err("already stacked, %s (%s)\n",
-		       add->pathname, au_sbtype(inode->i_sb));
-		goto out;
-	}
-
-	err = test_br(d_inode(add->path.dentry), add->perm, add->pathname);
-	if (unlikely(err))
-		goto out;
-
-	if (bend < 0)
-		return 0; /* success */
-
-	err = -EINVAL;
-	for (bindex = 0; bindex <= bend; bindex++)
-		if (unlikely(test_overlap(sb, add->path.dentry,
-					  au_h_dptr(root, bindex)))) {
-			pr_err("%s is overlapped\n", add->pathname);
-			goto out;
-		}
-
-	err = 0;
-	if (au_opt_test(au_mntflags(sb), WARN_PERM)) {
-		h_dentry = au_h_dptr(root, 0);
-		h_inode = d_inode(h_dentry);
-		if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO)
-		    || !uid_eq(h_inode->i_uid, inode->i_uid)
-		    || !gid_eq(h_inode->i_gid, inode->i_gid))
-			pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n",
-				add->pathname,
-				i_uid_read(inode), i_gid_read(inode),
-				(inode->i_mode & S_IALLUGO),
-				i_uid_read(h_inode), i_gid_read(h_inode),
-				(h_inode->i_mode & S_IALLUGO));
-	}
-
-out:
-	return err;
-}
-
-/*
- * initialize or clean the whiteouts for an adding branch
- */
-static int au_br_init_wh(struct super_block *sb, struct au_branch *br,
-			 int new_perm)
-{
-	int err, old_perm;
-	aufs_bindex_t bindex;
-	struct mutex *h_mtx;
-	struct au_wbr *wbr;
-	struct au_hinode *hdir;
-	struct dentry *h_dentry;
-
-	err = vfsub_mnt_want_write(au_br_mnt(br));
-	if (unlikely(err))
-		goto out;
-
-	wbr = br->br_wbr;
-	old_perm = br->br_perm;
-	br->br_perm = new_perm;
-	hdir = NULL;
-	h_mtx = NULL;
-	bindex = au_br_index(sb, br->br_id);
-	if (0 <= bindex) {
-		hdir = au_hi(d_inode(sb->s_root), bindex);
-		au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	} else {
-		h_dentry = au_br_dentry(br);
-		h_mtx = &d_inode(h_dentry)->i_mutex;
-		mutex_lock_nested(h_mtx, AuLsc_I_PARENT);
-	}
-	if (!wbr)
-		err = au_wh_init(br, sb);
-	else {
-		wbr_wh_write_lock(wbr);
-		err = au_wh_init(br, sb);
-		wbr_wh_write_unlock(wbr);
-	}
-	if (hdir)
-		au_hn_imtx_unlock(hdir);
-	else
-		mutex_unlock(h_mtx);
-	vfsub_mnt_drop_write(au_br_mnt(br));
-	br->br_perm = old_perm;
-
-	if (!err && wbr && !au_br_writable(new_perm)) {
-		kfree(wbr);
-		br->br_wbr = NULL;
-	}
-
-out:
-	return err;
-}
-
-static int au_wbr_init(struct au_branch *br, struct super_block *sb,
-		       int perm)
-{
-	int err;
-	struct kstatfs kst;
-	struct au_wbr *wbr;
-
-	wbr = br->br_wbr;
-	au_rw_init(&wbr->wbr_wh_rwsem);
-	memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh));
-	atomic_set(&wbr->wbr_wh_running, 0);
-	wbr->wbr_bytes = 0;
-
-	/*
-	 * a limit for rmdir/rename a dir
-	 * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h
-	 */
-	err = vfs_statfs(&br->br_path, &kst);
-	if (unlikely(err))
-		goto out;
-	err = -EINVAL;
-	if (kst.f_namelen >= NAME_MAX)
-		err = au_br_init_wh(sb, br, perm);
-	else
-		pr_err("%pd(%s), unsupported namelen %ld\n",
-		       au_br_dentry(br),
-		       au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen);
-
-out:
-	return err;
-}
-
-/* initialize a new branch */
-static int au_br_init(struct au_branch *br, struct super_block *sb,
-		      struct au_opt_add *add)
-{
-	int err;
-	struct inode *h_inode;
-
-	err = 0;
-	memset(&br->br_xino, 0, sizeof(br->br_xino));
-	mutex_init(&br->br_xino.xi_nondir_mtx);
-	br->br_perm = add->perm;
-	br->br_path = add->path; /* set first, path_get() later */
-	spin_lock_init(&br->br_dykey_lock);
-	memset(br->br_dykey, 0, sizeof(br->br_dykey));
-	atomic_set(&br->br_count, 0);
-	atomic_set(&br->br_xino_running, 0);
-	br->br_id = au_new_br_id(sb);
-	AuDebugOn(br->br_id < 0);
-
-	if (au_br_writable(add->perm)) {
-		err = au_wbr_init(br, sb, add->perm);
-		if (unlikely(err))
-			goto out_err;
-	}
-
-	if (au_opt_test(au_mntflags(sb), XINO)) {
-		h_inode = d_inode(add->path.dentry);
-		err = au_xino_br(sb, br, h_inode->i_ino,
-				 au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1);
-		if (unlikely(err)) {
-			AuDebugOn(br->br_xino.xi_file);
-			goto out_err;
-		}
-	}
-
-	sysaufs_br_init(br);
-	path_get(&br->br_path);
-	goto out; /* success */
-
-out_err:
-	memset(&br->br_path, 0, sizeof(br->br_path));
-out:
-	return err;
-}
-
-static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex,
-			     struct au_branch *br, aufs_bindex_t bend,
-			     aufs_bindex_t amount)
-{
-	struct au_branch **brp;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	brp = sbinfo->si_branch + bindex;
-	memmove(brp + 1, brp, sizeof(*brp) * amount);
-	*brp = br;
-	sbinfo->si_bend++;
-	if (unlikely(bend < 0))
-		sbinfo->si_bend = 0;
-}
-
-static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex,
-			     aufs_bindex_t bend, aufs_bindex_t amount)
-{
-	struct au_hdentry *hdp;
-
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	hdp = dinfo->di_hdentry + bindex;
-	memmove(hdp + 1, hdp, sizeof(*hdp) * amount);
-	au_h_dentry_init(hdp);
-	dinfo->di_bend++;
-	if (unlikely(bend < 0))
-		dinfo->di_bstart = 0;
-}
-
-static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex,
-			     aufs_bindex_t bend, aufs_bindex_t amount)
-{
-	struct au_hinode *hip;
-
-	AuRwMustWriteLock(&iinfo->ii_rwsem);
-
-	hip = iinfo->ii_hinode + bindex;
-	memmove(hip + 1, hip, sizeof(*hip) * amount);
-	hip->hi_inode = NULL;
-	au_hn_init(hip);
-	iinfo->ii_bend++;
-	if (unlikely(bend < 0))
-		iinfo->ii_bstart = 0;
-}
-
-static void au_br_do_add(struct super_block *sb, struct au_branch *br,
-			 aufs_bindex_t bindex)
-{
-	struct dentry *root, *h_dentry;
-	struct inode *root_inode, *h_inode;
-	aufs_bindex_t bend, amount;
-
-	root = sb->s_root;
-	root_inode = d_inode(root);
-	bend = au_sbend(sb);
-	amount = bend + 1 - bindex;
-	h_dentry = au_br_dentry(br);
-	au_sbilist_lock();
-	au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount);
-	au_br_do_add_hdp(au_di(root), bindex, bend, amount);
-	au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount);
-	au_set_h_dptr(root, bindex, dget(h_dentry));
-	h_inode = d_inode(h_dentry);
-	au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0);
-	au_sbilist_unlock();
-}
-
-int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount)
-{
-	int err;
-	aufs_bindex_t bend, add_bindex;
-	struct dentry *root, *h_dentry;
-	struct inode *root_inode;
-	struct au_branch *add_branch;
-
-	root = sb->s_root;
-	root_inode = d_inode(root);
-	IMustLock(root_inode);
-	err = test_add(sb, add, remount);
-	if (unlikely(err < 0))
-		goto out;
-	if (err) {
-		err = 0;
-		goto out; /* success */
-	}
-
-	bend = au_sbend(sb);
-	add_branch = au_br_alloc(sb, bend + 2, add->perm);
-	err = PTR_ERR(add_branch);
-	if (IS_ERR(add_branch))
-		goto out;
-
-	err = au_br_init(add_branch, sb, add);
-	if (unlikely(err)) {
-		au_br_do_free(add_branch);
-		goto out;
-	}
-
-	add_bindex = add->bindex;
-	if (!remount)
-		au_br_do_add(sb, add_branch, add_bindex);
-	else {
-		sysaufs_brs_del(sb, add_bindex);
-		au_br_do_add(sb, add_branch, add_bindex);
-		sysaufs_brs_add(sb, add_bindex);
-	}
-
-	h_dentry = add->path.dentry;
-	if (!add_bindex) {
-		au_cpup_attr_all(root_inode, /*force*/1);
-		sb->s_maxbytes = h_dentry->d_sb->s_maxbytes;
-	} else
-		au_add_nlink(root_inode, d_inode(h_dentry));
-
-	/*
-	 * this test/set prevents aufs from handling unnecesary notify events
-	 * of xino files, in case of re-adding a writable branch which was
-	 * once detached from aufs.
-	 */
-	if (au_xino_brid(sb) < 0
-	    && au_br_writable(add_branch->br_perm)
-	    && !au_test_fs_bad_xino(h_dentry->d_sb)
-	    && add_branch->br_xino.xi_file
-	    && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry)
-		au_xino_brid_set(sb, add_branch->br_id);
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static unsigned long long au_farray_cb(void *a,
-				       unsigned long long max __maybe_unused,
-				       void *arg)
-{
-	unsigned long long n;
-	struct file **p, *f;
-	struct au_sphlhead *files;
-	struct au_finfo *finfo;
-	struct super_block *sb = arg;
-
-	n = 0;
-	p = a;
-	files = &au_sbi(sb)->si_files;
-	spin_lock(&files->spin);
-	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
-		f = finfo->fi_file;
-		if (file_count(f)
-		    && !special_file(file_inode(f)->i_mode)) {
-			get_file(f);
-			*p++ = f;
-			n++;
-			AuDebugOn(n > max);
-		}
-	}
-	spin_unlock(&files->spin);
-
-	return n;
-}
-
-static struct file **au_farray_alloc(struct super_block *sb,
-				     unsigned long long *max)
-{
-	*max = atomic_long_read(&au_sbi(sb)->si_nfiles);
-	return au_array_alloc(max, au_farray_cb, sb);
-}
-
-static void au_farray_free(struct file **a, unsigned long long max)
-{
-	unsigned long long ull;
-
-	for (ull = 0; ull < max; ull++)
-		if (a[ull])
-			fput(a[ull]);
-	au_array_free(a);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * delete a branch
- */
-
-/* to show the line number, do not make it inlined function */
-#define AuVerbose(do_info, fmt, ...) do { \
-	if (do_info) \
-		pr_info(fmt, ##__VA_ARGS__); \
-} while (0)
-
-static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart,
-			 aufs_bindex_t bend)
-{
-	return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend;
-}
-
-static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart,
-			 aufs_bindex_t bend)
-{
-	return au_test_ibusy(d_inode(dentry), bstart, bend);
-}
-
-/*
- * test if the branch is deletable or not.
- */
-static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex,
-			    unsigned int sigen, const unsigned int verbose)
-{
-	int err, i, j, ndentry;
-	aufs_bindex_t bstart, bend;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry *d;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, root, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	for (i = 0; !err && i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		ndentry = dpage->ndentry;
-		for (j = 0; !err && j < ndentry; j++) {
-			d = dpage->dentries[j];
-			AuDebugOn(au_dcount(d) <= 0);
-			if (!au_digen_test(d, sigen)) {
-				di_read_lock_child(d, AuLock_IR);
-				if (unlikely(au_dbrange_test(d))) {
-					di_read_unlock(d, AuLock_IR);
-					continue;
-				}
-			} else {
-				di_write_lock_child(d);
-				if (unlikely(au_dbrange_test(d))) {
-					di_write_unlock(d);
-					continue;
-				}
-				err = au_reval_dpath(d, sigen);
-				if (!err)
-					di_downgrade_lock(d, AuLock_IR);
-				else {
-					di_write_unlock(d);
-					break;
-				}
-			}
-
-			/* AuDbgDentry(d); */
-			bstart = au_dbstart(d);
-			bend = au_dbend(d);
-			if (bstart <= bindex
-			    && bindex <= bend
-			    && au_h_dptr(d, bindex)
-			    && au_test_dbusy(d, bstart, bend)) {
-				err = -EBUSY;
-				AuVerbose(verbose, "busy %pd\n", d);
-				AuDbgDentry(d);
-			}
-			di_read_unlock(d, AuLock_IR);
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex,
-			   unsigned int sigen, const unsigned int verbose)
-{
-	int err;
-	unsigned long long max, ull;
-	struct inode *i, **array;
-	aufs_bindex_t bstart, bend;
-
-	array = au_iarray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	err = 0;
-	AuDbg("b%d\n", bindex);
-	for (ull = 0; !err && ull < max; ull++) {
-		i = array[ull];
-		if (unlikely(!i))
-			break;
-		if (i->i_ino == AUFS_ROOT_INO)
-			continue;
-
-		/* AuDbgInode(i); */
-		if (au_iigen(i, NULL) == sigen)
-			ii_read_lock_child(i);
-		else {
-			ii_write_lock_child(i);
-			err = au_refresh_hinode_self(i);
-			au_iigen_dec(i);
-			if (!err)
-				ii_downgrade_lock(i);
-			else {
-				ii_write_unlock(i);
-				break;
-			}
-		}
-
-		bstart = au_ibstart(i);
-		bend = au_ibend(i);
-		if (bstart <= bindex
-		    && bindex <= bend
-		    && au_h_iptr(i, bindex)
-		    && au_test_ibusy(i, bstart, bend)) {
-			err = -EBUSY;
-			AuVerbose(verbose, "busy i%lu\n", i->i_ino);
-			AuDbgInode(i);
-		}
-		ii_read_unlock(i);
-	}
-	au_iarray_free(array, max);
-
-out:
-	return err;
-}
-
-static int test_children_busy(struct dentry *root, aufs_bindex_t bindex,
-			      const unsigned int verbose)
-{
-	int err;
-	unsigned int sigen;
-
-	sigen = au_sigen(root->d_sb);
-	DiMustNoWaiters(root);
-	IiMustNoWaiters(d_inode(root));
-	di_write_unlock(root);
-	err = test_dentry_busy(root, bindex, sigen, verbose);
-	if (!err)
-		err = test_inode_busy(root->d_sb, bindex, sigen, verbose);
-	di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */
-
-	return err;
-}
-
-static int test_dir_busy(struct file *file, aufs_bindex_t br_id,
-			 struct file **to_free, int *idx)
-{
-	int err;
-	unsigned char matched, root;
-	aufs_bindex_t bindex, bend;
-	struct au_fidir *fidir;
-	struct au_hfile *hfile;
-
-	err = 0;
-	root = IS_ROOT(file->f_path.dentry);
-	if (root) {
-		get_file(file);
-		to_free[*idx] = file;
-		(*idx)++;
-		goto out;
-	}
-
-	matched = 0;
-	fidir = au_fi(file)->fi_hdir;
-	AuDebugOn(!fidir);
-	bend = au_fbend_dir(file);
-	for (bindex = au_fbstart(file); bindex <= bend; bindex++) {
-		hfile = fidir->fd_hfile + bindex;
-		if (!hfile->hf_file)
-			continue;
-
-		if (hfile->hf_br->br_id == br_id) {
-			matched = 1;
-			break;
-		}
-	}
-	if (matched)
-		err = -EBUSY;
-
-out:
-	return err;
-}
-
-static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id,
-			  struct file **to_free, int opened)
-{
-	int err, idx;
-	unsigned long long ull, max;
-	aufs_bindex_t bstart;
-	struct file *file, **array;
-	struct dentry *root;
-	struct au_hfile *hfile;
-
-	array = au_farray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	err = 0;
-	idx = 0;
-	root = sb->s_root;
-	di_write_unlock(root);
-	for (ull = 0; ull < max; ull++) {
-		file = array[ull];
-		if (unlikely(!file))
-			break;
-
-		/* AuDbg("%pD\n", file); */
-		fi_read_lock(file);
-		bstart = au_fbstart(file);
-		if (!d_is_dir(file->f_path.dentry)) {
-			hfile = &au_fi(file)->fi_htop;
-			if (hfile->hf_br->br_id == br_id)
-				err = -EBUSY;
-		} else
-			err = test_dir_busy(file, br_id, to_free, &idx);
-		fi_read_unlock(file);
-		if (unlikely(err))
-			break;
-	}
-	di_write_lock_child(root);
-	au_farray_free(array, max);
-	AuDebugOn(idx > opened);
-
-out:
-	return err;
-}
-
-static void br_del_file(struct file **to_free, unsigned long long opened,
-			  aufs_bindex_t br_id)
-{
-	unsigned long long ull;
-	aufs_bindex_t bindex, bstart, bend, bfound;
-	struct file *file;
-	struct au_fidir *fidir;
-	struct au_hfile *hfile;
-
-	for (ull = 0; ull < opened; ull++) {
-		file = to_free[ull];
-		if (unlikely(!file))
-			break;
-
-		/* AuDbg("%pD\n", file); */
-		AuDebugOn(!d_is_dir(file->f_path.dentry));
-		bfound = -1;
-		fidir = au_fi(file)->fi_hdir;
-		AuDebugOn(!fidir);
-		fi_write_lock(file);
-		bstart = au_fbstart(file);
-		bend = au_fbend_dir(file);
-		for (bindex = bstart; bindex <= bend; bindex++) {
-			hfile = fidir->fd_hfile + bindex;
-			if (!hfile->hf_file)
-				continue;
-
-			if (hfile->hf_br->br_id == br_id) {
-				bfound = bindex;
-				break;
-			}
-		}
-		AuDebugOn(bfound < 0);
-		au_set_h_fptr(file, bfound, NULL);
-		if (bfound == bstart) {
-			for (bstart++; bstart <= bend; bstart++)
-				if (au_hf_dir(file, bstart)) {
-					au_set_fbstart(file, bstart);
-					break;
-				}
-		}
-		fi_write_unlock(file);
-	}
-}
-
-static void au_br_do_del_brp(struct au_sbinfo *sbinfo,
-			     const aufs_bindex_t bindex,
-			     const aufs_bindex_t bend)
-{
-	struct au_branch **brp, **p;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	brp = sbinfo->si_branch + bindex;
-	if (bindex < bend)
-		memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex));
-	sbinfo->si_branch[0 + bend] = NULL;
-	sbinfo->si_bend--;
-
-	p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST);
-	if (p)
-		sbinfo->si_branch = p;
-	/* harmless error */
-}
-
-static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex,
-			     const aufs_bindex_t bend)
-{
-	struct au_hdentry *hdp, *p;
-
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	hdp = dinfo->di_hdentry;
-	if (bindex < bend)
-		memmove(hdp + bindex, hdp + bindex + 1,
-			sizeof(*hdp) * (bend - bindex));
-	hdp[0 + bend].hd_dentry = NULL;
-	dinfo->di_bend--;
-
-	p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST);
-	if (p)
-		dinfo->di_hdentry = p;
-	/* harmless error */
-}
-
-static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex,
-			     const aufs_bindex_t bend)
-{
-	struct au_hinode *hip, *p;
-
-	AuRwMustWriteLock(&iinfo->ii_rwsem);
-
-	hip = iinfo->ii_hinode + bindex;
-	if (bindex < bend)
-		memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex));
-	iinfo->ii_hinode[0 + bend].hi_inode = NULL;
-	au_hn_init(iinfo->ii_hinode + bend);
-	iinfo->ii_bend--;
-
-	p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST);
-	if (p)
-		iinfo->ii_hinode = p;
-	/* harmless error */
-}
-
-static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex,
-			 struct au_branch *br)
-{
-	aufs_bindex_t bend;
-	struct au_sbinfo *sbinfo;
-	struct dentry *root, *h_root;
-	struct inode *inode, *h_inode;
-	struct au_hinode *hinode;
-
-	SiMustWriteLock(sb);
-
-	root = sb->s_root;
-	inode = d_inode(root);
-	sbinfo = au_sbi(sb);
-	bend = sbinfo->si_bend;
-
-	h_root = au_h_dptr(root, bindex);
-	hinode = au_hi(inode, bindex);
-	h_inode = au_igrab(hinode->hi_inode);
-	au_hiput(hinode);
-
-	au_sbilist_lock();
-	au_br_do_del_brp(sbinfo, bindex, bend);
-	au_br_do_del_hdp(au_di(root), bindex, bend);
-	au_br_do_del_hip(au_ii(inode), bindex, bend);
-	au_sbilist_unlock();
-
-	dput(h_root);
-	iput(h_inode);
-	au_br_do_free(br);
-}
-
-static unsigned long long empty_cb(void *array, unsigned long long max,
-				   void *arg)
-{
-	return max;
-}
-
-int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount)
-{
-	int err, rerr, i;
-	unsigned long long opened;
-	unsigned int mnt_flags;
-	aufs_bindex_t bindex, bend, br_id;
-	unsigned char do_wh, verbose;
-	struct au_branch *br;
-	struct au_wbr *wbr;
-	struct dentry *root;
-	struct file **to_free;
-
-	err = 0;
-	opened = 0;
-	to_free = NULL;
-	root = sb->s_root;
-	bindex = au_find_dbindex(root, del->h_path.dentry);
-	if (bindex < 0) {
-		if (remount)
-			goto out; /* success */
-		err = -ENOENT;
-		pr_err("%s no such branch\n", del->pathname);
-		goto out;
-	}
-	AuDbg("bindex b%d\n", bindex);
-
-	err = -EBUSY;
-	mnt_flags = au_mntflags(sb);
-	verbose = !!au_opt_test(mnt_flags, VERBOSE);
-	bend = au_sbend(sb);
-	if (unlikely(!bend)) {
-		AuVerbose(verbose, "no more branches left\n");
-		goto out;
-	}
-	br = au_sbr(sb, bindex);
-	AuDebugOn(!path_equal(&br->br_path, &del->h_path));
-
-	br_id = br->br_id;
-	opened = atomic_read(&br->br_count);
-	if (unlikely(opened)) {
-		to_free = au_array_alloc(&opened, empty_cb, NULL);
-		err = PTR_ERR(to_free);
-		if (IS_ERR(to_free))
-			goto out;
-
-		err = test_file_busy(sb, br_id, to_free, opened);
-		if (unlikely(err)) {
-			AuVerbose(verbose, "%llu file(s) opened\n", opened);
-			goto out;
-		}
-	}
-
-	wbr = br->br_wbr;
-	do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph);
-	if (do_wh) {
-		/* instead of WbrWhMustWriteLock(wbr) */
-		SiMustWriteLock(sb);
-		for (i = 0; i < AuBrWh_Last; i++) {
-			dput(wbr->wbr_wh[i]);
-			wbr->wbr_wh[i] = NULL;
-		}
-	}
-
-	err = test_children_busy(root, bindex, verbose);
-	if (unlikely(err)) {
-		if (do_wh)
-			goto out_wh;
-		goto out;
-	}
-
-	err = 0;
-	if (to_free) {
-		/*
-		 * now we confirmed the branch is deletable.
-		 * let's free the remaining opened dirs on the branch.
-		 */
-		di_write_unlock(root);
-		br_del_file(to_free, opened, br_id);
-		di_write_lock_child(root);
-	}
-
-	if (!remount)
-		au_br_do_del(sb, bindex, br);
-	else {
-		sysaufs_brs_del(sb, bindex);
-		au_br_do_del(sb, bindex, br);
-		sysaufs_brs_add(sb, bindex);
-	}
-
-	if (!bindex) {
-		au_cpup_attr_all(d_inode(root), /*force*/1);
-		sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes;
-	} else
-		au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry));
-	if (au_opt_test(mnt_flags, PLINK))
-		au_plink_half_refresh(sb, br_id);
-
-	if (au_xino_brid(sb) == br_id)
-		au_xino_brid_set(sb, -1);
-	goto out; /* success */
-
-out_wh:
-	/* revert */
-	rerr = au_br_init_wh(sb, br, br->br_perm);
-	if (rerr)
-		pr_warn("failed re-creating base whiteout, %s. (%d)\n",
-			del->pathname, rerr);
-out:
-	if (to_free)
-		au_farray_free(to_free, opened);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg)
-{
-	int err;
-	aufs_bindex_t bstart, bend;
-	struct aufs_ibusy ibusy;
-	struct inode *inode, *h_inode;
-
-	err = -EPERM;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = copy_from_user(&ibusy, arg, sizeof(ibusy));
-	if (!err)
-		err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-
-	err = -EINVAL;
-	si_read_lock(sb, AuLock_FLUSH);
-	if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb)))
-		goto out_unlock;
-
-	err = 0;
-	ibusy.h_ino = 0; /* invalid */
-	inode = ilookup(sb, ibusy.ino);
-	if (!inode
-	    || inode->i_ino == AUFS_ROOT_INO
-	    || is_bad_inode(inode))
-		goto out_unlock;
-
-	ii_read_lock_child(inode);
-	bstart = au_ibstart(inode);
-	bend = au_ibend(inode);
-	if (bstart <= ibusy.bindex && ibusy.bindex <= bend) {
-		h_inode = au_h_iptr(inode, ibusy.bindex);
-		if (h_inode && au_test_ibusy(inode, bstart, bend))
-			ibusy.h_ino = h_inode->i_ino;
-	}
-	ii_read_unlock(inode);
-	iput(inode);
-
-out_unlock:
-	si_read_unlock(sb);
-	if (!err) {
-		err = __put_user(ibusy.h_ino, &arg->h_ino);
-		if (unlikely(err)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-		}
-	}
-out:
-	return err;
-}
-
-long au_ibusy_ioctl(struct file *file, unsigned long arg)
-{
-	return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg);
-}
-
-#ifdef CONFIG_COMPAT
-long au_ibusy_compat_ioctl(struct file *file, unsigned long arg)
-{
-	return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg));
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * change a branch permission
- */
-
-static void au_warn_ima(void)
-{
-#ifdef CONFIG_IMA
-	/* since it doesn't support mark_files_ro() */
-	AuWarn1("RW -> RO makes IMA to produce wrong message\n");
-#endif
-}
-
-static int do_need_sigen_inc(int a, int b)
-{
-	return au_br_whable(a) && !au_br_whable(b);
-}
-
-static int need_sigen_inc(int old, int new)
-{
-	return do_need_sigen_inc(old, new)
-		|| do_need_sigen_inc(new, old);
-}
-
-static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex)
-{
-	int err, do_warn;
-	unsigned int mnt_flags;
-	unsigned long long ull, max;
-	aufs_bindex_t br_id;
-	unsigned char verbose, writer;
-	struct file *file, *hf, **array;
-	struct au_hfile *hfile;
-
-	mnt_flags = au_mntflags(sb);
-	verbose = !!au_opt_test(mnt_flags, VERBOSE);
-
-	array = au_farray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	do_warn = 0;
-	br_id = au_sbr_id(sb, bindex);
-	for (ull = 0; ull < max; ull++) {
-		file = array[ull];
-		if (unlikely(!file))
-			break;
-
-		/* AuDbg("%pD\n", file); */
-		fi_read_lock(file);
-		if (unlikely(au_test_mmapped(file))) {
-			err = -EBUSY;
-			AuVerbose(verbose, "mmapped %pD\n", file);
-			AuDbgFile(file);
-			FiMustNoWaiters(file);
-			fi_read_unlock(file);
-			goto out_array;
-		}
-
-		hfile = &au_fi(file)->fi_htop;
-		hf = hfile->hf_file;
-		if (!d_is_reg(file->f_path.dentry)
-		    || !(file->f_mode & FMODE_WRITE)
-		    || hfile->hf_br->br_id != br_id
-		    || !(hf->f_mode & FMODE_WRITE))
-			array[ull] = NULL;
-		else {
-			do_warn = 1;
-			get_file(file);
-		}
-
-		FiMustNoWaiters(file);
-		fi_read_unlock(file);
-		fput(file);
-	}
-
-	err = 0;
-	if (do_warn)
-		au_warn_ima();
-
-	for (ull = 0; ull < max; ull++) {
-		file = array[ull];
-		if (!file)
-			continue;
-
-		/* todo: already flushed? */
-		/*
-		 * fs/super.c:mark_files_ro() is gone, but aufs keeps its
-		 * approach which resets f_mode and calls mnt_drop_write() and
-		 * file_release_write() for each file, because the branch
-		 * attribute in aufs world is totally different from the native
-		 * fs rw/ro mode.
-		*/
-		/* fi_read_lock(file); */
-		hfile = &au_fi(file)->fi_htop;
-		hf = hfile->hf_file;
-		/* fi_read_unlock(file); */
-		spin_lock(&hf->f_lock);
-		writer = !!(hf->f_mode & FMODE_WRITER);
-		hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER);
-		spin_unlock(&hf->f_lock);
-		if (writer) {
-			put_write_access(file_inode(hf));
-			__mnt_drop_write(hf->f_path.mnt);
-		}
-	}
-
-out_array:
-	au_farray_free(array, max);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
-	      int *do_refresh)
-{
-	int err, rerr;
-	aufs_bindex_t bindex;
-	struct dentry *root;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	root = sb->s_root;
-	bindex = au_find_dbindex(root, mod->h_root);
-	if (bindex < 0) {
-		if (remount)
-			return 0; /* success */
-		err = -ENOENT;
-		pr_err("%s no such branch\n", mod->path);
-		goto out;
-	}
-	AuDbg("bindex b%d\n", bindex);
-
-	err = test_br(d_inode(mod->h_root), mod->perm, mod->path);
-	if (unlikely(err))
-		goto out;
-
-	br = au_sbr(sb, bindex);
-	AuDebugOn(mod->h_root != au_br_dentry(br));
-	if (br->br_perm == mod->perm)
-		return 0; /* success */
-
-	/* pre-allocate for non-fhsm --> fhsm */
-	bf = NULL;
-	if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) {
-		err = au_fhsm_br_alloc(br);
-		if (unlikely(err))
-			goto out;
-		bf = br->br_fhsm;
-		br->br_fhsm = NULL;
-	}
-
-	if (au_br_writable(br->br_perm)) {
-		/* remove whiteout base */
-		err = au_br_init_wh(sb, br, mod->perm);
-		if (unlikely(err))
-			goto out_bf;
-
-		if (!au_br_writable(mod->perm)) {
-			/* rw --> ro, file might be mmapped */
-			DiMustNoWaiters(root);
-			IiMustNoWaiters(d_inode(root));
-			di_write_unlock(root);
-			err = au_br_mod_files_ro(sb, bindex);
-			/* aufs_write_lock() calls ..._child() */
-			di_write_lock_child(root);
-
-			if (unlikely(err)) {
-				rerr = -ENOMEM;
-				br->br_wbr = kmalloc(sizeof(*br->br_wbr),
-						     GFP_NOFS);
-				if (br->br_wbr)
-					rerr = au_wbr_init(br, sb, br->br_perm);
-				if (unlikely(rerr)) {
-					AuIOErr("nested error %d (%d)\n",
-						rerr, err);
-					br->br_perm = mod->perm;
-				}
-			}
-		}
-	} else if (au_br_writable(mod->perm)) {
-		/* ro --> rw */
-		err = -ENOMEM;
-		br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS);
-		if (br->br_wbr) {
-			err = au_wbr_init(br, sb, mod->perm);
-			if (unlikely(err)) {
-				kfree(br->br_wbr);
-				br->br_wbr = NULL;
-			}
-		}
-	}
-	if (unlikely(err))
-		goto out_bf;
-
-	if (au_br_fhsm(br->br_perm)) {
-		if (!au_br_fhsm(mod->perm)) {
-			/* fhsm --> non-fhsm */
-			au_br_fhsm_fin(br->br_fhsm);
-			kfree(br->br_fhsm);
-			br->br_fhsm = NULL;
-		}
-	} else if (au_br_fhsm(mod->perm))
-		/* non-fhsm --> fhsm */
-		br->br_fhsm = bf;
-
-	*do_refresh |= need_sigen_inc(br->br_perm, mod->perm);
-	br->br_perm = mod->perm;
-	goto out; /* success */
-
-out_bf:
-	kfree(bf);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs)
-{
-	int err;
-	struct kstatfs kstfs;
-
-	err = vfs_statfs(&br->br_path, &kstfs);
-	if (!err) {
-		stfs->f_blocks = kstfs.f_blocks;
-		stfs->f_bavail = kstfs.f_bavail;
-		stfs->f_files = kstfs.f_files;
-		stfs->f_ffree = kstfs.f_ffree;
-	}
-
-	return err;
-}
diff --git a/fs/aufs/branch.h b/fs/aufs/branch.h
deleted file mode 100644
index 3d026f57a..000000000
--- a/fs/aufs/branch.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * branch filesystems and xino for them
- */
-
-#ifndef __AUFS_BRANCH_H__
-#define __AUFS_BRANCH_H__
-
-#ifdef __KERNEL__
-
-#include <linux/mount.h>
-#include "dynop.h"
-#include "rwsem.h"
-#include "super.h"
-
-/* ---------------------------------------------------------------------- */
-
-/* a xino file */
-struct au_xino_file {
-	struct file		*xi_file;
-	struct mutex		xi_nondir_mtx;
-
-	/* todo: make xino files an array to support huge inode number */
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry		 *xi_dbgaufs;
-#endif
-};
-
-/* File-based Hierarchical Storage Management */
-struct au_br_fhsm {
-#ifdef CONFIG_AUFS_FHSM
-	struct mutex		bf_lock;
-	unsigned long		bf_jiffy;
-	struct aufs_stfs	bf_stfs;
-	int			bf_readable;
-#endif
-};
-
-/* members for writable branch only */
-enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last};
-struct au_wbr {
-	struct au_rwsem		wbr_wh_rwsem;
-	struct dentry		*wbr_wh[AuBrWh_Last];
-	atomic_t		wbr_wh_running;
-#define wbr_whbase		wbr_wh[AuBrWh_BASE]	/* whiteout base */
-#define wbr_plink		wbr_wh[AuBrWh_PLINK]	/* pseudo-link dir */
-#define wbr_orph		wbr_wh[AuBrWh_ORPH]	/* dir for orphans */
-
-	/* mfs mode */
-	unsigned long long	wbr_bytes;
-};
-
-/* ext2 has 3 types of operations at least, ext3 has 4 */
-#define AuBrDynOp (AuDyLast * 4)
-
-#ifdef CONFIG_AUFS_HFSNOTIFY
-/* support for asynchronous destruction */
-struct au_br_hfsnotify {
-	struct fsnotify_group	*hfsn_group;
-};
-#endif
-
-/* sysfs entries */
-struct au_brsysfs {
-	char			name[16];
-	struct attribute	attr;
-};
-
-enum {
-	AuBrSysfs_BR,
-	AuBrSysfs_BRID,
-	AuBrSysfs_Last
-};
-
-/* protected by superblock rwsem */
-struct au_branch {
-	struct au_xino_file	br_xino;
-
-	aufs_bindex_t		br_id;
-
-	int			br_perm;
-	struct path		br_path;
-	spinlock_t		br_dykey_lock;
-	struct au_dykey		*br_dykey[AuBrDynOp];
-	atomic_t		br_count;
-
-	struct au_wbr		*br_wbr;
-	struct au_br_fhsm	*br_fhsm;
-
-	/* xino truncation */
-	atomic_t		br_xino_running;
-
-#ifdef CONFIG_AUFS_HFSNOTIFY
-	struct au_br_hfsnotify	*br_hfsn;
-#endif
-
-#ifdef CONFIG_SYSFS
-	/* entries under sysfs per mount-point */
-	struct au_brsysfs	br_sysfs[AuBrSysfs_Last];
-#endif
-};
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct vfsmount *au_br_mnt(struct au_branch *br)
-{
-	return br->br_path.mnt;
-}
-
-static inline struct dentry *au_br_dentry(struct au_branch *br)
-{
-	return br->br_path.dentry;
-}
-
-static inline struct super_block *au_br_sb(struct au_branch *br)
-{
-	return au_br_mnt(br)->mnt_sb;
-}
-
-static inline int au_br_rdonly(struct au_branch *br)
-{
-	return ((au_br_sb(br)->s_flags & MS_RDONLY)
-		|| !au_br_writable(br->br_perm))
-		? -EROFS : 0;
-}
-
-static inline int au_br_hnotifyable(int brperm __maybe_unused)
-{
-#ifdef CONFIG_AUFS_HNOTIFY
-	return !(brperm & AuBrPerm_RR);
-#else
-	return 0;
-#endif
-}
-
-static inline int au_br_test_oflag(int oflag, struct au_branch *br)
-{
-	int err, exec_flag;
-
-	err = 0;
-	exec_flag = oflag & __FMODE_EXEC;
-	if (unlikely(exec_flag && (au_br_mnt(br)->mnt_flags & MNT_NOEXEC)))
-		err = -EACCES;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* branch.c */
-struct au_sbinfo;
-void au_br_free(struct au_sbinfo *sinfo);
-int au_br_index(struct super_block *sb, aufs_bindex_t br_id);
-struct au_opt_add;
-int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount);
-struct au_opt_del;
-int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount);
-long au_ibusy_ioctl(struct file *file, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long au_ibusy_compat_ioctl(struct file *file, unsigned long arg);
-#endif
-struct au_opt_mod;
-int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
-	      int *do_refresh);
-struct aufs_stfs;
-int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs);
-
-/* xino.c */
-static const loff_t au_loff_max = LLONG_MAX;
-
-int au_xib_trunc(struct super_block *sb);
-ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size,
-		   loff_t *pos);
-ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
-		    size_t size, loff_t *pos);
-struct file *au_xino_create2(struct file *base_file, struct file *copy_src);
-struct file *au_xino_create(struct super_block *sb, char *fname, int silent);
-ino_t au_xino_new_ino(struct super_block *sb);
-void au_xino_delete_inode(struct inode *inode, const int unlinked);
-int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		  ino_t ino);
-int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		 ino_t *ino);
-int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino,
-	       struct file *base_file, int do_test);
-int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex);
-
-struct au_opt_xino;
-int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount);
-void au_xino_clr(struct super_block *sb);
-struct file *au_xino_def(struct super_block *sb);
-int au_xino_path(struct seq_file *seq, struct file *file);
-
-/* ---------------------------------------------------------------------- */
-
-/* Superblock to branch */
-static inline
-aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_sbr(sb, bindex)->br_id;
-}
-
-static inline
-struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_br_mnt(au_sbr(sb, bindex));
-}
-
-static inline
-struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_br_sb(au_sbr(sb, bindex));
-}
-
-static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex)
-{
-	atomic_dec(&au_sbr(sb, bindex)->br_count);
-}
-
-static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_sbr(sb, bindex)->br_perm;
-}
-
-static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_br_whable(au_sbr_perm(sb, bindex));
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * wbr_wh_read_lock, wbr_wh_write_lock
- * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock
- */
-AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem);
-
-#define WbrWhMustNoWaiters(wbr)	AuRwMustNoWaiters(&wbr->wbr_wh_rwsem)
-#define WbrWhMustAnyLock(wbr)	AuRwMustAnyLock(&wbr->wbr_wh_rwsem)
-#define WbrWhMustWriteLock(wbr)	AuRwMustWriteLock(&wbr->wbr_wh_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_FHSM
-static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm)
-{
-	mutex_init(&brfhsm->bf_lock);
-	brfhsm->bf_jiffy = 0;
-	brfhsm->bf_readable = 0;
-}
-
-static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm)
-{
-	mutex_destroy(&brfhsm->bf_lock);
-}
-#else
-AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm)
-AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm)
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_BRANCH_H__ */
diff --git a/fs/aufs/conf.mk b/fs/aufs/conf.mk
deleted file mode 100644
index 0bbb2d3a5..000000000
--- a/fs/aufs/conf.mk
+++ /dev/null
@@ -1,38 +0,0 @@
-
-AuConfStr = CONFIG_AUFS_FS=${CONFIG_AUFS_FS}
-
-define AuConf
-ifdef ${1}
-AuConfStr += ${1}=${${1}}
-endif
-endef
-
-AuConfAll = BRANCH_MAX_127 BRANCH_MAX_511 BRANCH_MAX_1023 BRANCH_MAX_32767 \
-	SBILIST \
-	HNOTIFY HFSNOTIFY \
-	EXPORT INO_T_64 \
-	XATTR \
-	FHSM \
-	RDU \
-	SHWH \
-	BR_RAMFS \
-	BR_FUSE POLL \
-	BR_HFSPLUS \
-	BDEV_LOOP \
-	DEBUG MAGIC_SYSRQ
-$(foreach i, ${AuConfAll}, \
-	$(eval $(call AuConf,CONFIG_AUFS_${i})))
-
-AuConfName = ${obj}/conf.str
-${AuConfName}.tmp: FORCE
-	@echo ${AuConfStr} | tr ' ' '\n' | sed -e 's/^/"/' -e 's/$$/\\n"/' > $@
-${AuConfName}: ${AuConfName}.tmp
-	@diff -q $< $@ > /dev/null 2>&1 || { \
-	echo '  GEN    ' $@; \
-	cp -p $< $@; \
-	}
-FORCE:
-clean-files += ${AuConfName} ${AuConfName}.tmp
-${obj}/sysfs.o: ${AuConfName}
-
--include ${srctree}/${src}/conf_priv.mk
diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c
deleted file mode 100644
index 3ea177a5d..000000000
--- a/fs/aufs/cpup.c
+++ /dev/null
@@ -1,1319 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * copy-up functions, see wbr_policy.c for copy-down
- */
-
-#include <linux/fs_stack.h>
-#include <linux/mm.h>
-#include "aufs.h"
-
-void au_cpup_attr_flags(struct inode *dst, unsigned int iflags)
-{
-	const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE
-		| S_NOATIME | S_NOCMTIME | S_AUTOMOUNT;
-
-	BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags));
-
-	dst->i_flags |= iflags & ~mask;
-	if (au_test_fs_notime(dst->i_sb))
-		dst->i_flags |= S_NOATIME | S_NOCMTIME;
-}
-
-void au_cpup_attr_timesizes(struct inode *inode)
-{
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	fsstack_copy_attr_times(inode, h_inode);
-	fsstack_copy_inode_size(inode, h_inode);
-}
-
-void au_cpup_attr_nlink(struct inode *inode, int force)
-{
-	struct inode *h_inode;
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend;
-
-	sb = inode->i_sb;
-	bindex = au_ibstart(inode);
-	h_inode = au_h_iptr(inode, bindex);
-	if (!force
-	    && !S_ISDIR(h_inode->i_mode)
-	    && au_opt_test(au_mntflags(sb), PLINK)
-	    && au_plink_test(inode))
-		return;
-
-	/*
-	 * 0 can happen in revalidating.
-	 * h_inode->i_mutex may not be held here, but it is harmless since once
-	 * i_nlink reaches 0, it will never become positive except O_TMPFILE
-	 * case.
-	 * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause
-	 *	 the incorrect link count.
-	 */
-	set_nlink(inode, h_inode->i_nlink);
-
-	/*
-	 * fewer nlink makes find(1) noisy, but larger nlink doesn't.
-	 * it may includes whplink directory.
-	 */
-	if (S_ISDIR(h_inode->i_mode)) {
-		bend = au_ibend(inode);
-		for (bindex++; bindex <= bend; bindex++) {
-			h_inode = au_h_iptr(inode, bindex);
-			if (h_inode)
-				au_add_nlink(inode, h_inode);
-		}
-	}
-}
-
-void au_cpup_attr_changeable(struct inode *inode)
-{
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	inode->i_mode = h_inode->i_mode;
-	inode->i_uid = h_inode->i_uid;
-	inode->i_gid = h_inode->i_gid;
-	au_cpup_attr_timesizes(inode);
-	au_cpup_attr_flags(inode, h_inode->i_flags);
-}
-
-void au_cpup_igen(struct inode *inode, struct inode *h_inode)
-{
-	struct au_iinfo *iinfo = au_ii(inode);
-
-	IiMustWriteLock(inode);
-
-	iinfo->ii_higen = h_inode->i_generation;
-	iinfo->ii_hsb1 = h_inode->i_sb;
-}
-
-void au_cpup_attr_all(struct inode *inode, int force)
-{
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	au_cpup_attr_changeable(inode);
-	if (inode->i_nlink > 0)
-		au_cpup_attr_nlink(inode, force);
-	inode->i_rdev = h_inode->i_rdev;
-	inode->i_blkbits = h_inode->i_blkbits;
-	au_cpup_igen(inode, h_inode);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */
-
-/* keep the timestamps of the parent dir when cpup */
-void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
-		    struct path *h_path)
-{
-	struct inode *h_inode;
-
-	dt->dt_dentry = dentry;
-	dt->dt_h_path = *h_path;
-	h_inode = d_inode(h_path->dentry);
-	dt->dt_atime = h_inode->i_atime;
-	dt->dt_mtime = h_inode->i_mtime;
-	/* smp_mb(); */
-}
-
-void au_dtime_revert(struct au_dtime *dt)
-{
-	struct iattr attr;
-	int err;
-
-	attr.ia_atime = dt->dt_atime;
-	attr.ia_mtime = dt->dt_mtime;
-	attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET
-		| ATTR_ATIME | ATTR_ATIME_SET;
-
-	/* no delegation since this is a directory */
-	err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL);
-	if (unlikely(err))
-		pr_warn("restoring timestamps failed(%d). ignored\n", err);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* internal use only */
-struct au_cpup_reg_attr {
-	int		valid;
-	struct kstat	st;
-	unsigned int	iflags; /* inode->i_flags */
-};
-
-static noinline_for_stack
-int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src,
-	       struct au_cpup_reg_attr *h_src_attr)
-{
-	int err, sbits, icex;
-	unsigned int mnt_flags;
-	unsigned char verbose;
-	struct iattr ia;
-	struct path h_path;
-	struct inode *h_isrc, *h_idst;
-	struct kstat *h_st;
-	struct au_branch *br;
-
-	h_path.dentry = au_h_dptr(dst, bindex);
-	h_idst = d_inode(h_path.dentry);
-	br = au_sbr(dst->d_sb, bindex);
-	h_path.mnt = au_br_mnt(br);
-	h_isrc = d_inode(h_src);
-	ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID
-		| ATTR_ATIME | ATTR_MTIME
-		| ATTR_ATIME_SET | ATTR_MTIME_SET;
-	if (h_src_attr && h_src_attr->valid) {
-		h_st = &h_src_attr->st;
-		ia.ia_uid = h_st->uid;
-		ia.ia_gid = h_st->gid;
-		ia.ia_atime = h_st->atime;
-		ia.ia_mtime = h_st->mtime;
-		if (h_idst->i_mode != h_st->mode
-		    && !S_ISLNK(h_idst->i_mode)) {
-			ia.ia_valid |= ATTR_MODE;
-			ia.ia_mode = h_st->mode;
-		}
-		sbits = !!(h_st->mode & (S_ISUID | S_ISGID));
-		au_cpup_attr_flags(h_idst, h_src_attr->iflags);
-	} else {
-		ia.ia_uid = h_isrc->i_uid;
-		ia.ia_gid = h_isrc->i_gid;
-		ia.ia_atime = h_isrc->i_atime;
-		ia.ia_mtime = h_isrc->i_mtime;
-		if (h_idst->i_mode != h_isrc->i_mode
-		    && !S_ISLNK(h_idst->i_mode)) {
-			ia.ia_valid |= ATTR_MODE;
-			ia.ia_mode = h_isrc->i_mode;
-		}
-		sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID));
-		au_cpup_attr_flags(h_idst, h_isrc->i_flags);
-	}
-	/* no delegation since it is just created */
-	err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
-
-	/* is this nfs only? */
-	if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) {
-		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
-		ia.ia_mode = h_isrc->i_mode;
-		err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
-	}
-
-	icex = br->br_perm & AuBrAttr_ICEX;
-	if (!err) {
-		mnt_flags = au_mntflags(dst->d_sb);
-		verbose = !!au_opt_test(mnt_flags, VERBOSE);
-		err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_copy_file(struct file *dst, struct file *src, loff_t len,
-			   char *buf, unsigned long blksize)
-{
-	int err;
-	size_t sz, rbytes, wbytes;
-	unsigned char all_zero;
-	char *p, *zp;
-	struct mutex *h_mtx;
-	/* reduce stack usage */
-	struct iattr *ia;
-
-	zp = page_address(ZERO_PAGE(0));
-	if (unlikely(!zp))
-		return -ENOMEM; /* possible? */
-
-	err = 0;
-	all_zero = 0;
-	while (len) {
-		AuDbg("len %lld\n", len);
-		sz = blksize;
-		if (len < blksize)
-			sz = len;
-
-		rbytes = 0;
-		/* todo: signal_pending? */
-		while (!rbytes || err == -EAGAIN || err == -EINTR) {
-			rbytes = vfsub_read_k(src, buf, sz, &src->f_pos);
-			err = rbytes;
-		}
-		if (unlikely(err < 0))
-			break;
-
-		all_zero = 0;
-		if (len >= rbytes && rbytes == blksize)
-			all_zero = !memcmp(buf, zp, rbytes);
-		if (!all_zero) {
-			wbytes = rbytes;
-			p = buf;
-			while (wbytes) {
-				size_t b;
-
-				b = vfsub_write_k(dst, p, wbytes, &dst->f_pos);
-				err = b;
-				/* todo: signal_pending? */
-				if (unlikely(err == -EAGAIN || err == -EINTR))
-					continue;
-				if (unlikely(err < 0))
-					break;
-				wbytes -= b;
-				p += b;
-			}
-			if (unlikely(err < 0))
-				break;
-		} else {
-			loff_t res;
-
-			AuLabel(hole);
-			res = vfsub_llseek(dst, rbytes, SEEK_CUR);
-			err = res;
-			if (unlikely(res < 0))
-				break;
-		}
-		len -= rbytes;
-		err = 0;
-	}
-
-	/* the last block may be a hole */
-	if (!err && all_zero) {
-		AuLabel(last hole);
-
-		err = 1;
-		if (au_test_nfs(dst->f_path.dentry->d_sb)) {
-			/* nfs requires this step to make last hole */
-			/* is this only nfs? */
-			do {
-				/* todo: signal_pending? */
-				err = vfsub_write_k(dst, "\0", 1, &dst->f_pos);
-			} while (err == -EAGAIN || err == -EINTR);
-			if (err == 1)
-				dst->f_pos--;
-		}
-
-		if (err == 1) {
-			ia = (void *)buf;
-			ia->ia_size = dst->f_pos;
-			ia->ia_valid = ATTR_SIZE | ATTR_FILE;
-			ia->ia_file = dst;
-			h_mtx = &file_inode(dst)->i_mutex;
-			mutex_lock_nested(h_mtx, AuLsc_I_CHILD2);
-			/* no delegation since it is just created */
-			err = vfsub_notify_change(&dst->f_path, ia,
-						  /*delegated*/NULL);
-			mutex_unlock(h_mtx);
-		}
-	}
-
-	return err;
-}
-
-int au_copy_file(struct file *dst, struct file *src, loff_t len)
-{
-	int err;
-	unsigned long blksize;
-	unsigned char do_kfree;
-	char *buf;
-
-	err = -ENOMEM;
-	blksize = dst->f_path.dentry->d_sb->s_blocksize;
-	if (!blksize || PAGE_SIZE < blksize)
-		blksize = PAGE_SIZE;
-	AuDbg("blksize %lu\n", blksize);
-	do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *));
-	if (do_kfree)
-		buf = kmalloc(blksize, GFP_NOFS);
-	else
-		buf = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!buf))
-		goto out;
-
-	if (len > (1 << 22))
-		AuDbg("copying a large file %lld\n", (long long)len);
-
-	src->f_pos = 0;
-	dst->f_pos = 0;
-	err = au_do_copy_file(dst, src, len, buf, blksize);
-	if (do_kfree)
-		kfree(buf);
-	else
-		free_page((unsigned long)buf);
-
-out:
-	return err;
-}
-
-/*
- * to support a sparse file which is opened with O_APPEND,
- * we need to close the file.
- */
-static int au_cp_regular(struct au_cp_generic *cpg)
-{
-	int err, i;
-	enum { SRC, DST };
-	struct {
-		aufs_bindex_t bindex;
-		unsigned int flags;
-		struct dentry *dentry;
-		int force_wr;
-		struct file *file;
-		void *label;
-	} *f, file[] = {
-		{
-			.bindex = cpg->bsrc,
-			.flags = O_RDONLY | O_NOATIME | O_LARGEFILE,
-			.label = &&out
-		},
-		{
-			.bindex = cpg->bdst,
-			.flags = O_WRONLY | O_NOATIME | O_LARGEFILE,
-			.force_wr = !!au_ftest_cpup(cpg->flags, RWDST),
-			.label = &&out_src
-		}
-	};
-	struct super_block *sb;
-
-	/* bsrc branch can be ro/rw. */
-	sb = cpg->dentry->d_sb;
-	f = file;
-	for (i = 0; i < 2; i++, f++) {
-		f->dentry = au_h_dptr(cpg->dentry, f->bindex);
-		f->file = au_h_open(cpg->dentry, f->bindex, f->flags,
-				    /*file*/NULL, f->force_wr);
-		err = PTR_ERR(f->file);
-		if (IS_ERR(f->file))
-			goto *f->label;
-	}
-
-	/* try stopping to update while we copyup */
-	IMustLock(d_inode(file[SRC].dentry));
-	err = au_copy_file(file[DST].file, file[SRC].file, cpg->len);
-
-	fput(file[DST].file);
-	au_sbr_put(sb, file[DST].bindex);
-
-out_src:
-	fput(file[SRC].file);
-	au_sbr_put(sb, file[SRC].bindex);
-out:
-	return err;
-}
-
-static int au_do_cpup_regular(struct au_cp_generic *cpg,
-			      struct au_cpup_reg_attr *h_src_attr)
-{
-	int err, rerr;
-	loff_t l;
-	struct path h_path;
-	struct inode *h_src_inode, *h_dst_inode;
-
-	err = 0;
-	h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc);
-	l = i_size_read(h_src_inode);
-	if (cpg->len == -1 || l < cpg->len)
-		cpg->len = l;
-	if (cpg->len) {
-		/* try stopping to update while we are referencing */
-		mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD);
-		au_pin_hdir_unlock(cpg->pin);
-
-		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
-		h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc);
-		h_src_attr->iflags = h_src_inode->i_flags;
-		if (!au_test_nfs(h_src_inode->i_sb))
-			err = vfs_getattr(&h_path, &h_src_attr->st);
-		else {
-			mutex_unlock(&h_src_inode->i_mutex);
-			err = vfs_getattr(&h_path, &h_src_attr->st);
-			mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD);
-		}
-		if (unlikely(err)) {
-			mutex_unlock(&h_src_inode->i_mutex);
-			goto out;
-		}
-		h_src_attr->valid = 1;
-		err = au_cp_regular(cpg);
-		mutex_unlock(&h_src_inode->i_mutex);
-		rerr = au_pin_hdir_relock(cpg->pin);
-		if (!err && rerr)
-			err = rerr;
-	}
-	if (!err && (h_src_inode->i_state & I_LINKABLE)) {
-		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst);
-		h_dst_inode = d_inode(h_path.dentry);
-		spin_lock(&h_dst_inode->i_lock);
-		h_dst_inode->i_state |= I_LINKABLE;
-		spin_unlock(&h_dst_inode->i_lock);
-	}
-
-out:
-	return err;
-}
-
-static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src,
-			      struct inode *h_dir)
-{
-	int err, symlen;
-	mm_segment_t old_fs;
-	union {
-		char *k;
-		char __user *u;
-	} sym;
-	struct inode *h_inode = d_inode(h_src);
-	const struct inode_operations *h_iop = h_inode->i_op;
-
-	err = -ENOSYS;
-	if (unlikely(!h_iop->readlink))
-		goto out;
-
-	err = -ENOMEM;
-	sym.k = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!sym.k))
-		goto out;
-
-	/* unnecessary to support mmap_sem since symlink is not mmap-able */
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	symlen = h_iop->readlink(h_src, sym.u, PATH_MAX);
-	err = symlen;
-	set_fs(old_fs);
-
-	if (symlen > 0) {
-		sym.k[symlen] = 0;
-		err = vfsub_symlink(h_dir, h_path, sym.k);
-	}
-	free_page((unsigned long)sym.k);
-
-out:
-	return err;
-}
-
-static noinline_for_stack
-int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent,
-	       struct au_cpup_reg_attr *h_src_attr)
-{
-	int err;
-	umode_t mode;
-	unsigned int mnt_flags;
-	unsigned char isdir, isreg, force;
-	const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME);
-	struct au_dtime dt;
-	struct path h_path;
-	struct dentry *h_src, *h_dst, *h_parent;
-	struct inode *h_inode, *h_dir, *dir, *inode;
-	struct super_block *sb;
-
-	/* bsrc branch can be ro/rw. */
-	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
-	h_inode = d_inode(h_src);
-	AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc));
-
-	/* try stopping to be referenced while we are creating */
-	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
-	if (au_ftest_cpup(cpg->flags, RENAME))
-		AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX,
-				  AUFS_WH_PFX_LEN));
-	h_parent = h_dst->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-	AuDebugOn(h_parent != h_dst->d_parent);
-
-	sb = cpg->dentry->d_sb;
-	h_path.mnt = au_sbr_mnt(sb, cpg->bdst);
-	if (do_dt) {
-		h_path.dentry = h_parent;
-		au_dtime_store(&dt, dst_parent, &h_path);
-	}
-	h_path.dentry = h_dst;
-
-	isreg = 0;
-	isdir = 0;
-	mode = h_inode->i_mode;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		isreg = 1;
-		err = vfsub_create(h_dir, &h_path, mode | S_IWUSR,
-				   /*want_excl*/true);
-		if (!err)
-			err = au_do_cpup_regular(cpg, h_src_attr);
-		break;
-	case S_IFDIR:
-		isdir = 1;
-		err = vfsub_mkdir(h_dir, &h_path, mode);
-		if (!err) {
-			/*
-			 * strange behaviour from the users view,
-			 * particularry setattr case
-			 */
-			dir = d_inode(dst_parent);
-			if (au_ibstart(dir) == cpg->bdst)
-				au_cpup_attr_nlink(dir, /*force*/1);
-			inode = d_inode(cpg->dentry);
-			au_cpup_attr_nlink(inode, /*force*/1);
-		}
-		break;
-	case S_IFLNK:
-		err = au_do_cpup_symlink(&h_path, h_src, h_dir);
-		break;
-	case S_IFCHR:
-	case S_IFBLK:
-		AuDebugOn(!capable(CAP_MKNOD));
-		/*FALLTHROUGH*/
-	case S_IFIFO:
-	case S_IFSOCK:
-		err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev);
-		break;
-	default:
-		AuIOErr("Unknown inode type 0%o\n", mode);
-		err = -EIO;
-	}
-
-	mnt_flags = au_mntflags(sb);
-	if (!au_opt_test(mnt_flags, UDBA_NONE)
-	    && !isdir
-	    && au_opt_test(mnt_flags, XINO)
-	    && (h_inode->i_nlink == 1
-		|| (h_inode->i_state & I_LINKABLE))
-	    /* todo: unnecessary? */
-	    /* && d_inode(cpg->dentry)->i_nlink == 1 */
-	    && cpg->bdst < cpg->bsrc
-	    && !au_ftest_cpup(cpg->flags, KEEPLINO))
-		au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0);
-		/* ignore this error */
-
-	if (!err) {
-		force = 0;
-		if (isreg) {
-			force = !!cpg->len;
-			if (cpg->len == -1)
-				force = !!i_size_read(h_inode);
-		}
-		au_fhsm_wrote(sb, cpg->bdst, force);
-	}
-
-	if (do_dt)
-		au_dtime_revert(&dt);
-	return err;
-}
-
-static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path)
-{
-	int err;
-	struct dentry *dentry, *h_dentry, *h_parent, *parent;
-	struct inode *h_dir;
-	aufs_bindex_t bdst;
-
-	dentry = cpg->dentry;
-	bdst = cpg->bdst;
-	h_dentry = au_h_dptr(dentry, bdst);
-	if (!au_ftest_cpup(cpg->flags, OVERWRITE)) {
-		dget(h_dentry);
-		au_set_h_dptr(dentry, bdst, NULL);
-		err = au_lkup_neg(dentry, bdst, /*wh*/0);
-		if (!err)
-			h_path->dentry = dget(au_h_dptr(dentry, bdst));
-		au_set_h_dptr(dentry, bdst, h_dentry);
-	} else {
-		err = 0;
-		parent = dget_parent(dentry);
-		h_parent = au_h_dptr(parent, bdst);
-		dput(parent);
-		h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
-		if (IS_ERR(h_path->dentry))
-			err = PTR_ERR(h_path->dentry);
-	}
-	if (unlikely(err))
-		goto out;
-
-	h_parent = h_dentry->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-	AuDbg("%pd %pd\n", h_dentry, h_path->dentry);
-	/* no delegation since it is just created */
-	err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL);
-	dput(h_path->dentry);
-
-out:
-	return err;
-}
-
-/*
- * copyup the @dentry from @bsrc to @bdst.
- * the caller must set the both of lower dentries.
- * @len is for truncating when it is -1 copyup the entire file.
- * in link/rename cases, @dst_parent may be different from the real one.
- * basic->bsrc can be larger than basic->bdst.
- */
-static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
-{
-	int err, rerr;
-	aufs_bindex_t old_ibstart;
-	unsigned char isdir, plink;
-	struct dentry *h_src, *h_dst, *h_parent;
-	struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode;
-	struct super_block *sb;
-	struct au_branch *br;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct path h_path;
-		struct au_cpup_reg_attr h_src_attr;
-	} *a;
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-	a->h_src_attr.valid = 0;
-
-	sb = cpg->dentry->d_sb;
-	br = au_sbr(sb, cpg->bdst);
-	a->h_path.mnt = au_br_mnt(br);
-	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
-	h_parent = h_dst->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-
-	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
-	inode = d_inode(cpg->dentry);
-
-	if (!dst_parent)
-		dst_parent = dget_parent(cpg->dentry);
-	else
-		dget(dst_parent);
-
-	plink = !!au_opt_test(au_mntflags(sb), PLINK);
-	dst_inode = au_h_iptr(inode, cpg->bdst);
-	if (dst_inode) {
-		if (unlikely(!plink)) {
-			err = -EIO;
-			AuIOErr("hi%lu(i%lu) exists on b%d "
-				"but plink is disabled\n",
-				dst_inode->i_ino, inode->i_ino, cpg->bdst);
-			goto out_parent;
-		}
-
-		if (dst_inode->i_nlink) {
-			const int do_dt = au_ftest_cpup(cpg->flags, DTIME);
-
-			h_src = au_plink_lkup(inode, cpg->bdst);
-			err = PTR_ERR(h_src);
-			if (IS_ERR(h_src))
-				goto out_parent;
-			if (unlikely(d_is_negative(h_src))) {
-				err = -EIO;
-				AuIOErr("i%lu exists on a upper branch "
-					"but not pseudo-linked\n",
-					inode->i_ino);
-				dput(h_src);
-				goto out_parent;
-			}
-
-			if (do_dt) {
-				a->h_path.dentry = h_parent;
-				au_dtime_store(&a->dt, dst_parent, &a->h_path);
-			}
-
-			a->h_path.dentry = h_dst;
-			delegated = NULL;
-			err = vfsub_link(h_src, h_dir, &a->h_path, &delegated);
-			if (!err && au_ftest_cpup(cpg->flags, RENAME))
-				err = au_do_ren_after_cpup(cpg, &a->h_path);
-			if (do_dt)
-				au_dtime_revert(&a->dt);
-			if (unlikely(err == -EWOULDBLOCK)) {
-				pr_warn("cannot retry for NFSv4 delegation"
-					" for an internal link\n");
-				iput(delegated);
-			}
-			dput(h_src);
-			goto out_parent;
-		} else
-			/* todo: cpup_wh_file? */
-			/* udba work */
-			au_update_ibrange(inode, /*do_put_zero*/1);
-	}
-
-	isdir = S_ISDIR(inode->i_mode);
-	old_ibstart = au_ibstart(inode);
-	err = cpup_entry(cpg, dst_parent, &a->h_src_attr);
-	if (unlikely(err))
-		goto out_rev;
-	dst_inode = d_inode(h_dst);
-	mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2);
-	/* todo: necessary? */
-	/* au_pin_hdir_unlock(cpg->pin); */
-
-	err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr);
-	if (unlikely(err)) {
-		/* todo: necessary? */
-		/* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */
-		mutex_unlock(&dst_inode->i_mutex);
-		goto out_rev;
-	}
-
-	if (cpg->bdst < old_ibstart) {
-		if (S_ISREG(inode->i_mode)) {
-			err = au_dy_iaop(inode, cpg->bdst, dst_inode);
-			if (unlikely(err)) {
-				/* ignore an error */
-				/* au_pin_hdir_relock(cpg->pin); */
-				mutex_unlock(&dst_inode->i_mutex);
-				goto out_rev;
-			}
-		}
-		au_set_ibstart(inode, cpg->bdst);
-	} else
-		au_set_ibend(inode, cpg->bdst);
-	au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode),
-		      au_hi_flags(inode, isdir));
-
-	/* todo: necessary? */
-	/* err = au_pin_hdir_relock(cpg->pin); */
-	mutex_unlock(&dst_inode->i_mutex);
-	if (unlikely(err))
-		goto out_rev;
-
-	src_inode = d_inode(h_src);
-	if (!isdir
-	    && (src_inode->i_nlink > 1
-		|| src_inode->i_state & I_LINKABLE)
-	    && plink)
-		au_plink_append(inode, cpg->bdst, h_dst);
-
-	if (au_ftest_cpup(cpg->flags, RENAME)) {
-		a->h_path.dentry = h_dst;
-		err = au_do_ren_after_cpup(cpg, &a->h_path);
-	}
-	if (!err)
-		goto out_parent; /* success */
-
-	/* revert */
-out_rev:
-	a->h_path.dentry = h_parent;
-	au_dtime_store(&a->dt, dst_parent, &a->h_path);
-	a->h_path.dentry = h_dst;
-	rerr = 0;
-	if (d_is_positive(h_dst)) {
-		if (!isdir) {
-			/* no delegation since it is just created */
-			rerr = vfsub_unlink(h_dir, &a->h_path,
-					    /*delegated*/NULL, /*force*/0);
-		} else
-			rerr = vfsub_rmdir(h_dir, &a->h_path);
-	}
-	au_dtime_revert(&a->dt);
-	if (rerr) {
-		AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr);
-		err = -EIO;
-	}
-out_parent:
-	dput(dst_parent);
-	kfree(a);
-out:
-	return err;
-}
-
-#if 0 /* reserved */
-struct au_cpup_single_args {
-	int *errp;
-	struct au_cp_generic *cpg;
-	struct dentry *dst_parent;
-};
-
-static void au_call_cpup_single(void *args)
-{
-	struct au_cpup_single_args *a = args;
-
-	au_pin_hdir_acquire_nest(a->cpg->pin);
-	*a->errp = au_cpup_single(a->cpg, a->dst_parent);
-	au_pin_hdir_release(a->cpg->pin);
-}
-#endif
-
-/*
- * prevent SIGXFSZ in copy-up.
- * testing CAP_MKNOD is for generic fs,
- * but CAP_FSETID is for xfs only, currently.
- */
-static int au_cpup_sio_test(struct au_pin *pin, umode_t mode)
-{
-	int do_sio;
-	struct super_block *sb;
-	struct inode *h_dir;
-
-	do_sio = 0;
-	sb = au_pinned_parent(pin)->d_sb;
-	if (!au_wkq_test()
-	    && (!au_sbi(sb)->si_plink_maint_pid
-		|| au_plink_maint(sb, AuLock_NOPLM))) {
-		switch (mode & S_IFMT) {
-		case S_IFREG:
-			/* no condition about RLIMIT_FSIZE and the file size */
-			do_sio = 1;
-			break;
-		case S_IFCHR:
-		case S_IFBLK:
-			do_sio = !capable(CAP_MKNOD);
-			break;
-		}
-		if (!do_sio)
-			do_sio = ((mode & (S_ISUID | S_ISGID))
-				  && !capable(CAP_FSETID));
-		/* this workaround may be removed in the future */
-		if (!do_sio) {
-			h_dir = au_pinned_h_dir(pin);
-			do_sio = h_dir->i_mode & S_ISVTX;
-		}
-	}
-
-	return do_sio;
-}
-
-#if 0 /* reserved */
-int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
-{
-	int err, wkq_err;
-	struct dentry *h_dentry;
-
-	h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
-	if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode))
-		err = au_cpup_single(cpg, dst_parent);
-	else {
-		struct au_cpup_single_args args = {
-			.errp		= &err,
-			.cpg		= cpg,
-			.dst_parent	= dst_parent
-		};
-		wkq_err = au_wkq_wait(au_call_cpup_single, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-#endif
-
-/*
- * copyup the @dentry from the first active lower branch to @bdst,
- * using au_cpup_single().
- */
-static int au_cpup_simple(struct au_cp_generic *cpg)
-{
-	int err;
-	unsigned int flags_orig;
-	struct dentry *dentry;
-
-	AuDebugOn(cpg->bsrc < 0);
-
-	dentry = cpg->dentry;
-	DiMustWriteLock(dentry);
-
-	err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1);
-	if (!err) {
-		flags_orig = cpg->flags;
-		au_fset_cpup(cpg->flags, RENAME);
-		err = au_cpup_single(cpg, NULL);
-		cpg->flags = flags_orig;
-		if (!err)
-			return 0; /* success */
-
-		/* revert */
-		au_set_h_dptr(dentry, cpg->bdst, NULL);
-		au_set_dbstart(dentry, cpg->bsrc);
-	}
-
-	return err;
-}
-
-struct au_cpup_simple_args {
-	int *errp;
-	struct au_cp_generic *cpg;
-};
-
-static void au_call_cpup_simple(void *args)
-{
-	struct au_cpup_simple_args *a = args;
-
-	au_pin_hdir_acquire_nest(a->cpg->pin);
-	*a->errp = au_cpup_simple(a->cpg);
-	au_pin_hdir_release(a->cpg->pin);
-}
-
-static int au_do_sio_cpup_simple(struct au_cp_generic *cpg)
-{
-	int err, wkq_err;
-	struct dentry *dentry, *parent;
-	struct file *h_file;
-	struct inode *h_dir;
-
-	dentry = cpg->dentry;
-	h_file = NULL;
-	if (au_ftest_cpup(cpg->flags, HOPEN)) {
-		AuDebugOn(cpg->bsrc < 0);
-		h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0);
-		err = PTR_ERR(h_file);
-		if (IS_ERR(h_file))
-			goto out;
-	}
-
-	parent = dget_parent(dentry);
-	h_dir = au_h_iptr(d_inode(parent), cpg->bdst);
-	if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE)
-	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
-		err = au_cpup_simple(cpg);
-	else {
-		struct au_cpup_simple_args args = {
-			.errp		= &err,
-			.cpg		= cpg
-		};
-		wkq_err = au_wkq_wait(au_call_cpup_simple, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	dput(parent);
-	if (h_file)
-		au_h_open_post(dentry, cpg->bsrc, h_file);
-
-out:
-	return err;
-}
-
-int au_sio_cpup_simple(struct au_cp_generic *cpg)
-{
-	aufs_bindex_t bsrc, bend;
-	struct dentry *dentry, *h_dentry;
-
-	if (cpg->bsrc < 0) {
-		dentry = cpg->dentry;
-		bend = au_dbend(dentry);
-		for (bsrc = cpg->bdst + 1; bsrc <= bend; bsrc++) {
-			h_dentry = au_h_dptr(dentry, bsrc);
-			if (h_dentry) {
-				AuDebugOn(d_is_negative(h_dentry));
-				break;
-			}
-		}
-		AuDebugOn(bsrc > bend);
-		cpg->bsrc = bsrc;
-	}
-	AuDebugOn(cpg->bsrc <= cpg->bdst);
-	return au_do_sio_cpup_simple(cpg);
-}
-
-int au_sio_cpdown_simple(struct au_cp_generic *cpg)
-{
-	AuDebugOn(cpg->bdst <= cpg->bsrc);
-	return au_do_sio_cpup_simple(cpg);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * copyup the deleted file for writing.
- */
-static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry,
-			 struct file *file)
-{
-	int err;
-	unsigned int flags_orig;
-	aufs_bindex_t bsrc_orig;
-	struct dentry *h_d_dst, *h_d_start;
-	struct au_dinfo *dinfo;
-	struct au_hdentry *hdp;
-
-	dinfo = au_di(cpg->dentry);
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	bsrc_orig = cpg->bsrc;
-	cpg->bsrc = dinfo->di_bstart;
-	hdp = dinfo->di_hdentry;
-	h_d_dst = hdp[0 + cpg->bdst].hd_dentry;
-	dinfo->di_bstart = cpg->bdst;
-	hdp[0 + cpg->bdst].hd_dentry = wh_dentry;
-	h_d_start = NULL;
-	if (file) {
-		h_d_start = hdp[0 + cpg->bsrc].hd_dentry;
-		hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry;
-	}
-	flags_orig = cpg->flags;
-	cpg->flags = !AuCpup_DTIME;
-	err = au_cpup_single(cpg, /*h_parent*/NULL);
-	cpg->flags = flags_orig;
-	if (file) {
-		if (!err)
-			err = au_reopen_nondir(file);
-		hdp[0 + cpg->bsrc].hd_dentry = h_d_start;
-	}
-	hdp[0 + cpg->bdst].hd_dentry = h_d_dst;
-	dinfo->di_bstart = cpg->bsrc;
-	cpg->bsrc = bsrc_orig;
-
-	return err;
-}
-
-static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file)
-{
-	int err;
-	aufs_bindex_t bdst;
-	struct au_dtime dt;
-	struct dentry *dentry, *parent, *h_parent, *wh_dentry;
-	struct au_branch *br;
-	struct path h_path;
-
-	dentry = cpg->dentry;
-	bdst = cpg->bdst;
-	br = au_sbr(dentry->d_sb, bdst);
-	parent = dget_parent(dentry);
-	h_parent = au_h_dptr(parent, bdst);
-	wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out;
-
-	h_path.dentry = h_parent;
-	h_path.mnt = au_br_mnt(br);
-	au_dtime_store(&dt, parent, &h_path);
-	err = au_do_cpup_wh(cpg, wh_dentry, file);
-	if (unlikely(err))
-		goto out_wh;
-
-	dget(wh_dentry);
-	h_path.dentry = wh_dentry;
-	if (!d_is_dir(wh_dentry)) {
-		/* no delegation since it is just created */
-		err = vfsub_unlink(d_inode(h_parent), &h_path,
-				   /*delegated*/NULL, /*force*/0);
-	} else
-		err = vfsub_rmdir(d_inode(h_parent), &h_path);
-	if (unlikely(err)) {
-		AuIOErr("failed remove copied-up tmp file %pd(%d)\n",
-			wh_dentry, err);
-		err = -EIO;
-	}
-	au_dtime_revert(&dt);
-	au_set_hi_wh(d_inode(dentry), bdst, wh_dentry);
-
-out_wh:
-	dput(wh_dentry);
-out:
-	dput(parent);
-	return err;
-}
-
-struct au_cpup_wh_args {
-	int *errp;
-	struct au_cp_generic *cpg;
-	struct file *file;
-};
-
-static void au_call_cpup_wh(void *args)
-{
-	struct au_cpup_wh_args *a = args;
-
-	au_pin_hdir_acquire_nest(a->cpg->pin);
-	*a->errp = au_cpup_wh(a->cpg, a->file);
-	au_pin_hdir_release(a->cpg->pin);
-}
-
-int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file)
-{
-	int err, wkq_err;
-	aufs_bindex_t bdst;
-	struct dentry *dentry, *parent, *h_orph, *h_parent;
-	struct inode *dir, *h_dir, *h_tmpdir;
-	struct au_wbr *wbr;
-	struct au_pin wh_pin, *pin_orig;
-
-	dentry = cpg->dentry;
-	bdst = cpg->bdst;
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	h_orph = NULL;
-	h_parent = NULL;
-	h_dir = au_igrab(au_h_iptr(dir, bdst));
-	h_tmpdir = h_dir;
-	pin_orig = NULL;
-	if (!h_dir->i_nlink) {
-		wbr = au_sbr(dentry->d_sb, bdst)->br_wbr;
-		h_orph = wbr->wbr_orph;
-
-		h_parent = dget(au_h_dptr(parent, bdst));
-		au_set_h_dptr(parent, bdst, dget(h_orph));
-		h_tmpdir = d_inode(h_orph);
-		au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0);
-
-		mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3);
-		/* todo: au_h_open_pre()? */
-
-		pin_orig = cpg->pin;
-		au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT,
-			    AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED);
-		cpg->pin = &wh_pin;
-	}
-
-	if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE)
-	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
-		err = au_cpup_wh(cpg, file);
-	else {
-		struct au_cpup_wh_args args = {
-			.errp	= &err,
-			.cpg	= cpg,
-			.file	= file
-		};
-		wkq_err = au_wkq_wait(au_call_cpup_wh, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	if (h_orph) {
-		mutex_unlock(&h_tmpdir->i_mutex);
-		/* todo: au_h_open_post()? */
-		au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0);
-		au_set_h_dptr(parent, bdst, h_parent);
-		AuDebugOn(!pin_orig);
-		cpg->pin = pin_orig;
-	}
-	iput(h_dir);
-	dput(parent);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * generic routine for both of copy-up and copy-down.
- */
-/* cf. revalidate function in file.c */
-int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
-	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
-			 struct au_pin *pin,
-			 struct dentry *h_parent, void *arg),
-	       void *arg)
-{
-	int err;
-	struct au_pin pin;
-	struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry;
-
-	err = 0;
-	parent = dget_parent(dentry);
-	if (IS_ROOT(parent))
-		goto out;
-
-	au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2,
-		    au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE);
-
-	/* do not use au_dpage */
-	real_parent = parent;
-	while (1) {
-		dput(parent);
-		parent = dget_parent(dentry);
-		h_parent = au_h_dptr(parent, bdst);
-		if (h_parent)
-			goto out; /* success */
-
-		/* find top dir which is necessary to cpup */
-		do {
-			d = parent;
-			dput(parent);
-			parent = dget_parent(d);
-			di_read_lock_parent3(parent, !AuLock_IR);
-			h_parent = au_h_dptr(parent, bdst);
-			di_read_unlock(parent, !AuLock_IR);
-		} while (!h_parent);
-
-		if (d != real_parent)
-			di_write_lock_child3(d);
-
-		/* somebody else might create while we were sleeping */
-		h_dentry = au_h_dptr(d, bdst);
-		if (!h_dentry || d_is_negative(h_dentry)) {
-			if (h_dentry)
-				au_update_dbstart(d);
-
-			au_pin_set_dentry(&pin, d);
-			err = au_do_pin(&pin);
-			if (!err) {
-				err = cp(d, bdst, &pin, h_parent, arg);
-				au_unpin(&pin);
-			}
-		}
-
-		if (d != real_parent)
-			di_write_unlock(d);
-		if (unlikely(err))
-			break;
-	}
-
-out:
-	dput(parent);
-	return err;
-}
-
-static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst,
-		       struct au_pin *pin,
-		       struct dentry *h_parent __maybe_unused,
-		       void *arg __maybe_unused)
-{
-	struct au_cp_generic cpg = {
-		.dentry	= dentry,
-		.bdst	= bdst,
-		.bsrc	= -1,
-		.len	= 0,
-		.pin	= pin,
-		.flags	= AuCpup_DTIME
-	};
-	return au_sio_cpup_simple(&cpg);
-}
-
-int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
-{
-	return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL);
-}
-
-int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
-{
-	int err;
-	struct dentry *parent;
-	struct inode *dir;
-
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	err = 0;
-	if (au_h_iptr(dir, bdst))
-		goto out;
-
-	di_read_unlock(parent, AuLock_IR);
-	di_write_lock_parent(parent);
-	/* someone else might change our inode while we were sleeping */
-	if (!au_h_iptr(dir, bdst))
-		err = au_cpup_dirs(dentry, bdst);
-	di_downgrade_lock(parent, AuLock_IR);
-
-out:
-	dput(parent);
-	return err;
-}
diff --git a/fs/aufs/cpup.h b/fs/aufs/cpup.h
deleted file mode 100644
index a5da7dbe9..000000000
--- a/fs/aufs/cpup.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * copy-up/down functions
- */
-
-#ifndef __AUFS_CPUP_H__
-#define __AUFS_CPUP_H__
-
-#ifdef __KERNEL__
-
-#include <linux/path.h>
-
-struct inode;
-struct file;
-struct au_pin;
-
-void au_cpup_attr_flags(struct inode *dst, unsigned int iflags);
-void au_cpup_attr_timesizes(struct inode *inode);
-void au_cpup_attr_nlink(struct inode *inode, int force);
-void au_cpup_attr_changeable(struct inode *inode);
-void au_cpup_igen(struct inode *inode, struct inode *h_inode);
-void au_cpup_attr_all(struct inode *inode, int force);
-
-/* ---------------------------------------------------------------------- */
-
-struct au_cp_generic {
-	struct dentry	*dentry;
-	aufs_bindex_t	bdst, bsrc;
-	loff_t		len;
-	struct au_pin	*pin;
-	unsigned int	flags;
-};
-
-/* cpup flags */
-#define AuCpup_DTIME		1		/* do dtime_store/revert */
-#define AuCpup_KEEPLINO		(1 << 1)	/* do not clear the lower xino,
-						   for link(2) */
-#define AuCpup_RENAME		(1 << 2)	/* rename after cpup */
-#define AuCpup_HOPEN		(1 << 3)	/* call h_open_pre/post() in
-						   cpup */
-#define AuCpup_OVERWRITE	(1 << 4)	/* allow overwriting the
-						   existing entry */
-#define AuCpup_RWDST		(1 << 5)	/* force write target even if
-						   the branch is marked as RO */
-
-#define au_ftest_cpup(flags, name)	((flags) & AuCpup_##name)
-#define au_fset_cpup(flags, name) \
-	do { (flags) |= AuCpup_##name; } while (0)
-#define au_fclr_cpup(flags, name) \
-	do { (flags) &= ~AuCpup_##name; } while (0)
-
-int au_copy_file(struct file *dst, struct file *src, loff_t len);
-int au_sio_cpup_simple(struct au_cp_generic *cpg);
-int au_sio_cpdown_simple(struct au_cp_generic *cpg);
-int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file);
-
-int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
-	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
-			 struct au_pin *pin,
-			 struct dentry *h_parent, void *arg),
-	       void *arg);
-int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
-int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
-
-/* ---------------------------------------------------------------------- */
-
-/* keep timestamps when copyup */
-struct au_dtime {
-	struct dentry *dt_dentry;
-	struct path dt_h_path;
-	struct timespec dt_atime, dt_mtime;
-};
-void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
-		    struct path *h_path);
-void au_dtime_revert(struct au_dtime *dt);
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_CPUP_H__ */
diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c
deleted file mode 100644
index df4c65635..000000000
--- a/fs/aufs/dbgaufs.c
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * debugfs interface
- */
-
-#include <linux/debugfs.h>
-#include "aufs.h"
-
-#ifndef CONFIG_SYSFS
-#error DEBUG_FS depends upon SYSFS
-#endif
-
-static struct dentry *dbgaufs;
-static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH;
-
-/* 20 is max digits length of ulong 64 */
-struct dbgaufs_arg {
-	int n;
-	char a[20 * 4];
-};
-
-/*
- * common function for all XINO files
- */
-static int dbgaufs_xi_release(struct inode *inode __maybe_unused,
-			      struct file *file)
-{
-	kfree(file->private_data);
-	return 0;
-}
-
-static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt)
-{
-	int err;
-	struct kstat st;
-	struct dbgaufs_arg *p;
-
-	err = -ENOMEM;
-	p = kmalloc(sizeof(*p), GFP_NOFS);
-	if (unlikely(!p))
-		goto out;
-
-	err = 0;
-	p->n = 0;
-	file->private_data = p;
-	if (!xf)
-		goto out;
-
-	err = vfs_getattr(&xf->f_path, &st);
-	if (!err) {
-		if (do_fcnt)
-			p->n = snprintf
-				(p->a, sizeof(p->a), "%ld, %llux%lu %lld\n",
-				 (long)file_count(xf), st.blocks, st.blksize,
-				 (long long)st.size);
-		else
-			p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n",
-					st.blocks, st.blksize,
-					(long long)st.size);
-		AuDebugOn(p->n >= sizeof(p->a));
-	} else {
-		p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err);
-		err = 0;
-	}
-
-out:
-	return err;
-
-}
-
-static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf,
-			       size_t count, loff_t *ppos)
-{
-	struct dbgaufs_arg *p;
-
-	p = file->private_data;
-	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct dbgaufs_plink_arg {
-	int n;
-	char a[];
-};
-
-static int dbgaufs_plink_release(struct inode *inode __maybe_unused,
-				 struct file *file)
-{
-	free_page((unsigned long)file->private_data);
-	return 0;
-}
-
-static int dbgaufs_plink_open(struct inode *inode, struct file *file)
-{
-	int err, i, limit;
-	unsigned long n, sum;
-	struct dbgaufs_plink_arg *p;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-	struct au_sphlhead *sphl;
-
-	err = -ENOMEM;
-	p = (void *)get_zeroed_page(GFP_NOFS);
-	if (unlikely(!p))
-		goto out;
-
-	err = -EFBIG;
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	if (au_opt_test(au_mntflags(sb), PLINK)) {
-		limit = PAGE_SIZE - sizeof(p->n);
-
-		/* the number of buckets */
-		n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH);
-		p->n += n;
-		limit -= n;
-
-		sum = 0;
-		for (i = 0, sphl = sbinfo->si_plink;
-		     i < AuPlink_NHASH;
-		     i++, sphl++) {
-			n = au_sphl_count(sphl);
-			sum += n;
-
-			n = snprintf(p->a + p->n, limit, "%lu ", n);
-			p->n += n;
-			limit -= n;
-			if (unlikely(limit <= 0))
-				goto out_free;
-		}
-		p->a[p->n - 1] = '\n';
-
-		/* the sum of plinks */
-		n = snprintf(p->a + p->n, limit, "%lu\n", sum);
-		p->n += n;
-		limit -= n;
-		if (unlikely(limit <= 0))
-			goto out_free;
-	} else {
-#define str "1\n0\n0\n"
-		p->n = sizeof(str) - 1;
-		strcpy(p->a, str);
-#undef str
-	}
-	si_read_unlock(sb);
-
-	err = 0;
-	file->private_data = p;
-	goto out; /* success */
-
-out_free:
-	free_page((unsigned long)p);
-out:
-	return err;
-}
-
-static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf,
-				  size_t count, loff_t *ppos)
-{
-	struct dbgaufs_plink_arg *p;
-
-	p = file->private_data;
-	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
-}
-
-static const struct file_operations dbgaufs_plink_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_plink_open,
-	.release	= dbgaufs_plink_release,
-	.read		= dbgaufs_plink_read
-};
-
-/* ---------------------------------------------------------------------- */
-
-static int dbgaufs_xib_open(struct inode *inode, struct file *file)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0);
-	si_read_unlock(sb);
-	return err;
-}
-
-static const struct file_operations dbgaufs_xib_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_xib_open,
-	.release	= dbgaufs_xi_release,
-	.read		= dbgaufs_xi_read
-};
-
-/* ---------------------------------------------------------------------- */
-
-#define DbgaufsXi_PREFIX "xi"
-
-static int dbgaufs_xino_open(struct inode *inode, struct file *file)
-{
-	int err;
-	long l;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-	struct file *xf;
-	struct qstr *name;
-
-	err = -ENOENT;
-	xf = NULL;
-	name = &file->f_path.dentry->d_name;
-	if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX)
-		     || memcmp(name->name, DbgaufsXi_PREFIX,
-			       sizeof(DbgaufsXi_PREFIX) - 1)))
-		goto out;
-	err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l);
-	if (unlikely(err))
-		goto out;
-
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	if (l <= au_sbend(sb)) {
-		xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file;
-		err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1);
-	} else
-		err = -ENOENT;
-	si_read_unlock(sb);
-
-out:
-	return err;
-}
-
-static const struct file_operations dbgaufs_xino_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_xino_open,
-	.release	= dbgaufs_xi_release,
-	.read		= dbgaufs_xi_read
-};
-
-void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
-{
-	aufs_bindex_t bend;
-	struct au_branch *br;
-	struct au_xino_file *xi;
-
-	if (!au_sbi(sb)->si_dbgaufs)
-		return;
-
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		xi = &br->br_xino;
-		debugfs_remove(xi->xi_dbgaufs);
-		xi->xi_dbgaufs = NULL;
-	}
-}
-
-void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
-{
-	struct au_sbinfo *sbinfo;
-	struct dentry *parent;
-	struct au_branch *br;
-	struct au_xino_file *xi;
-	aufs_bindex_t bend;
-	char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */
-
-	sbinfo = au_sbi(sb);
-	parent = sbinfo->si_dbgaufs;
-	if (!parent)
-		return;
-
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex);
-		br = au_sbr(sb, bindex);
-		xi = &br->br_xino;
-		AuDebugOn(xi->xi_dbgaufs);
-		xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent,
-						     sbinfo, &dbgaufs_xino_fop);
-		/* ignore an error */
-		if (unlikely(!xi->xi_dbgaufs))
-			AuWarn1("failed %s under debugfs\n", name);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_EXPORT
-static int dbgaufs_xigen_open(struct inode *inode, struct file *file)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0);
-	si_read_unlock(sb);
-	return err;
-}
-
-static const struct file_operations dbgaufs_xigen_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_xigen_open,
-	.release	= dbgaufs_xi_release,
-	.read		= dbgaufs_xi_read
-};
-
-static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
-{
-	int err;
-
-	/*
-	 * This function is a dynamic '__init' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-
-	err = -EIO;
-	sbinfo->si_dbgaufs_xigen = debugfs_create_file
-		("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
-		 &dbgaufs_xigen_fop);
-	if (sbinfo->si_dbgaufs_xigen)
-		err = 0;
-
-	return err;
-}
-#else
-static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
-{
-	return 0;
-}
-#endif /* CONFIG_AUFS_EXPORT */
-
-/* ---------------------------------------------------------------------- */
-
-void dbgaufs_si_fin(struct au_sbinfo *sbinfo)
-{
-	/*
-	 * This function is a dynamic '__fin' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-
-	debugfs_remove_recursive(sbinfo->si_dbgaufs);
-	sbinfo->si_dbgaufs = NULL;
-	kobject_put(&sbinfo->si_kobj);
-}
-
-int dbgaufs_si_init(struct au_sbinfo *sbinfo)
-{
-	int err;
-	char name[SysaufsSiNameLen];
-
-	/*
-	 * This function is a dynamic '__init' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-
-	err = -ENOENT;
-	if (!dbgaufs) {
-		AuErr1("/debug/aufs is uninitialized\n");
-		goto out;
-	}
-
-	err = -EIO;
-	sysaufs_name(sbinfo, name);
-	sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs);
-	if (unlikely(!sbinfo->si_dbgaufs))
-		goto out;
-	kobject_get(&sbinfo->si_kobj);
-
-	sbinfo->si_dbgaufs_xib = debugfs_create_file
-		("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
-		 &dbgaufs_xib_fop);
-	if (unlikely(!sbinfo->si_dbgaufs_xib))
-		goto out_dir;
-
-	sbinfo->si_dbgaufs_plink = debugfs_create_file
-		("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
-		 &dbgaufs_plink_fop);
-	if (unlikely(!sbinfo->si_dbgaufs_plink))
-		goto out_dir;
-
-	err = dbgaufs_xigen_init(sbinfo);
-	if (!err)
-		goto out; /* success */
-
-out_dir:
-	dbgaufs_si_fin(sbinfo);
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void dbgaufs_fin(void)
-{
-	debugfs_remove(dbgaufs);
-}
-
-int __init dbgaufs_init(void)
-{
-	int err;
-
-	err = -EIO;
-	dbgaufs = debugfs_create_dir(AUFS_NAME, NULL);
-	if (dbgaufs)
-		err = 0;
-	return err;
-}
diff --git a/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h
deleted file mode 100644
index 03d47b80b..000000000
--- a/fs/aufs/dbgaufs.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * debugfs interface
- */
-
-#ifndef __DBGAUFS_H__
-#define __DBGAUFS_H__
-
-#ifdef __KERNEL__
-
-struct super_block;
-struct au_sbinfo;
-
-#ifdef CONFIG_DEBUG_FS
-/* dbgaufs.c */
-void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
-void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
-void dbgaufs_si_fin(struct au_sbinfo *sbinfo);
-int dbgaufs_si_init(struct au_sbinfo *sbinfo);
-void dbgaufs_fin(void);
-int __init dbgaufs_init(void);
-#else
-AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo)
-AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo)
-AuStubVoid(dbgaufs_fin, void)
-AuStubInt0(__init dbgaufs_init, void)
-#endif /* CONFIG_DEBUG_FS */
-
-#endif /* __KERNEL__ */
-#endif /* __DBGAUFS_H__ */
diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c
deleted file mode 100644
index 910a2e870..000000000
--- a/fs/aufs/dcsub.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sub-routines for dentry cache
- */
-
-#include "aufs.h"
-
-static void au_dpage_free(struct au_dpage *dpage)
-{
-	int i;
-	struct dentry **p;
-
-	p = dpage->dentries;
-	for (i = 0; i < dpage->ndentry; i++)
-		dput(*p++);
-	free_page((unsigned long)dpage->dentries);
-}
-
-int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp)
-{
-	int err;
-	void *p;
-
-	err = -ENOMEM;
-	dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp);
-	if (unlikely(!dpages->dpages))
-		goto out;
-
-	p = (void *)__get_free_page(gfp);
-	if (unlikely(!p))
-		goto out_dpages;
-
-	dpages->dpages[0].ndentry = 0;
-	dpages->dpages[0].dentries = p;
-	dpages->ndpage = 1;
-	return 0; /* success */
-
-out_dpages:
-	kfree(dpages->dpages);
-out:
-	return err;
-}
-
-void au_dpages_free(struct au_dcsub_pages *dpages)
-{
-	int i;
-	struct au_dpage *p;
-
-	p = dpages->dpages;
-	for (i = 0; i < dpages->ndpage; i++)
-		au_dpage_free(p++);
-	kfree(dpages->dpages);
-}
-
-static int au_dpages_append(struct au_dcsub_pages *dpages,
-			    struct dentry *dentry, gfp_t gfp)
-{
-	int err, sz;
-	struct au_dpage *dpage;
-	void *p;
-
-	dpage = dpages->dpages + dpages->ndpage - 1;
-	sz = PAGE_SIZE / sizeof(dentry);
-	if (unlikely(dpage->ndentry >= sz)) {
-		AuLabel(new dpage);
-		err = -ENOMEM;
-		sz = dpages->ndpage * sizeof(*dpages->dpages);
-		p = au_kzrealloc(dpages->dpages, sz,
-				 sz + sizeof(*dpages->dpages), gfp);
-		if (unlikely(!p))
-			goto out;
-
-		dpages->dpages = p;
-		dpage = dpages->dpages + dpages->ndpage;
-		p = (void *)__get_free_page(gfp);
-		if (unlikely(!p))
-			goto out;
-
-		dpage->ndentry = 0;
-		dpage->dentries = p;
-		dpages->ndpage++;
-	}
-
-	AuDebugOn(au_dcount(dentry) <= 0);
-	dpage->dentries[dpage->ndentry++] = dget_dlock(dentry);
-	return 0; /* success */
-
-out:
-	return err;
-}
-
-/* todo: BAD approach */
-/* copied from linux/fs/dcache.c */
-enum d_walk_ret {
-	D_WALK_CONTINUE,
-	D_WALK_QUIT,
-	D_WALK_NORETRY,
-	D_WALK_SKIP,
-};
-
-extern void d_walk(struct dentry *parent, void *data,
-		   enum d_walk_ret (*enter)(void *, struct dentry *),
-		   void (*finish)(void *));
-
-struct ac_dpages_arg {
-	int err;
-	struct au_dcsub_pages *dpages;
-	struct super_block *sb;
-	au_dpages_test test;
-	void *arg;
-};
-
-static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry)
-{
-	enum d_walk_ret ret;
-	struct ac_dpages_arg *arg = _arg;
-
-	ret = D_WALK_CONTINUE;
-	if (dentry->d_sb == arg->sb
-	    && !IS_ROOT(dentry)
-	    && au_dcount(dentry) > 0
-	    && au_di(dentry)
-	    && (!arg->test || arg->test(dentry, arg->arg))) {
-		arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC);
-		if (unlikely(arg->err))
-			ret = D_WALK_QUIT;
-	}
-
-	return ret;
-}
-
-int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
-		   au_dpages_test test, void *arg)
-{
-	struct ac_dpages_arg args = {
-		.err	= 0,
-		.dpages	= dpages,
-		.sb	= root->d_sb,
-		.test	= test,
-		.arg	= arg
-	};
-
-	d_walk(root, &args, au_call_dpages_append, NULL);
-
-	return args.err;
-}
-
-int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
-		       int do_include, au_dpages_test test, void *arg)
-{
-	int err;
-
-	err = 0;
-	write_seqlock(&rename_lock);
-	spin_lock(&dentry->d_lock);
-	if (do_include
-	    && au_dcount(dentry) > 0
-	    && (!test || test(dentry, arg)))
-		err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
-	spin_unlock(&dentry->d_lock);
-	if (unlikely(err))
-		goto out;
-
-	/*
-	 * RCU for vfsmount is unnecessary since this is a traverse in a single
-	 * mount
-	 */
-	while (!IS_ROOT(dentry)) {
-		dentry = dentry->d_parent; /* rename_lock is locked */
-		spin_lock(&dentry->d_lock);
-		if (au_dcount(dentry) > 0
-		    && (!test || test(dentry, arg)))
-			err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
-		spin_unlock(&dentry->d_lock);
-		if (unlikely(err))
-			break;
-	}
-
-out:
-	write_sequnlock(&rename_lock);
-	return err;
-}
-
-static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg)
-{
-	return au_di(dentry) && dentry->d_sb == arg;
-}
-
-int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
-			    struct dentry *dentry, int do_include)
-{
-	return au_dcsub_pages_rev(dpages, dentry, do_include,
-				  au_dcsub_dpages_aufs, dentry->d_sb);
-}
-
-int au_test_subdir(struct dentry *d1, struct dentry *d2)
-{
-	struct path path[2] = {
-		{
-			.dentry = d1
-		},
-		{
-			.dentry = d2
-		}
-	};
-
-	return path_is_under(path + 0, path + 1);
-}
diff --git a/fs/aufs/dcsub.h b/fs/aufs/dcsub.h
deleted file mode 100644
index d372325b2..000000000
--- a/fs/aufs/dcsub.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sub-routines for dentry cache
- */
-
-#ifndef __AUFS_DCSUB_H__
-#define __AUFS_DCSUB_H__
-
-#ifdef __KERNEL__
-
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-struct au_dpage {
-	int ndentry;
-	struct dentry **dentries;
-};
-
-struct au_dcsub_pages {
-	int ndpage;
-	struct au_dpage *dpages;
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* dcsub.c */
-int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp);
-void au_dpages_free(struct au_dcsub_pages *dpages);
-typedef int (*au_dpages_test)(struct dentry *dentry, void *arg);
-int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
-		   au_dpages_test test, void *arg);
-int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
-		       int do_include, au_dpages_test test, void *arg);
-int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
-			    struct dentry *dentry, int do_include);
-int au_test_subdir(struct dentry *d1, struct dentry *d2);
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * todo: in linux-3.13, several similar (but faster) helpers are added to
- * include/linux/dcache.h. Try them (in the future).
- */
-
-static inline int au_d_hashed_positive(struct dentry *d)
-{
-	int err;
-	struct inode *inode = d_inode(d);
-
-	err = 0;
-	if (unlikely(d_unhashed(d)
-		     || d_is_negative(d)
-		     || !inode->i_nlink))
-		err = -ENOENT;
-	return err;
-}
-
-static inline int au_d_linkable(struct dentry *d)
-{
-	int err;
-	struct inode *inode = d_inode(d);
-
-	err = au_d_hashed_positive(d);
-	if (err
-	    && d_is_positive(d)
-	    && (inode->i_state & I_LINKABLE))
-		err = 0;
-	return err;
-}
-
-static inline int au_d_alive(struct dentry *d)
-{
-	int err;
-	struct inode *inode;
-
-	err = 0;
-	if (!IS_ROOT(d))
-		err = au_d_hashed_positive(d);
-	else {
-		inode = d_inode(d);
-		if (unlikely(d_unlinked(d)
-			     || d_is_negative(d)
-			     || !inode->i_nlink))
-			err = -ENOENT;
-	}
-	return err;
-}
-
-static inline int au_alive_dir(struct dentry *d)
-{
-	int err;
-
-	err = au_d_alive(d);
-	if (unlikely(err || IS_DEADDIR(d_inode(d))))
-		err = -ENOENT;
-	return err;
-}
-
-static inline int au_qstreq(struct qstr *a, struct qstr *b)
-{
-	return a->len == b->len
-		&& !memcmp(a->name, b->name, a->len);
-}
-
-/*
- * by the commit
- * 360f547 2015-01-25 dcache: let the dentry count go down to zero without
- *			taking d_lock
- * the type of d_lockref.count became int, but the inlined function d_count()
- * still returns unsigned int.
- * I don't know why. Maybe it is for every d_count() users?
- * Anyway au_dcount() lives on.
- */
-static inline int au_dcount(struct dentry *d)
-{
-	return (int)d_count(d);
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DCSUB_H__ */
diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c
deleted file mode 100644
index a1154d6de..000000000
--- a/fs/aufs/debug.c
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * debug print functions
- */
-
-#include "aufs.h"
-
-/* Returns 0, or -errno.  arg is in kp->arg. */
-static int param_atomic_t_set(const char *val, const struct kernel_param *kp)
-{
-	int err, n;
-
-	err = kstrtoint(val, 0, &n);
-	if (!err) {
-		if (n > 0)
-			au_debug_on();
-		else
-			au_debug_off();
-	}
-	return err;
-}
-
-/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
-static int param_atomic_t_get(char *buffer, const struct kernel_param *kp)
-{
-	atomic_t *a;
-
-	a = kp->arg;
-	return sprintf(buffer, "%d", atomic_read(a));
-}
-
-static struct kernel_param_ops param_ops_atomic_t = {
-	.set = param_atomic_t_set,
-	.get = param_atomic_t_get
-	/* void (*free)(void *arg) */
-};
-
-atomic_t aufs_debug = ATOMIC_INIT(0);
-MODULE_PARM_DESC(debug, "debug print");
-module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP);
-
-DEFINE_MUTEX(au_dbg_mtx);	/* just to serialize the dbg msgs */
-char *au_plevel = KERN_DEBUG;
-#define dpri(fmt, ...) do {					\
-	if ((au_plevel						\
-	     && strcmp(au_plevel, KERN_DEBUG))			\
-	    || au_debug_test())					\
-		printk("%s" fmt, au_plevel, ##__VA_ARGS__);	\
-} while (0)
-
-/* ---------------------------------------------------------------------- */
-
-void au_dpri_whlist(struct au_nhash *whlist)
-{
-	unsigned long ul, n;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-
-	n = whlist->nh_num;
-	head = whlist->nh_head;
-	for (ul = 0; ul < n; ul++) {
-		hlist_for_each_entry(pos, head, wh_hash)
-			dpri("b%d, %.*s, %d\n",
-			     pos->wh_bindex,
-			     pos->wh_str.len, pos->wh_str.name,
-			     pos->wh_str.len);
-		head++;
-	}
-}
-
-void au_dpri_vdir(struct au_vdir *vdir)
-{
-	unsigned long ul;
-	union au_vdir_deblk_p p;
-	unsigned char *o;
-
-	if (!vdir || IS_ERR(vdir)) {
-		dpri("err %ld\n", PTR_ERR(vdir));
-		return;
-	}
-
-	dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n",
-	     vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk,
-	     vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version);
-	for (ul = 0; ul < vdir->vd_nblk; ul++) {
-		p.deblk = vdir->vd_deblk[ul];
-		o = p.deblk;
-		dpri("[%lu]: %p\n", ul, o);
-	}
-}
-
-static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn,
-			struct dentry *wh)
-{
-	char *n = NULL;
-	int l = 0;
-
-	if (!inode || IS_ERR(inode)) {
-		dpri("i%d: err %ld\n", bindex, PTR_ERR(inode));
-		return -1;
-	}
-
-	/* the type of i_blocks depends upon CONFIG_LBDAF */
-	BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long)
-		     && sizeof(inode->i_blocks) != sizeof(u64));
-	if (wh) {
-		n = (void *)wh->d_name.name;
-		l = wh->d_name.len;
-	}
-
-	dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu,"
-	     " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n",
-	     bindex, inode,
-	     inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??",
-	     atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode,
-	     i_size_read(inode), (unsigned long long)inode->i_blocks,
-	     hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff,
-	     inode->i_mapping ? inode->i_mapping->nrpages : 0,
-	     inode->i_state, inode->i_flags, inode->i_version,
-	     inode->i_generation,
-	     l ? ", wh " : "", l, n);
-	return 0;
-}
-
-void au_dpri_inode(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	aufs_bindex_t bindex;
-	int err, hn;
-
-	err = do_pri_inode(-1, inode, -1, NULL);
-	if (err || !au_test_aufs(inode->i_sb))
-		return;
-
-	iinfo = au_ii(inode);
-	if (!iinfo)
-		return;
-	dpri("i-1: bstart %d, bend %d, gen %d\n",
-	     iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode, NULL));
-	if (iinfo->ii_bstart < 0)
-		return;
-	hn = 0;
-	for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) {
-		hn = !!au_hn(iinfo->ii_hinode + bindex);
-		do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn,
-			     iinfo->ii_hinode[0 + bindex].hi_whdentry);
-	}
-}
-
-void au_dpri_dalias(struct inode *inode)
-{
-	struct dentry *d;
-
-	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias)
-		au_dpri_dentry(d);
-	spin_unlock(&inode->i_lock);
-}
-
-static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry)
-{
-	struct dentry *wh = NULL;
-	int hn;
-	struct au_iinfo *iinfo;
-
-	if (!dentry || IS_ERR(dentry)) {
-		dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry));
-		return -1;
-	}
-	/* do not call dget_parent() here */
-	/* note: access d_xxx without d_lock */
-	dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n",
-	     bindex, dentry, dentry,
-	     dentry->d_sb ? au_sbtype(dentry->d_sb) : "??",
-	     au_dcount(dentry), dentry->d_flags,
-	     d_unhashed(dentry) ? "un" : "");
-	hn = -1;
-	if (bindex >= 0
-	    && d_is_positive(dentry)
-	    && au_test_aufs(dentry->d_sb)) {
-		iinfo = au_ii(d_inode(dentry));
-		if (iinfo) {
-			hn = !!au_hn(iinfo->ii_hinode + bindex);
-			wh = iinfo->ii_hinode[0 + bindex].hi_whdentry;
-		}
-	}
-	do_pri_inode(bindex, d_inode(dentry), hn, wh);
-	return 0;
-}
-
-void au_dpri_dentry(struct dentry *dentry)
-{
-	struct au_dinfo *dinfo;
-	aufs_bindex_t bindex;
-	int err;
-	struct au_hdentry *hdp;
-
-	err = do_pri_dentry(-1, dentry);
-	if (err || !au_test_aufs(dentry->d_sb))
-		return;
-
-	dinfo = au_di(dentry);
-	if (!dinfo)
-		return;
-	dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d, tmp %d\n",
-	     dinfo->di_bstart, dinfo->di_bend,
-	     dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry),
-	     dinfo->di_tmpfile);
-	if (dinfo->di_bstart < 0)
-		return;
-	hdp = dinfo->di_hdentry;
-	for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++)
-		do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry);
-}
-
-static int do_pri_file(aufs_bindex_t bindex, struct file *file)
-{
-	char a[32];
-
-	if (!file || IS_ERR(file)) {
-		dpri("f%d: err %ld\n", bindex, PTR_ERR(file));
-		return -1;
-	}
-	a[0] = 0;
-	if (bindex < 0
-	    && !IS_ERR_OR_NULL(file->f_path.dentry)
-	    && au_test_aufs(file->f_path.dentry->d_sb)
-	    && au_fi(file))
-		snprintf(a, sizeof(a), ", gen %d, mmapped %d",
-			 au_figen(file), atomic_read(&au_fi(file)->fi_mmapped));
-	dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n",
-	     bindex, file->f_mode, file->f_flags, (long)file_count(file),
-	     file->f_version, file->f_pos, a);
-	if (!IS_ERR_OR_NULL(file->f_path.dentry))
-		do_pri_dentry(bindex, file->f_path.dentry);
-	return 0;
-}
-
-void au_dpri_file(struct file *file)
-{
-	struct au_finfo *finfo;
-	struct au_fidir *fidir;
-	struct au_hfile *hfile;
-	aufs_bindex_t bindex;
-	int err;
-
-	err = do_pri_file(-1, file);
-	if (err
-	    || IS_ERR_OR_NULL(file->f_path.dentry)
-	    || !au_test_aufs(file->f_path.dentry->d_sb))
-		return;
-
-	finfo = au_fi(file);
-	if (!finfo)
-		return;
-	if (finfo->fi_btop < 0)
-		return;
-	fidir = finfo->fi_hdir;
-	if (!fidir)
-		do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file);
-	else
-		for (bindex = finfo->fi_btop;
-		     bindex >= 0 && bindex <= fidir->fd_bbot;
-		     bindex++) {
-			hfile = fidir->fd_hfile + bindex;
-			do_pri_file(bindex, hfile ? hfile->hf_file : NULL);
-		}
-}
-
-static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br)
-{
-	struct vfsmount *mnt;
-	struct super_block *sb;
-
-	if (!br || IS_ERR(br))
-		goto out;
-	mnt = au_br_mnt(br);
-	if (!mnt || IS_ERR(mnt))
-		goto out;
-	sb = mnt->mnt_sb;
-	if (!sb || IS_ERR(sb))
-		goto out;
-
-	dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, "
-	     "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, "
-	     "xino %d\n",
-	     bindex, br->br_perm, br->br_id, atomic_read(&br->br_count),
-	     br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev),
-	     sb->s_flags, sb->s_count,
-	     atomic_read(&sb->s_active), !!br->br_xino.xi_file);
-	return 0;
-
-out:
-	dpri("s%d: err %ld\n", bindex, PTR_ERR(br));
-	return -1;
-}
-
-void au_dpri_sb(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-	aufs_bindex_t bindex;
-	int err;
-	/* to reuduce stack size */
-	struct {
-		struct vfsmount mnt;
-		struct au_branch fake;
-	} *a;
-
-	/* this function can be called from magic sysrq */
-	a = kzalloc(sizeof(*a), GFP_ATOMIC);
-	if (unlikely(!a)) {
-		dpri("no memory\n");
-		return;
-	}
-
-	a->mnt.mnt_sb = sb;
-	a->fake.br_perm = 0;
-	a->fake.br_path.mnt = &a->mnt;
-	a->fake.br_xino.xi_file = NULL;
-	atomic_set(&a->fake.br_count, 0);
-	smp_mb(); /* atomic_set */
-	err = do_pri_br(-1, &a->fake);
-	kfree(a);
-	dpri("dev 0x%x\n", sb->s_dev);
-	if (err || !au_test_aufs(sb))
-		return;
-
-	sbinfo = au_sbi(sb);
-	if (!sbinfo)
-		return;
-	dpri("nw %d, gen %u, kobj %d\n",
-	     atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation,
-	     atomic_read(&sbinfo->si_kobj.kref.refcount));
-	for (bindex = 0; bindex <= sbinfo->si_bend; bindex++)
-		do_pri_br(bindex, sbinfo->si_branch[0 + bindex]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line)
-{
-	struct inode *h_inode, *inode = d_inode(dentry);
-	struct dentry *h_dentry;
-	aufs_bindex_t bindex, bend, bi;
-
-	if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */)
-		return;
-
-	bend = au_dbend(dentry);
-	bi = au_ibend(inode);
-	if (bi < bend)
-		bend = bi;
-	bindex = au_dbstart(dentry);
-	bi = au_ibstart(inode);
-	if (bi > bindex)
-		bindex = bi;
-
-	for (; bindex <= bend; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		h_inode = au_h_iptr(inode, bindex);
-		if (unlikely(h_inode != d_inode(h_dentry))) {
-			au_debug_on();
-			AuDbg("b%d, %s:%d\n", bindex, func, line);
-			AuDbgDentry(dentry);
-			AuDbgInode(inode);
-			au_debug_off();
-			BUG();
-		}
-	}
-}
-
-void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen)
-{
-	int err, i, j;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	AuDebugOn(err);
-	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1);
-	AuDebugOn(err);
-	for (i = dpages.ndpage - 1; !err && i >= 0; i--) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		for (j = dpage->ndentry - 1; !err && j >= 0; j--)
-			AuDebugOn(au_digen_test(dentries[j], sigen));
-	}
-	au_dpages_free(&dpages);
-}
-
-void au_dbg_verify_kthread(void)
-{
-	if (au_wkq_test()) {
-		au_dbg_blocked();
-		/*
-		 * It may be recursive, but udba=notify between two aufs mounts,
-		 * where a single ro branch is shared, is not a problem.
-		 */
-		/* WARN_ON(1); */
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-int __init au_debug_init(void)
-{
-	aufs_bindex_t bindex;
-	struct au_vdir_destr destr;
-
-	bindex = -1;
-	AuDebugOn(bindex >= 0);
-
-	destr.len = -1;
-	AuDebugOn(destr.len < NAME_MAX);
-
-#ifdef CONFIG_4KSTACKS
-	pr_warn("CONFIG_4KSTACKS is defined.\n");
-#endif
-
-	return 0;
-}
diff --git a/fs/aufs/debug.h b/fs/aufs/debug.h
deleted file mode 100644
index de6c2a682..000000000
--- a/fs/aufs/debug.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * debug print functions
- */
-
-#ifndef __AUFS_DEBUG_H__
-#define __AUFS_DEBUG_H__
-
-#ifdef __KERNEL__
-
-#include <linux/atomic.h>
-#include <linux/module.h>
-#include <linux/kallsyms.h>
-#include <linux/sysrq.h>
-
-#ifdef CONFIG_AUFS_DEBUG
-#define AuDebugOn(a)		BUG_ON(a)
-
-/* module parameter */
-extern atomic_t aufs_debug;
-static inline void au_debug_on(void)
-{
-	atomic_inc(&aufs_debug);
-}
-static inline void au_debug_off(void)
-{
-	atomic_dec_if_positive(&aufs_debug);
-}
-
-static inline int au_debug_test(void)
-{
-	return atomic_read(&aufs_debug) > 0;
-}
-#else
-#define AuDebugOn(a)		do {} while (0)
-AuStubVoid(au_debug_on, void)
-AuStubVoid(au_debug_off, void)
-AuStubInt0(au_debug_test, void)
-#endif /* CONFIG_AUFS_DEBUG */
-
-#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t)
-
-/* ---------------------------------------------------------------------- */
-
-/* debug print */
-
-#define AuDbg(fmt, ...) do { \
-	if (au_debug_test()) \
-		pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \
-} while (0)
-#define AuLabel(l)		AuDbg(#l "\n")
-#define AuIOErr(fmt, ...)	pr_err("I/O Error, " fmt, ##__VA_ARGS__)
-#define AuWarn1(fmt, ...) do { \
-	static unsigned char _c; \
-	if (!_c++) \
-		pr_warn(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define AuErr1(fmt, ...) do { \
-	static unsigned char _c; \
-	if (!_c++) \
-		pr_err(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define AuIOErr1(fmt, ...) do { \
-	static unsigned char _c; \
-	if (!_c++) \
-		AuIOErr(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define AuUnsupportMsg	"This operation is not supported." \
-			" Please report this application to aufs-users ML."
-#define AuUnsupport(fmt, ...) do { \
-	pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \
-	dump_stack(); \
-} while (0)
-
-#define AuTraceErr(e) do { \
-	if (unlikely((e) < 0)) \
-		AuDbg("err %d\n", (int)(e)); \
-} while (0)
-
-#define AuTraceErrPtr(p) do { \
-	if (IS_ERR(p)) \
-		AuDbg("err %ld\n", PTR_ERR(p)); \
-} while (0)
-
-/* dirty macros for debug print, use with "%.*s" and caution */
-#define AuLNPair(qstr)		(qstr)->len, (qstr)->name
-
-/* ---------------------------------------------------------------------- */
-
-struct dentry;
-#ifdef CONFIG_AUFS_DEBUG
-extern struct mutex au_dbg_mtx;
-extern char *au_plevel;
-struct au_nhash;
-void au_dpri_whlist(struct au_nhash *whlist);
-struct au_vdir;
-void au_dpri_vdir(struct au_vdir *vdir);
-struct inode;
-void au_dpri_inode(struct inode *inode);
-void au_dpri_dalias(struct inode *inode);
-void au_dpri_dentry(struct dentry *dentry);
-struct file;
-void au_dpri_file(struct file *filp);
-struct super_block;
-void au_dpri_sb(struct super_block *sb);
-
-#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__)
-void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line);
-void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen);
-void au_dbg_verify_kthread(void);
-
-int __init au_debug_init(void);
-
-#define AuDbgWhlist(w) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#w "\n"); \
-	au_dpri_whlist(w); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgVdir(v) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#v "\n"); \
-	au_dpri_vdir(v); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgInode(i) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#i "\n"); \
-	au_dpri_inode(i); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgDAlias(i) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#i "\n"); \
-	au_dpri_dalias(i); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgDentry(d) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#d "\n"); \
-	au_dpri_dentry(d); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgFile(f) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#f "\n"); \
-	au_dpri_file(f); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgSb(sb) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#sb "\n"); \
-	au_dpri_sb(sb); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgSym(addr) do {				\
-	char sym[KSYM_SYMBOL_LEN];			\
-	sprint_symbol(sym, (unsigned long)addr);	\
-	AuDbg("%s\n", sym);				\
-} while (0)
-#else
-AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry)
-AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen)
-AuStubVoid(au_dbg_verify_kthread, void)
-AuStubInt0(__init au_debug_init, void)
-
-#define AuDbgWhlist(w)		do {} while (0)
-#define AuDbgVdir(v)		do {} while (0)
-#define AuDbgInode(i)		do {} while (0)
-#define AuDbgDAlias(i)		do {} while (0)
-#define AuDbgDentry(d)		do {} while (0)
-#define AuDbgFile(f)		do {} while (0)
-#define AuDbgSb(sb)		do {} while (0)
-#define AuDbgSym(addr)		do {} while (0)
-#endif /* CONFIG_AUFS_DEBUG */
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_MAGIC_SYSRQ
-int __init au_sysrq_init(void);
-void au_sysrq_fin(void);
-
-#ifdef CONFIG_HW_CONSOLE
-#define au_dbg_blocked() do { \
-	WARN_ON(1); \
-	handle_sysrq('w'); \
-} while (0)
-#else
-AuStubVoid(au_dbg_blocked, void)
-#endif
-
-#else
-AuStubInt0(__init au_sysrq_init, void)
-AuStubVoid(au_sysrq_fin, void)
-AuStubVoid(au_dbg_blocked, void)
-#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DEBUG_H__ */
diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c
deleted file mode 100644
index 1191a6c93..000000000
--- a/fs/aufs/dentry.c
+++ /dev/null
@@ -1,1105 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * lookup and dentry operations
- */
-
-#include <linux/namei.h>
-#include "aufs.h"
-
-#define AuLkup_ALLOW_NEG	1
-#define AuLkup_IGNORE_PERM	(1 << 1)
-#define au_ftest_lkup(flags, name)	((flags) & AuLkup_##name)
-#define au_fset_lkup(flags, name) \
-	do { (flags) |= AuLkup_##name; } while (0)
-#define au_fclr_lkup(flags, name) \
-	do { (flags) &= ~AuLkup_##name; } while (0)
-
-struct au_do_lookup_args {
-	unsigned int		flags;
-	mode_t			type;
-};
-
-/*
- * returns positive/negative dentry, NULL or an error.
- * NULL means whiteout-ed or not-found.
- */
-static struct dentry*
-au_do_lookup(struct dentry *h_parent, struct dentry *dentry,
-	     aufs_bindex_t bindex, struct qstr *wh_name,
-	     struct au_do_lookup_args *args)
-{
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-	struct au_branch *br;
-	int wh_found, opq;
-	unsigned char wh_able;
-	const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG);
-	const unsigned char ignore_perm = !!au_ftest_lkup(args->flags,
-							  IGNORE_PERM);
-
-	wh_found = 0;
-	br = au_sbr(dentry->d_sb, bindex);
-	wh_able = !!au_br_whable(br->br_perm);
-	if (wh_able)
-		wh_found = au_wh_test(h_parent, wh_name, /*try_sio*/0);
-	h_dentry = ERR_PTR(wh_found);
-	if (!wh_found)
-		goto real_lookup;
-	if (unlikely(wh_found < 0))
-		goto out;
-
-	/* We found a whiteout */
-	/* au_set_dbend(dentry, bindex); */
-	au_set_dbwh(dentry, bindex);
-	if (!allow_neg)
-		return NULL; /* success */
-
-real_lookup:
-	if (!ignore_perm)
-		h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
-	else
-		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
-	if (IS_ERR(h_dentry)) {
-		if (PTR_ERR(h_dentry) == -ENAMETOOLONG
-		    && !allow_neg)
-			h_dentry = NULL;
-		goto out;
-	}
-
-	h_inode = d_inode(h_dentry);
-	if (d_is_negative(h_dentry)) {
-		if (!allow_neg)
-			goto out_neg;
-	} else if (wh_found
-		   || (args->type && args->type != (h_inode->i_mode & S_IFMT)))
-		goto out_neg;
-
-	if (au_dbend(dentry) <= bindex)
-		au_set_dbend(dentry, bindex);
-	if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
-		au_set_dbstart(dentry, bindex);
-	au_set_h_dptr(dentry, bindex, h_dentry);
-
-	if (!d_is_dir(h_dentry)
-	    || !wh_able
-	    || (d_really_is_positive(dentry) && !d_is_dir(dentry)))
-		goto out; /* success */
-
-	mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-	opq = au_diropq_test(h_dentry);
-	mutex_unlock(&h_inode->i_mutex);
-	if (opq > 0)
-		au_set_dbdiropq(dentry, bindex);
-	else if (unlikely(opq < 0)) {
-		au_set_h_dptr(dentry, bindex, NULL);
-		h_dentry = ERR_PTR(opq);
-	}
-	goto out;
-
-out_neg:
-	dput(h_dentry);
-	h_dentry = NULL;
-out:
-	return h_dentry;
-}
-
-static int au_test_shwh(struct super_block *sb, const struct qstr *name)
-{
-	if (unlikely(!au_opt_test(au_mntflags(sb), SHWH)
-		     && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)))
-		return -EPERM;
-	return 0;
-}
-
-/*
- * returns the number of lower positive dentries,
- * otherwise an error.
- * can be called at unlinking with @type is zero.
- */
-int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type)
-{
-	int npositive, err;
-	aufs_bindex_t bindex, btail, bdiropq;
-	unsigned char isdir, dirperm1;
-	struct qstr whname;
-	struct au_do_lookup_args args = {
-		.flags		= 0,
-		.type		= type
-	};
-	const struct qstr *name = &dentry->d_name;
-	struct dentry *parent;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	err = au_test_shwh(sb, name);
-	if (unlikely(err))
-		goto out;
-
-	err = au_wh_name_alloc(&whname, name);
-	if (unlikely(err))
-		goto out;
-
-	isdir = !!d_is_dir(dentry);
-	if (!type)
-		au_fset_lkup(args.flags, ALLOW_NEG);
-	dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1);
-
-	npositive = 0;
-	parent = dget_parent(dentry);
-	btail = au_dbtaildir(parent);
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		struct dentry *h_parent, *h_dentry;
-		struct inode *h_inode, *h_dir;
-
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry) {
-			if (d_is_positive(h_dentry))
-				npositive++;
-			if (type != S_IFDIR)
-				break;
-			continue;
-		}
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || !d_is_dir(h_parent))
-			continue;
-
-		h_dir = d_inode(h_parent);
-		mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
-		h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname,
-					&args);
-		mutex_unlock(&h_dir->i_mutex);
-		err = PTR_ERR(h_dentry);
-		if (IS_ERR(h_dentry))
-			goto out_parent;
-		if (h_dentry)
-			au_fclr_lkup(args.flags, ALLOW_NEG);
-		if (dirperm1)
-			au_fset_lkup(args.flags, IGNORE_PERM);
-
-		if (au_dbwh(dentry) >= 0)
-			break;
-		if (!h_dentry)
-			continue;
-		if (d_is_negative(h_dentry))
-			continue;
-		h_inode = d_inode(h_dentry);
-		npositive++;
-		if (!args.type)
-			args.type = h_inode->i_mode & S_IFMT;
-		if (args.type != S_IFDIR)
-			break;
-		else if (isdir) {
-			/* the type of lower may be different */
-			bdiropq = au_dbdiropq(dentry);
-			if (bdiropq >= 0 && bdiropq <= bindex)
-				break;
-		}
-	}
-
-	if (npositive) {
-		AuLabel(positive);
-		au_update_dbstart(dentry);
-	}
-	err = npositive;
-	if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE)
-		     && au_dbstart(dentry) < 0)) {
-		err = -EIO;
-		AuIOErr("both of real entry and whiteout found, %pd, err %d\n",
-			dentry, err);
-	}
-
-out_parent:
-	dput(parent);
-	kfree(whname.name);
-out:
-	return err;
-}
-
-struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent)
-{
-	struct dentry *dentry;
-	int wkq_err;
-
-	if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC))
-		dentry = vfsub_lkup_one(name, parent);
-	else {
-		struct vfsub_lkup_one_args args = {
-			.errp	= &dentry,
-			.name	= name,
-			.parent	= parent
-		};
-
-		wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args);
-		if (unlikely(wkq_err))
-			dentry = ERR_PTR(wkq_err);
-	}
-
-	return dentry;
-}
-
-/*
- * lookup @dentry on @bindex which should be negative.
- */
-int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh)
-{
-	int err;
-	struct dentry *parent, *h_parent, *h_dentry;
-	struct au_branch *br;
-
-	parent = dget_parent(dentry);
-	h_parent = au_h_dptr(parent, bindex);
-	br = au_sbr(dentry->d_sb, bindex);
-	if (wh)
-		h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
-	else
-		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
-	err = PTR_ERR(h_dentry);
-	if (IS_ERR(h_dentry))
-		goto out;
-	if (unlikely(d_is_positive(h_dentry))) {
-		err = -EIO;
-		AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex);
-		dput(h_dentry);
-		goto out;
-	}
-
-	err = 0;
-	if (bindex < au_dbstart(dentry))
-		au_set_dbstart(dentry, bindex);
-	if (au_dbend(dentry) < bindex)
-		au_set_dbend(dentry, bindex);
-	au_set_h_dptr(dentry, bindex, h_dentry);
-
-out:
-	dput(parent);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* subset of struct inode */
-struct au_iattr {
-	unsigned long		i_ino;
-	/* unsigned int		i_nlink; */
-	kuid_t			i_uid;
-	kgid_t			i_gid;
-	u64			i_version;
-/*
-	loff_t			i_size;
-	blkcnt_t		i_blocks;
-*/
-	umode_t			i_mode;
-};
-
-static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode)
-{
-	ia->i_ino = h_inode->i_ino;
-	/* ia->i_nlink = h_inode->i_nlink; */
-	ia->i_uid = h_inode->i_uid;
-	ia->i_gid = h_inode->i_gid;
-	ia->i_version = h_inode->i_version;
-/*
-	ia->i_size = h_inode->i_size;
-	ia->i_blocks = h_inode->i_blocks;
-*/
-	ia->i_mode = (h_inode->i_mode & S_IFMT);
-}
-
-static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode)
-{
-	return ia->i_ino != h_inode->i_ino
-		/* || ia->i_nlink != h_inode->i_nlink */
-		|| !uid_eq(ia->i_uid, h_inode->i_uid)
-		|| !gid_eq(ia->i_gid, h_inode->i_gid)
-		|| ia->i_version != h_inode->i_version
-/*
-		|| ia->i_size != h_inode->i_size
-		|| ia->i_blocks != h_inode->i_blocks
-*/
-		|| ia->i_mode != (h_inode->i_mode & S_IFMT);
-}
-
-static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent,
-			      struct au_branch *br)
-{
-	int err;
-	struct au_iattr ia;
-	struct inode *h_inode;
-	struct dentry *h_d;
-	struct super_block *h_sb;
-
-	err = 0;
-	memset(&ia, -1, sizeof(ia));
-	h_sb = h_dentry->d_sb;
-	h_inode = NULL;
-	if (d_is_positive(h_dentry)) {
-		h_inode = d_inode(h_dentry);
-		au_iattr_save(&ia, h_inode);
-	} else if (au_test_nfs(h_sb) || au_test_fuse(h_sb))
-		/* nfs d_revalidate may return 0 for negative dentry */
-		/* fuse d_revalidate always return 0 for negative dentry */
-		goto out;
-
-	/* main purpose is namei.c:cached_lookup() and d_revalidate */
-	h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent);
-	err = PTR_ERR(h_d);
-	if (IS_ERR(h_d))
-		goto out;
-
-	err = 0;
-	if (unlikely(h_d != h_dentry
-		     || d_inode(h_d) != h_inode
-		     || (h_inode && au_iattr_test(&ia, h_inode))))
-		err = au_busy_or_stale();
-	dput(h_d);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
-		struct dentry *h_parent, struct au_branch *br)
-{
-	int err;
-
-	err = 0;
-	if (udba == AuOpt_UDBA_REVAL
-	    && !au_test_fs_remote(h_dentry->d_sb)) {
-		IMustLock(h_dir);
-		err = (d_inode(h_dentry->d_parent) != h_dir);
-	} else if (udba != AuOpt_UDBA_NONE)
-		err = au_h_verify_dentry(h_dentry, h_parent, br);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent)
-{
-	int err;
-	aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq;
-	struct au_hdentry tmp, *p, *q;
-	struct au_dinfo *dinfo;
-	struct super_block *sb;
-
-	DiMustWriteLock(dentry);
-
-	sb = dentry->d_sb;
-	dinfo = au_di(dentry);
-	bend = dinfo->di_bend;
-	bwh = dinfo->di_bwh;
-	bdiropq = dinfo->di_bdiropq;
-	p = dinfo->di_hdentry + dinfo->di_bstart;
-	for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) {
-		if (!p->hd_dentry)
-			continue;
-
-		new_bindex = au_br_index(sb, p->hd_id);
-		if (new_bindex == bindex)
-			continue;
-
-		if (dinfo->di_bwh == bindex)
-			bwh = new_bindex;
-		if (dinfo->di_bdiropq == bindex)
-			bdiropq = new_bindex;
-		if (new_bindex < 0) {
-			au_hdput(p);
-			p->hd_dentry = NULL;
-			continue;
-		}
-
-		/* swap two lower dentries, and loop again */
-		q = dinfo->di_hdentry + new_bindex;
-		tmp = *q;
-		*q = *p;
-		*p = tmp;
-		if (tmp.hd_dentry) {
-			bindex--;
-			p--;
-		}
-	}
-
-	dinfo->di_bwh = -1;
-	if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh))
-		dinfo->di_bwh = bwh;
-
-	dinfo->di_bdiropq = -1;
-	if (bdiropq >= 0
-	    && bdiropq <= au_sbend(sb)
-	    && au_sbr_whable(sb, bdiropq))
-		dinfo->di_bdiropq = bdiropq;
-
-	err = -EIO;
-	dinfo->di_bstart = -1;
-	dinfo->di_bend = -1;
-	bend = au_dbend(parent);
-	p = dinfo->di_hdentry;
-	for (bindex = 0; bindex <= bend; bindex++, p++)
-		if (p->hd_dentry) {
-			dinfo->di_bstart = bindex;
-			break;
-		}
-
-	if (dinfo->di_bstart >= 0) {
-		p = dinfo->di_hdentry + bend;
-		for (bindex = bend; bindex >= 0; bindex--, p--)
-			if (p->hd_dentry) {
-				dinfo->di_bend = bindex;
-				err = 0;
-				break;
-			}
-	}
-
-	return err;
-}
-
-static void au_do_hide(struct dentry *dentry)
-{
-	struct inode *inode;
-
-	if (d_really_is_positive(dentry)) {
-		inode = d_inode(dentry);
-		if (!d_is_dir(dentry)) {
-			if (inode->i_nlink && !d_unhashed(dentry))
-				drop_nlink(inode);
-		} else {
-			clear_nlink(inode);
-			/* stop next lookup */
-			inode->i_flags |= S_DEAD;
-		}
-		smp_mb(); /* necessary? */
-	}
-	d_drop(dentry);
-}
-
-static int au_hide_children(struct dentry *parent)
-{
-	int err, i, j, ndentry;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry *dentry;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, parent, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	/* in reverse order */
-	for (i = dpages.ndpage - 1; i >= 0; i--) {
-		dpage = dpages.dpages + i;
-		ndentry = dpage->ndentry;
-		for (j = ndentry - 1; j >= 0; j--) {
-			dentry = dpage->dentries[j];
-			if (dentry != parent)
-				au_do_hide(dentry);
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static void au_hide(struct dentry *dentry)
-{
-	int err;
-
-	AuDbgDentry(dentry);
-	if (d_is_dir(dentry)) {
-		/* shrink_dcache_parent(dentry); */
-		err = au_hide_children(dentry);
-		if (unlikely(err))
-			AuIOErr("%pd, failed hiding children, ignored %d\n",
-				dentry, err);
-	}
-	au_do_hide(dentry);
-}
-
-/*
- * By adding a dirty branch, a cached dentry may be affected in various ways.
- *
- * a dirty branch is added
- * - on the top of layers
- * - in the middle of layers
- * - to the bottom of layers
- *
- * on the added branch there exists
- * - a whiteout
- * - a diropq
- * - a same named entry
- *   + exist
- *     * negative --> positive
- *     * positive --> positive
- *	 - type is unchanged
- *	 - type is changed
- *   + doesn't exist
- *     * negative --> negative
- *     * positive --> negative (rejected by au_br_del() for non-dir case)
- * - none
- */
-static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo,
-			       struct au_dinfo *tmp)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	struct {
-		struct dentry *dentry;
-		struct inode *inode;
-		mode_t mode;
-	} orig_h, tmp_h;
-	struct au_hdentry *hd;
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry;
-
-	err = 0;
-	AuDebugOn(dinfo->di_bstart < 0);
-	orig_h.mode = 0;
-	orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry;
-	orig_h.inode = NULL;
-	if (d_is_positive(orig_h.dentry)) {
-		orig_h.inode = d_inode(orig_h.dentry);
-		orig_h.mode = orig_h.inode->i_mode & S_IFMT;
-	}
-	memset(&tmp_h, 0, sizeof(tmp_h));
-	if (tmp->di_bstart >= 0) {
-		tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry;
-		tmp_h.inode = NULL;
-		if (d_is_positive(tmp_h.dentry)) {
-			tmp_h.inode = d_inode(tmp_h.dentry);
-			tmp_h.mode = tmp_h.inode->i_mode & S_IFMT;
-		}
-	}
-
-	inode = NULL;
-	if (d_really_is_positive(dentry))
-		inode = d_inode(dentry);
-	if (!orig_h.inode) {
-		AuDbg("nagative originally\n");
-		if (inode) {
-			au_hide(dentry);
-			goto out;
-		}
-		AuDebugOn(inode);
-		AuDebugOn(dinfo->di_bstart != dinfo->di_bend);
-		AuDebugOn(dinfo->di_bdiropq != -1);
-
-		if (!tmp_h.inode) {
-			AuDbg("negative --> negative\n");
-			/* should have only one negative lower */
-			if (tmp->di_bstart >= 0
-			    && tmp->di_bstart < dinfo->di_bstart) {
-				AuDebugOn(tmp->di_bstart != tmp->di_bend);
-				AuDebugOn(dinfo->di_bstart != dinfo->di_bend);
-				au_set_h_dptr(dentry, dinfo->di_bstart, NULL);
-				au_di_cp(dinfo, tmp);
-				hd = tmp->di_hdentry + tmp->di_bstart;
-				au_set_h_dptr(dentry, tmp->di_bstart,
-					      dget(hd->hd_dentry));
-			}
-			au_dbg_verify_dinode(dentry);
-		} else {
-			AuDbg("negative --> positive\n");
-			/*
-			 * similar to the behaviour of creating with bypassing
-			 * aufs.
-			 * unhash it in order to force an error in the
-			 * succeeding create operation.
-			 * we should not set S_DEAD here.
-			 */
-			d_drop(dentry);
-			/* au_di_swap(tmp, dinfo); */
-			au_dbg_verify_dinode(dentry);
-		}
-	} else {
-		AuDbg("positive originally\n");
-		/* inode may be NULL */
-		AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode);
-		if (!tmp_h.inode) {
-			AuDbg("positive --> negative\n");
-			/* or bypassing aufs */
-			au_hide(dentry);
-			if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart)
-				dinfo->di_bwh = tmp->di_bwh;
-			if (inode)
-				err = au_refresh_hinode_self(inode);
-			au_dbg_verify_dinode(dentry);
-		} else if (orig_h.mode == tmp_h.mode) {
-			AuDbg("positive --> positive, same type\n");
-			if (!S_ISDIR(orig_h.mode)
-			    && dinfo->di_bstart > tmp->di_bstart) {
-				/*
-				 * similar to the behaviour of removing and
-				 * creating.
-				 */
-				au_hide(dentry);
-				if (inode)
-					err = au_refresh_hinode_self(inode);
-				au_dbg_verify_dinode(dentry);
-			} else {
-				/* fill empty slots */
-				if (dinfo->di_bstart > tmp->di_bstart)
-					dinfo->di_bstart = tmp->di_bstart;
-				if (dinfo->di_bend < tmp->di_bend)
-					dinfo->di_bend = tmp->di_bend;
-				dinfo->di_bwh = tmp->di_bwh;
-				dinfo->di_bdiropq = tmp->di_bdiropq;
-				hd = tmp->di_hdentry;
-				bend = dinfo->di_bend;
-				for (bindex = tmp->di_bstart; bindex <= bend;
-				     bindex++) {
-					if (au_h_dptr(dentry, bindex))
-						continue;
-					h_dentry = hd[bindex].hd_dentry;
-					if (!h_dentry)
-						continue;
-					AuDebugOn(d_is_negative(h_dentry));
-					h_inode = d_inode(h_dentry);
-					AuDebugOn(orig_h.mode
-						  != (h_inode->i_mode
-						      & S_IFMT));
-					au_set_h_dptr(dentry, bindex,
-						      dget(h_dentry));
-				}
-				err = au_refresh_hinode(inode, dentry);
-				au_dbg_verify_dinode(dentry);
-			}
-		} else {
-			AuDbg("positive --> positive, different type\n");
-			/* similar to the behaviour of removing and creating */
-			au_hide(dentry);
-			if (inode)
-				err = au_refresh_hinode_self(inode);
-			au_dbg_verify_dinode(dentry);
-		}
-	}
-
-out:
-	return err;
-}
-
-int au_refresh_dentry(struct dentry *dentry, struct dentry *parent)
-{
-	int err, ebrange;
-	unsigned int sigen;
-	struct au_dinfo *dinfo, *tmp;
-	struct super_block *sb;
-	struct inode *inode;
-
-	DiMustWriteLock(dentry);
-	AuDebugOn(IS_ROOT(dentry));
-	AuDebugOn(d_really_is_negative(parent));
-
-	sb = dentry->d_sb;
-	sigen = au_sigen(sb);
-	err = au_digen_test(parent, sigen);
-	if (unlikely(err))
-		goto out;
-
-	dinfo = au_di(dentry);
-	err = au_di_realloc(dinfo, au_sbend(sb) + 1);
-	if (unlikely(err))
-		goto out;
-	ebrange = au_dbrange_test(dentry);
-	if (!ebrange)
-		ebrange = au_do_refresh_hdentry(dentry, parent);
-
-	if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) {
-		AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0);
-		if (d_really_is_positive(dentry)) {
-			inode = d_inode(dentry);
-			err = au_refresh_hinode_self(inode);
-		}
-		au_dbg_verify_dinode(dentry);
-		if (!err)
-			goto out_dgen; /* success */
-		goto out;
-	}
-
-	/* temporary dinfo */
-	AuDbgDentry(dentry);
-	err = -ENOMEM;
-	tmp = au_di_alloc(sb, AuLsc_DI_TMP);
-	if (unlikely(!tmp))
-		goto out;
-	au_di_swap(tmp, dinfo);
-	/* returns the number of positive dentries */
-	/*
-	 * if current working dir is removed, it returns an error.
-	 * but the dentry is legal.
-	 */
-	err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0);
-	AuDbgDentry(dentry);
-	au_di_swap(tmp, dinfo);
-	if (err == -ENOENT)
-		err = 0;
-	if (err >= 0) {
-		/* compare/refresh by dinfo */
-		AuDbgDentry(dentry);
-		err = au_refresh_by_dinfo(dentry, dinfo, tmp);
-		au_dbg_verify_dinode(dentry);
-		AuTraceErr(err);
-	}
-	au_rw_write_unlock(&tmp->di_rwsem);
-	au_di_free(tmp);
-	if (unlikely(err))
-		goto out;
-
-out_dgen:
-	au_update_digen(dentry);
-out:
-	if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) {
-		AuIOErr("failed refreshing %pd, %d\n", dentry, err);
-		AuDbgDentry(dentry);
-	}
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags,
-			   struct dentry *dentry, aufs_bindex_t bindex)
-{
-	int err, valid;
-
-	err = 0;
-	if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE))
-		goto out;
-
-	AuDbg("b%d\n", bindex);
-	/*
-	 * gave up supporting LOOKUP_CREATE/OPEN for lower fs,
-	 * due to whiteout and branch permission.
-	 */
-	flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE
-		   | LOOKUP_FOLLOW | LOOKUP_EXCL);
-	/* it may return tri-state */
-	valid = h_dentry->d_op->d_revalidate(h_dentry, flags);
-
-	if (unlikely(valid < 0))
-		err = valid;
-	else if (!valid)
-		err = -EINVAL;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* todo: remove this */
-static int h_d_revalidate(struct dentry *dentry, struct inode *inode,
-			  unsigned int flags, int do_udba)
-{
-	int err;
-	umode_t mode, h_mode;
-	aufs_bindex_t bindex, btail, bstart, ibs, ibe;
-	unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile;
-	struct inode *h_inode, *h_cached_inode;
-	struct dentry *h_dentry;
-	struct qstr *name, *h_name;
-
-	err = 0;
-	plus = 0;
-	mode = 0;
-	ibs = -1;
-	ibe = -1;
-	unhashed = !!d_unhashed(dentry);
-	is_root = !!IS_ROOT(dentry);
-	name = &dentry->d_name;
-	tmpfile = au_di(dentry)->di_tmpfile;
-
-	/*
-	 * Theoretically, REVAL test should be unnecessary in case of
-	 * {FS,I}NOTIFY.
-	 * But {fs,i}notify doesn't fire some necessary events,
-	 *	IN_ATTRIB for atime/nlink/pageio
-	 * Let's do REVAL test too.
-	 */
-	if (do_udba && inode) {
-		mode = (inode->i_mode & S_IFMT);
-		plus = (inode->i_nlink > 0);
-		ibs = au_ibstart(inode);
-		ibe = au_ibend(inode);
-	}
-
-	bstart = au_dbstart(dentry);
-	btail = bstart;
-	if (inode && S_ISDIR(inode->i_mode))
-		btail = au_dbtaildir(dentry);
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-
-		AuDbg("b%d, %pd\n", bindex, h_dentry);
-		h_nfs = !!au_test_nfs(h_dentry->d_sb);
-		spin_lock(&h_dentry->d_lock);
-		h_name = &h_dentry->d_name;
-		if (unlikely(do_udba
-			     && !is_root
-			     && ((!h_nfs
-				  && (unhashed != !!d_unhashed(h_dentry)
-				      || (!tmpfile
-					  && !au_qstreq(name, h_name))
-					  ))
-				 || (h_nfs
-				     && !(flags & LOOKUP_OPEN)
-				     && (h_dentry->d_flags
-					 & DCACHE_NFSFS_RENAMED)))
-			    )) {
-			int h_unhashed;
-
-			h_unhashed = d_unhashed(h_dentry);
-			spin_unlock(&h_dentry->d_lock);
-			AuDbg("unhash 0x%x 0x%x, %pd %pd\n",
-			      unhashed, h_unhashed, dentry, h_dentry);
-			goto err;
-		}
-		spin_unlock(&h_dentry->d_lock);
-
-		err = au_do_h_d_reval(h_dentry, flags, dentry, bindex);
-		if (unlikely(err))
-			/* do not goto err, to keep the errno */
-			break;
-
-		/* todo: plink too? */
-		if (!do_udba)
-			continue;
-
-		/* UDBA tests */
-		if (unlikely(!!inode != d_is_positive(h_dentry)))
-			goto err;
-
-		h_inode = NULL;
-		if (d_is_positive(h_dentry))
-			h_inode = d_inode(h_dentry);
-		h_plus = plus;
-		h_mode = mode;
-		h_cached_inode = h_inode;
-		if (h_inode) {
-			h_mode = (h_inode->i_mode & S_IFMT);
-			h_plus = (h_inode->i_nlink > 0);
-		}
-		if (inode && ibs <= bindex && bindex <= ibe)
-			h_cached_inode = au_h_iptr(inode, bindex);
-
-		if (!h_nfs) {
-			if (unlikely(plus != h_plus && !tmpfile))
-				goto err;
-		} else {
-			if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED)
-				     && !is_root
-				     && !IS_ROOT(h_dentry)
-				     && unhashed != d_unhashed(h_dentry)))
-				goto err;
-		}
-		if (unlikely(mode != h_mode
-			     || h_cached_inode != h_inode))
-			goto err;
-		continue;
-
-err:
-		err = -EINVAL;
-		break;
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-/* todo: consolidate with do_refresh() and au_reval_for_attr() */
-static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-	struct dentry *parent;
-
-	if (!au_digen_test(dentry, sigen))
-		return 0;
-
-	parent = dget_parent(dentry);
-	di_read_lock_parent(parent, AuLock_IR);
-	AuDebugOn(au_digen_test(parent, sigen));
-	au_dbg_verify_gen(parent, sigen);
-	err = au_refresh_dentry(dentry, parent);
-	di_read_unlock(parent, AuLock_IR);
-	dput(parent);
-	AuTraceErr(err);
-	return err;
-}
-
-int au_reval_dpath(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-	struct dentry *d, *parent;
-
-	if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR))
-		return simple_reval_dpath(dentry, sigen);
-
-	/* slow loop, keep it simple and stupid */
-	/* cf: au_cpup_dirs() */
-	err = 0;
-	parent = NULL;
-	while (au_digen_test(dentry, sigen)) {
-		d = dentry;
-		while (1) {
-			dput(parent);
-			parent = dget_parent(d);
-			if (!au_digen_test(parent, sigen))
-				break;
-			d = parent;
-		}
-
-		if (d != dentry)
-			di_write_lock_child2(d);
-
-		/* someone might update our dentry while we were sleeping */
-		if (au_digen_test(d, sigen)) {
-			/*
-			 * todo: consolidate with simple_reval_dpath(),
-			 * do_refresh() and au_reval_for_attr().
-			 */
-			di_read_lock_parent(parent, AuLock_IR);
-			err = au_refresh_dentry(d, parent);
-			di_read_unlock(parent, AuLock_IR);
-		}
-
-		if (d != dentry)
-			di_write_unlock(d);
-		dput(parent);
-		if (unlikely(err))
-			break;
-	}
-
-	return err;
-}
-
-/*
- * if valid returns 1, otherwise 0.
- */
-static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	int valid, err;
-	unsigned int sigen;
-	unsigned char do_udba;
-	struct super_block *sb;
-	struct inode *inode;
-
-	/* todo: support rcu-walk? */
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	valid = 0;
-	if (unlikely(!au_di(dentry)))
-		goto out;
-
-	valid = 1;
-	sb = dentry->d_sb;
-	/*
-	 * todo: very ugly
-	 * i_mutex of parent dir may be held,
-	 * but we should not return 'invalid' due to busy.
-	 */
-	err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM);
-	if (unlikely(err)) {
-		valid = err;
-		AuTraceErr(err);
-		goto out;
-	}
-	inode = NULL;
-	if (d_really_is_positive(dentry))
-		inode = d_inode(dentry);
-	if (unlikely(inode && is_bad_inode(inode))) {
-		err = -EINVAL;
-		AuTraceErr(err);
-		goto out_dgrade;
-	}
-	if (unlikely(au_dbrange_test(dentry))) {
-		err = -EINVAL;
-		AuTraceErr(err);
-		goto out_dgrade;
-	}
-
-	sigen = au_sigen(sb);
-	if (au_digen_test(dentry, sigen)) {
-		AuDebugOn(IS_ROOT(dentry));
-		err = au_reval_dpath(dentry, sigen);
-		if (unlikely(err)) {
-			AuTraceErr(err);
-			goto out_dgrade;
-		}
-	}
-	di_downgrade_lock(dentry, AuLock_IR);
-
-	err = -EINVAL;
-	if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY))
-	    && inode
-	    && !(inode->i_state && I_LINKABLE)
-	    && (IS_DEADDIR(inode) || !inode->i_nlink))
-		goto out_inval;
-
-	do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE);
-	if (do_udba && inode) {
-		aufs_bindex_t bstart = au_ibstart(inode);
-		struct inode *h_inode;
-
-		if (bstart >= 0) {
-			h_inode = au_h_iptr(inode, bstart);
-			if (h_inode && au_test_higen(inode, h_inode))
-				goto out_inval;
-		}
-	}
-
-	err = h_d_revalidate(dentry, inode, flags, do_udba);
-	if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) {
-		err = -EIO;
-		AuDbg("both of real entry and whiteout found, %p, err %d\n",
-		      dentry, err);
-	}
-	goto out_inval;
-
-out_dgrade:
-	di_downgrade_lock(dentry, AuLock_IR);
-out_inval:
-	aufs_read_unlock(dentry, AuLock_IR);
-	AuTraceErr(err);
-	valid = !err;
-out:
-	if (!valid) {
-		AuDbg("%pd invalid, %d\n", dentry, valid);
-		d_drop(dentry);
-	}
-	return valid;
-}
-
-static void aufs_d_release(struct dentry *dentry)
-{
-	if (au_di(dentry)) {
-		au_di_fin(dentry);
-		au_hn_di_reinit(dentry);
-	}
-}
-
-const struct dentry_operations aufs_dop = {
-	.d_revalidate		= aufs_d_revalidate,
-	.d_weak_revalidate	= aufs_d_revalidate,
-	.d_release		= aufs_d_release
-};
diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h
deleted file mode 100644
index 2ce602e1a..000000000
--- a/fs/aufs/dentry.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * lookup and dentry operations
- */
-
-#ifndef __AUFS_DENTRY_H__
-#define __AUFS_DENTRY_H__
-
-#ifdef __KERNEL__
-
-#include <linux/dcache.h>
-#include "rwsem.h"
-
-struct au_hdentry {
-	struct dentry		*hd_dentry;
-	aufs_bindex_t		hd_id;
-};
-
-struct au_dinfo {
-	atomic_t		di_generation;
-
-	struct au_rwsem		di_rwsem;
-	aufs_bindex_t		di_bstart, di_bend, di_bwh, di_bdiropq;
-	unsigned char		di_tmpfile; /* to allow the different name */
-	struct au_hdentry	*di_hdentry;
-} ____cacheline_aligned_in_smp;
-
-/* ---------------------------------------------------------------------- */
-
-/* dentry.c */
-extern const struct dentry_operations aufs_dop;
-struct au_branch;
-struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent);
-int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
-		struct dentry *h_parent, struct au_branch *br);
-
-int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type);
-int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh);
-int au_refresh_dentry(struct dentry *dentry, struct dentry *parent);
-int au_reval_dpath(struct dentry *dentry, unsigned int sigen);
-
-/* dinfo.c */
-void au_di_init_once(void *_di);
-struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc);
-void au_di_free(struct au_dinfo *dinfo);
-void au_di_swap(struct au_dinfo *a, struct au_dinfo *b);
-void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src);
-int au_di_init(struct dentry *dentry);
-void au_di_fin(struct dentry *dentry);
-int au_di_realloc(struct au_dinfo *dinfo, int nbr);
-
-void di_read_lock(struct dentry *d, int flags, unsigned int lsc);
-void di_read_unlock(struct dentry *d, int flags);
-void di_downgrade_lock(struct dentry *d, int flags);
-void di_write_lock(struct dentry *d, unsigned int lsc);
-void di_write_unlock(struct dentry *d);
-void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir);
-void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir);
-void di_write_unlock2(struct dentry *d1, struct dentry *d2);
-
-struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex);
-struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex);
-aufs_bindex_t au_dbtail(struct dentry *dentry);
-aufs_bindex_t au_dbtaildir(struct dentry *dentry);
-
-void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
-		   struct dentry *h_dentry);
-int au_digen_test(struct dentry *dentry, unsigned int sigen);
-int au_dbrange_test(struct dentry *dentry);
-void au_update_digen(struct dentry *dentry);
-void au_update_dbrange(struct dentry *dentry, int do_put_zero);
-void au_update_dbstart(struct dentry *dentry);
-void au_update_dbend(struct dentry *dentry);
-int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry);
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_dinfo *au_di(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* lock subclass for dinfo */
-enum {
-	AuLsc_DI_CHILD,		/* child first */
-	AuLsc_DI_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
-	AuLsc_DI_CHILD3,	/* copyup dirs */
-	AuLsc_DI_PARENT,
-	AuLsc_DI_PARENT2,
-	AuLsc_DI_PARENT3,
-	AuLsc_DI_TMP		/* temp for replacing dinfo */
-};
-
-/*
- * di_read_lock_child, di_write_lock_child,
- * di_read_lock_child2, di_write_lock_child2,
- * di_read_lock_child3, di_write_lock_child3,
- * di_read_lock_parent, di_write_lock_parent,
- * di_read_lock_parent2, di_write_lock_parent2,
- * di_read_lock_parent3, di_write_lock_parent3,
- */
-#define AuReadLockFunc(name, lsc) \
-static inline void di_read_lock_##name(struct dentry *d, int flags) \
-{ di_read_lock(d, flags, AuLsc_DI_##lsc); }
-
-#define AuWriteLockFunc(name, lsc) \
-static inline void di_write_lock_##name(struct dentry *d) \
-{ di_write_lock(d, AuLsc_DI_##lsc); }
-
-#define AuRWLockFuncs(name, lsc) \
-	AuReadLockFunc(name, lsc) \
-	AuWriteLockFunc(name, lsc)
-
-AuRWLockFuncs(child, CHILD);
-AuRWLockFuncs(child2, CHILD2);
-AuRWLockFuncs(child3, CHILD3);
-AuRWLockFuncs(parent, PARENT);
-AuRWLockFuncs(parent2, PARENT2);
-AuRWLockFuncs(parent3, PARENT3);
-
-#undef AuReadLockFunc
-#undef AuWriteLockFunc
-#undef AuRWLockFuncs
-
-#define DiMustNoWaiters(d)	AuRwMustNoWaiters(&au_di(d)->di_rwsem)
-#define DiMustAnyLock(d)	AuRwMustAnyLock(&au_di(d)->di_rwsem)
-#define DiMustWriteLock(d)	AuRwMustWriteLock(&au_di(d)->di_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: memory barrier? */
-static inline unsigned int au_digen(struct dentry *d)
-{
-	return atomic_read(&au_di(d)->di_generation);
-}
-
-static inline void au_h_dentry_init(struct au_hdentry *hdentry)
-{
-	hdentry->hd_dentry = NULL;
-}
-
-static inline void au_hdput(struct au_hdentry *hd)
-{
-	if (hd)
-		dput(hd->hd_dentry);
-}
-
-static inline aufs_bindex_t au_dbstart(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bstart;
-}
-
-static inline aufs_bindex_t au_dbend(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bend;
-}
-
-static inline aufs_bindex_t au_dbwh(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bwh;
-}
-
-static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bdiropq;
-}
-
-/* todo: hard/soft set? */
-static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	au_di(dentry)->di_bstart = bindex;
-}
-
-static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	au_di(dentry)->di_bend = bindex;
-}
-
-static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	/* dbwh can be outside of bstart - bend range */
-	au_di(dentry)->di_bwh = bindex;
-}
-
-static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	au_di(dentry)->di_bdiropq = bindex;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_HNOTIFY
-static inline void au_digen_dec(struct dentry *d)
-{
-	atomic_dec(&au_di(d)->di_generation);
-}
-
-static inline void au_hn_di_reinit(struct dentry *dentry)
-{
-	dentry->d_fsdata = NULL;
-}
-#else
-AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused)
-#endif /* CONFIG_AUFS_HNOTIFY */
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DENTRY_H__ */
diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c
deleted file mode 100644
index 7b9ca3b88..000000000
--- a/fs/aufs/dinfo.c
+++ /dev/null
@@ -1,550 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * dentry private data
- */
-
-#include "aufs.h"
-
-void au_di_init_once(void *_dinfo)
-{
-	struct au_dinfo *dinfo = _dinfo;
-	static struct lock_class_key aufs_di;
-
-	au_rw_init(&dinfo->di_rwsem);
-	au_rw_class(&dinfo->di_rwsem, &aufs_di);
-}
-
-struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc)
-{
-	struct au_dinfo *dinfo;
-	int nbr, i;
-
-	dinfo = au_cache_alloc_dinfo();
-	if (unlikely(!dinfo))
-		goto out;
-
-	nbr = au_sbend(sb) + 1;
-	if (nbr <= 0)
-		nbr = 1;
-	dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS);
-	if (dinfo->di_hdentry) {
-		au_rw_write_lock_nested(&dinfo->di_rwsem, lsc);
-		dinfo->di_bstart = -1;
-		dinfo->di_bend = -1;
-		dinfo->di_bwh = -1;
-		dinfo->di_bdiropq = -1;
-		dinfo->di_tmpfile = 0;
-		for (i = 0; i < nbr; i++)
-			dinfo->di_hdentry[i].hd_id = -1;
-		goto out;
-	}
-
-	au_cache_free_dinfo(dinfo);
-	dinfo = NULL;
-
-out:
-	return dinfo;
-}
-
-void au_di_free(struct au_dinfo *dinfo)
-{
-	struct au_hdentry *p;
-	aufs_bindex_t bend, bindex;
-
-	/* dentry may not be revalidated */
-	bindex = dinfo->di_bstart;
-	if (bindex >= 0) {
-		bend = dinfo->di_bend;
-		p = dinfo->di_hdentry + bindex;
-		while (bindex++ <= bend)
-			au_hdput(p++);
-	}
-	kfree(dinfo->di_hdentry);
-	au_cache_free_dinfo(dinfo);
-}
-
-void au_di_swap(struct au_dinfo *a, struct au_dinfo *b)
-{
-	struct au_hdentry *p;
-	aufs_bindex_t bi;
-
-	AuRwMustWriteLock(&a->di_rwsem);
-	AuRwMustWriteLock(&b->di_rwsem);
-
-#define DiSwap(v, name)				\
-	do {					\
-		v = a->di_##name;		\
-		a->di_##name = b->di_##name;	\
-		b->di_##name = v;		\
-	} while (0)
-
-	DiSwap(p, hdentry);
-	DiSwap(bi, bstart);
-	DiSwap(bi, bend);
-	DiSwap(bi, bwh);
-	DiSwap(bi, bdiropq);
-	/* smp_mb(); */
-
-#undef DiSwap
-}
-
-void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src)
-{
-	AuRwMustWriteLock(&dst->di_rwsem);
-	AuRwMustWriteLock(&src->di_rwsem);
-
-	dst->di_bstart = src->di_bstart;
-	dst->di_bend = src->di_bend;
-	dst->di_bwh = src->di_bwh;
-	dst->di_bdiropq = src->di_bdiropq;
-	/* smp_mb(); */
-}
-
-int au_di_init(struct dentry *dentry)
-{
-	int err;
-	struct super_block *sb;
-	struct au_dinfo *dinfo;
-
-	err = 0;
-	sb = dentry->d_sb;
-	dinfo = au_di_alloc(sb, AuLsc_DI_CHILD);
-	if (dinfo) {
-		atomic_set(&dinfo->di_generation, au_sigen(sb));
-		/* smp_mb(); */ /* atomic_set */
-		dentry->d_fsdata = dinfo;
-	} else
-		err = -ENOMEM;
-
-	return err;
-}
-
-void au_di_fin(struct dentry *dentry)
-{
-	struct au_dinfo *dinfo;
-
-	dinfo = au_di(dentry);
-	AuRwDestroy(&dinfo->di_rwsem);
-	au_di_free(dinfo);
-}
-
-int au_di_realloc(struct au_dinfo *dinfo, int nbr)
-{
-	int err, sz;
-	struct au_hdentry *hdp;
-
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	err = -ENOMEM;
-	sz = sizeof(*hdp) * (dinfo->di_bend + 1);
-	if (!sz)
-		sz = sizeof(*hdp);
-	hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS);
-	if (hdp) {
-		dinfo->di_hdentry = hdp;
-		err = 0;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void do_ii_write_lock(struct inode *inode, unsigned int lsc)
-{
-	switch (lsc) {
-	case AuLsc_DI_CHILD:
-		ii_write_lock_child(inode);
-		break;
-	case AuLsc_DI_CHILD2:
-		ii_write_lock_child2(inode);
-		break;
-	case AuLsc_DI_CHILD3:
-		ii_write_lock_child3(inode);
-		break;
-	case AuLsc_DI_PARENT:
-		ii_write_lock_parent(inode);
-		break;
-	case AuLsc_DI_PARENT2:
-		ii_write_lock_parent2(inode);
-		break;
-	case AuLsc_DI_PARENT3:
-		ii_write_lock_parent3(inode);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static void do_ii_read_lock(struct inode *inode, unsigned int lsc)
-{
-	switch (lsc) {
-	case AuLsc_DI_CHILD:
-		ii_read_lock_child(inode);
-		break;
-	case AuLsc_DI_CHILD2:
-		ii_read_lock_child2(inode);
-		break;
-	case AuLsc_DI_CHILD3:
-		ii_read_lock_child3(inode);
-		break;
-	case AuLsc_DI_PARENT:
-		ii_read_lock_parent(inode);
-		break;
-	case AuLsc_DI_PARENT2:
-		ii_read_lock_parent2(inode);
-		break;
-	case AuLsc_DI_PARENT3:
-		ii_read_lock_parent3(inode);
-		break;
-	default:
-		BUG();
-	}
-}
-
-void di_read_lock(struct dentry *d, int flags, unsigned int lsc)
-{
-	struct inode *inode;
-
-	au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc);
-	if (d_really_is_positive(d)) {
-		inode = d_inode(d);
-		if (au_ftest_lock(flags, IW))
-			do_ii_write_lock(inode, lsc);
-		else if (au_ftest_lock(flags, IR))
-			do_ii_read_lock(inode, lsc);
-	}
-}
-
-void di_read_unlock(struct dentry *d, int flags)
-{
-	struct inode *inode;
-
-	if (d_really_is_positive(d)) {
-		inode = d_inode(d);
-		if (au_ftest_lock(flags, IW)) {
-			au_dbg_verify_dinode(d);
-			ii_write_unlock(inode);
-		} else if (au_ftest_lock(flags, IR)) {
-			au_dbg_verify_dinode(d);
-			ii_read_unlock(inode);
-		}
-	}
-	au_rw_read_unlock(&au_di(d)->di_rwsem);
-}
-
-void di_downgrade_lock(struct dentry *d, int flags)
-{
-	if (d_really_is_positive(d) && au_ftest_lock(flags, IR))
-		ii_downgrade_lock(d_inode(d));
-	au_rw_dgrade_lock(&au_di(d)->di_rwsem);
-}
-
-void di_write_lock(struct dentry *d, unsigned int lsc)
-{
-	au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc);
-	if (d_really_is_positive(d))
-		do_ii_write_lock(d_inode(d), lsc);
-}
-
-void di_write_unlock(struct dentry *d)
-{
-	au_dbg_verify_dinode(d);
-	if (d_really_is_positive(d))
-		ii_write_unlock(d_inode(d));
-	au_rw_write_unlock(&au_di(d)->di_rwsem);
-}
-
-void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir)
-{
-	AuDebugOn(d1 == d2
-		  || d_inode(d1) == d_inode(d2)
-		  || d1->d_sb != d2->d_sb);
-
-	if (isdir && au_test_subdir(d1, d2)) {
-		di_write_lock_child(d1);
-		di_write_lock_child2(d2);
-	} else {
-		/* there should be no races */
-		di_write_lock_child(d2);
-		di_write_lock_child2(d1);
-	}
-}
-
-void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir)
-{
-	AuDebugOn(d1 == d2
-		  || d_inode(d1) == d_inode(d2)
-		  || d1->d_sb != d2->d_sb);
-
-	if (isdir && au_test_subdir(d1, d2)) {
-		di_write_lock_parent(d1);
-		di_write_lock_parent2(d2);
-	} else {
-		/* there should be no races */
-		di_write_lock_parent(d2);
-		di_write_lock_parent2(d1);
-	}
-}
-
-void di_write_unlock2(struct dentry *d1, struct dentry *d2)
-{
-	di_write_unlock(d1);
-	if (d_inode(d1) == d_inode(d2))
-		au_rw_write_unlock(&au_di(d2)->di_rwsem);
-	else
-		di_write_unlock(d2);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	struct dentry *d;
-
-	DiMustAnyLock(dentry);
-
-	if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
-		return NULL;
-	AuDebugOn(bindex < 0);
-	d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry;
-	AuDebugOn(d && au_dcount(d) <= 0);
-	return d;
-}
-
-/*
- * extended version of au_h_dptr().
- * returns a hashed and positive (or linkable) h_dentry in bindex, NULL, or
- * error.
- */
-struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	struct dentry *h_dentry;
-	struct inode *inode, *h_inode;
-
-	AuDebugOn(d_really_is_negative(dentry));
-
-	h_dentry = NULL;
-	if (au_dbstart(dentry) <= bindex
-	    && bindex <= au_dbend(dentry))
-		h_dentry = au_h_dptr(dentry, bindex);
-	if (h_dentry && !au_d_linkable(h_dentry)) {
-		dget(h_dentry);
-		goto out; /* success */
-	}
-
-	inode = d_inode(dentry);
-	AuDebugOn(bindex < au_ibstart(inode));
-	AuDebugOn(au_ibend(inode) < bindex);
-	h_inode = au_h_iptr(inode, bindex);
-	h_dentry = d_find_alias(h_inode);
-	if (h_dentry) {
-		if (!IS_ERR(h_dentry)) {
-			if (!au_d_linkable(h_dentry))
-				goto out; /* success */
-			dput(h_dentry);
-		} else
-			goto out;
-	}
-
-	if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) {
-		h_dentry = au_plink_lkup(inode, bindex);
-		AuDebugOn(!h_dentry);
-		if (!IS_ERR(h_dentry)) {
-			if (!au_d_hashed_positive(h_dentry))
-				goto out; /* success */
-			dput(h_dentry);
-			h_dentry = NULL;
-		}
-	}
-
-out:
-	AuDbgDentry(h_dentry);
-	return h_dentry;
-}
-
-aufs_bindex_t au_dbtail(struct dentry *dentry)
-{
-	aufs_bindex_t bend, bwh;
-
-	bend = au_dbend(dentry);
-	if (0 <= bend) {
-		bwh = au_dbwh(dentry);
-		if (!bwh)
-			return bwh;
-		if (0 < bwh && bwh < bend)
-			return bwh - 1;
-	}
-	return bend;
-}
-
-aufs_bindex_t au_dbtaildir(struct dentry *dentry)
-{
-	aufs_bindex_t bend, bopq;
-
-	bend = au_dbtail(dentry);
-	if (0 <= bend) {
-		bopq = au_dbdiropq(dentry);
-		if (0 <= bopq && bopq < bend)
-			bend = bopq;
-	}
-	return bend;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
-		   struct dentry *h_dentry)
-{
-	struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex;
-	struct au_branch *br;
-
-	DiMustWriteLock(dentry);
-
-	au_hdput(hd);
-	hd->hd_dentry = h_dentry;
-	if (h_dentry) {
-		br = au_sbr(dentry->d_sb, bindex);
-		hd->hd_id = br->br_id;
-	}
-}
-
-int au_dbrange_test(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bstart, bend;
-
-	err = 0;
-	bstart = au_dbstart(dentry);
-	bend = au_dbend(dentry);
-	if (bstart >= 0)
-		AuDebugOn(bend < 0 && bstart > bend);
-	else {
-		err = -EIO;
-		AuDebugOn(bend >= 0);
-	}
-
-	return err;
-}
-
-int au_digen_test(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-
-	err = 0;
-	if (unlikely(au_digen(dentry) != sigen
-		     || au_iigen_test(d_inode(dentry), sigen)))
-		err = -EIO;
-
-	return err;
-}
-
-void au_update_digen(struct dentry *dentry)
-{
-	atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb));
-	/* smp_mb(); */ /* atomic_set */
-}
-
-void au_update_dbrange(struct dentry *dentry, int do_put_zero)
-{
-	struct au_dinfo *dinfo;
-	struct dentry *h_d;
-	struct au_hdentry *hdp;
-
-	DiMustWriteLock(dentry);
-
-	dinfo = au_di(dentry);
-	if (!dinfo || dinfo->di_bstart < 0)
-		return;
-
-	hdp = dinfo->di_hdentry;
-	if (do_put_zero) {
-		aufs_bindex_t bindex, bend;
-
-		bend = dinfo->di_bend;
-		for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) {
-			h_d = hdp[0 + bindex].hd_dentry;
-			if (h_d && d_is_negative(h_d))
-				au_set_h_dptr(dentry, bindex, NULL);
-		}
-	}
-
-	dinfo->di_bstart = -1;
-	while (++dinfo->di_bstart <= dinfo->di_bend)
-		if (hdp[0 + dinfo->di_bstart].hd_dentry)
-			break;
-	if (dinfo->di_bstart > dinfo->di_bend) {
-		dinfo->di_bstart = -1;
-		dinfo->di_bend = -1;
-		return;
-	}
-
-	dinfo->di_bend++;
-	while (0 <= --dinfo->di_bend)
-		if (hdp[0 + dinfo->di_bend].hd_dentry)
-			break;
-	AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0);
-}
-
-void au_update_dbstart(struct dentry *dentry)
-{
-	aufs_bindex_t bindex, bend;
-	struct dentry *h_dentry;
-
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		if (d_is_positive(h_dentry)) {
-			au_set_dbstart(dentry, bindex);
-			return;
-		}
-		au_set_h_dptr(dentry, bindex, NULL);
-	}
-}
-
-void au_update_dbend(struct dentry *dentry)
-{
-	aufs_bindex_t bindex, bstart;
-	struct dentry *h_dentry;
-
-	bstart = au_dbstart(dentry);
-	for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		if (d_is_positive(h_dentry)) {
-			au_set_dbend(dentry, bindex);
-			return;
-		}
-		au_set_h_dptr(dentry, bindex, NULL);
-	}
-}
-
-int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry)
-{
-	aufs_bindex_t bindex, bend;
-
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++)
-		if (au_h_dptr(dentry, bindex) == h_dentry)
-			return bindex;
-	return -1;
-}
diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c
deleted file mode 100644
index 7705c2997..000000000
--- a/fs/aufs/dir.c
+++ /dev/null
@@ -1,753 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * directory operations
- */
-
-#include <linux/fs_stack.h>
-#include "aufs.h"
-
-void au_add_nlink(struct inode *dir, struct inode *h_dir)
-{
-	unsigned int nlink;
-
-	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
-
-	nlink = dir->i_nlink;
-	nlink += h_dir->i_nlink - 2;
-	if (h_dir->i_nlink < 2)
-		nlink += 2;
-	smp_mb(); /* for i_nlink */
-	/* 0 can happen in revaliding */
-	set_nlink(dir, nlink);
-}
-
-void au_sub_nlink(struct inode *dir, struct inode *h_dir)
-{
-	unsigned int nlink;
-
-	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
-
-	nlink = dir->i_nlink;
-	nlink -= h_dir->i_nlink - 2;
-	if (h_dir->i_nlink < 2)
-		nlink -= 2;
-	smp_mb(); /* for i_nlink */
-	/* nlink == 0 means the branch-fs is broken */
-	set_nlink(dir, nlink);
-}
-
-loff_t au_dir_size(struct file *file, struct dentry *dentry)
-{
-	loff_t sz;
-	aufs_bindex_t bindex, bend;
-	struct file *h_file;
-	struct dentry *h_dentry;
-
-	sz = 0;
-	if (file) {
-		AuDebugOn(!d_is_dir(file->f_path.dentry));
-
-		bend = au_fbend_dir(file);
-		for (bindex = au_fbstart(file);
-		     bindex <= bend && sz < KMALLOC_MAX_SIZE;
-		     bindex++) {
-			h_file = au_hf_dir(file, bindex);
-			if (h_file && file_inode(h_file))
-				sz += vfsub_f_size_read(h_file);
-		}
-	} else {
-		AuDebugOn(!dentry);
-		AuDebugOn(!d_is_dir(dentry));
-
-		bend = au_dbtaildir(dentry);
-		for (bindex = au_dbstart(dentry);
-		     bindex <= bend && sz < KMALLOC_MAX_SIZE;
-		     bindex++) {
-			h_dentry = au_h_dptr(dentry, bindex);
-			if (h_dentry && d_is_positive(h_dentry))
-				sz += i_size_read(d_inode(h_dentry));
-		}
-	}
-	if (sz < KMALLOC_MAX_SIZE)
-		sz = roundup_pow_of_two(sz);
-	if (sz > KMALLOC_MAX_SIZE)
-		sz = KMALLOC_MAX_SIZE;
-	else if (sz < NAME_MAX) {
-		BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX);
-		sz = AUFS_RDBLK_DEF;
-	}
-	return sz;
-}
-
-struct au_dir_ts_arg {
-	struct dentry *dentry;
-	aufs_bindex_t brid;
-};
-
-static void au_do_dir_ts(void *arg)
-{
-	struct au_dir_ts_arg *a = arg;
-	struct au_dtime dt;
-	struct path h_path;
-	struct inode *dir, *h_dir;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_hinode *hdir;
-	int err;
-	aufs_bindex_t bstart, bindex;
-
-	sb = a->dentry->d_sb;
-	if (d_really_is_negative(a->dentry))
-		goto out;
-	aufs_read_lock(a->dentry, AuLock_DW | AuLock_DIR); /* noflush */
-
-	/* no dir->i_mutex lock */
-	dir = d_inode(a->dentry);
-	bstart = au_ibstart(dir);
-	bindex = au_br_index(sb, a->brid);
-	if (bindex < bstart)
-		goto out_unlock;
-
-	br = au_sbr(sb, bindex);
-	h_path.dentry = au_h_dptr(a->dentry, bindex);
-	if (!h_path.dentry)
-		goto out_unlock;
-	h_path.mnt = au_br_mnt(br);
-	au_dtime_store(&dt, a->dentry, &h_path);
-
-	br = au_sbr(sb, bstart);
-	if (!au_br_writable(br->br_perm))
-		goto out_unlock;
-	h_path.dentry = au_h_dptr(a->dentry, bstart);
-	h_path.mnt = au_br_mnt(br);
-	err = vfsub_mnt_want_write(h_path.mnt);
-	if (err)
-		goto out_unlock;
-	hdir = au_hi(dir, bstart);
-	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	h_dir = au_h_iptr(dir, bstart);
-	if (h_dir->i_nlink
-	    && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) {
-		dt.dt_h_path = h_path;
-		au_dtime_revert(&dt);
-	}
-	au_hn_imtx_unlock(hdir);
-	vfsub_mnt_drop_write(h_path.mnt);
-	au_cpup_attr_timesizes(dir);
-
-out_unlock:
-	aufs_read_unlock(a->dentry, AuLock_DW);
-out:
-	dput(a->dentry);
-	au_nwt_done(&au_sbi(sb)->si_nowait);
-	kfree(arg);
-}
-
-void au_dir_ts(struct inode *dir, aufs_bindex_t bindex)
-{
-	int perm, wkq_err;
-	aufs_bindex_t bstart;
-	struct au_dir_ts_arg *arg;
-	struct dentry *dentry;
-	struct super_block *sb;
-
-	IMustLock(dir);
-
-	dentry = d_find_any_alias(dir);
-	AuDebugOn(!dentry);
-	sb = dentry->d_sb;
-	bstart = au_ibstart(dir);
-	if (bstart == bindex) {
-		au_cpup_attr_timesizes(dir);
-		goto out;
-	}
-
-	perm = au_sbr_perm(sb, bstart);
-	if (!au_br_writable(perm))
-		goto out;
-
-	arg = kmalloc(sizeof(*arg), GFP_NOFS);
-	if (!arg)
-		goto out;
-
-	arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */
-	arg->brid = au_sbr_id(sb, bindex);
-	wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0);
-	if (unlikely(wkq_err)) {
-		pr_err("wkq %d\n", wkq_err);
-		dput(dentry);
-		kfree(arg);
-	}
-
-out:
-	dput(dentry);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int reopen_dir(struct file *file)
-{
-	int err;
-	unsigned int flags;
-	aufs_bindex_t bindex, btail, bstart;
-	struct dentry *dentry, *h_dentry;
-	struct file *h_file;
-
-	/* open all lower dirs */
-	dentry = file->f_path.dentry;
-	bstart = au_dbstart(dentry);
-	for (bindex = au_fbstart(file); bindex < bstart; bindex++)
-		au_set_h_fptr(file, bindex, NULL);
-	au_set_fbstart(file, bstart);
-
-	btail = au_dbtaildir(dentry);
-	for (bindex = au_fbend_dir(file); btail < bindex; bindex--)
-		au_set_h_fptr(file, bindex, NULL);
-	au_set_fbend_dir(file, btail);
-
-	flags = vfsub_file_flags(file);
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		h_file = au_hf_dir(file, bindex);
-		if (h_file)
-			continue;
-
-		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
-		err = PTR_ERR(h_file);
-		if (IS_ERR(h_file))
-			goto out; /* close all? */
-		au_set_h_fptr(file, bindex, h_file);
-	}
-	au_update_figen(file);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	err = 0;
-
-out:
-	return err;
-}
-
-static int do_open_dir(struct file *file, int flags, struct file *h_file)
-{
-	int err;
-	aufs_bindex_t bindex, btail;
-	struct dentry *dentry, *h_dentry;
-
-	FiMustWriteLock(file);
-	AuDebugOn(h_file);
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	file->f_version = d_inode(dentry)->i_version;
-	bindex = au_dbstart(dentry);
-	au_set_fbstart(file, bindex);
-	btail = au_dbtaildir(dentry);
-	au_set_fbend_dir(file, btail);
-	for (; !err && bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-
-		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
-		if (IS_ERR(h_file)) {
-			err = PTR_ERR(h_file);
-			break;
-		}
-		au_set_h_fptr(file, bindex, h_file);
-	}
-	au_update_figen(file);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	if (!err)
-		return 0; /* success */
-
-	/* close all */
-	for (bindex = au_fbstart(file); bindex <= btail; bindex++)
-		au_set_h_fptr(file, bindex, NULL);
-	au_set_fbstart(file, -1);
-	au_set_fbend_dir(file, -1);
-
-	return err;
-}
-
-static int aufs_open_dir(struct inode *inode __maybe_unused,
-			 struct file *file)
-{
-	int err;
-	struct super_block *sb;
-	struct au_fidir *fidir;
-
-	err = -ENOMEM;
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	fidir = au_fidir_alloc(sb);
-	if (fidir) {
-		struct au_do_open_args args = {
-			.open	= do_open_dir,
-			.fidir	= fidir
-		};
-		err = au_do_open(file, &args);
-		if (unlikely(err))
-			kfree(fidir);
-	}
-	si_read_unlock(sb);
-	return err;
-}
-
-static int aufs_release_dir(struct inode *inode __maybe_unused,
-			    struct file *file)
-{
-	struct au_vdir *vdir_cache;
-	struct au_finfo *finfo;
-	struct au_fidir *fidir;
-	aufs_bindex_t bindex, bend;
-
-	finfo = au_fi(file);
-	fidir = finfo->fi_hdir;
-	if (fidir) {
-		au_sphl_del(&finfo->fi_hlist,
-			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
-		vdir_cache = fidir->fd_vdir_cache; /* lock-free */
-		if (vdir_cache)
-			au_vdir_free(vdir_cache);
-
-		bindex = finfo->fi_btop;
-		if (bindex >= 0) {
-			/*
-			 * calls fput() instead of filp_close(),
-			 * since no dnotify or lock for the lower file.
-			 */
-			bend = fidir->fd_bbot;
-			for (; bindex <= bend; bindex++)
-				au_set_h_fptr(file, bindex, NULL);
-		}
-		kfree(fidir);
-		finfo->fi_hdir = NULL;
-	}
-	au_finfo_fin(file);
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_flush_dir(struct file *file, fl_owner_t id)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	struct file *h_file;
-
-	err = 0;
-	bend = au_fbend_dir(file);
-	for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) {
-		h_file = au_hf_dir(file, bindex);
-		if (h_file)
-			err = vfsub_flush(h_file, id);
-	}
-	return err;
-}
-
-static int aufs_flush_dir(struct file *file, fl_owner_t id)
-{
-	return au_do_flush(file, id, au_do_flush_dir);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct inode *inode;
-	struct super_block *sb;
-
-	err = 0;
-	sb = dentry->d_sb;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) {
-		struct path h_path;
-
-		if (au_test_ro(sb, bindex, inode))
-			continue;
-		h_path.dentry = au_h_dptr(dentry, bindex);
-		if (!h_path.dentry)
-			continue;
-
-		h_path.mnt = au_sbr_mnt(sb, bindex);
-		err = vfsub_fsync(NULL, &h_path, datasync);
-	}
-
-	return err;
-}
-
-static int au_do_fsync_dir(struct file *file, int datasync)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct file *h_file;
-	struct super_block *sb;
-	struct inode *inode;
-
-	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
-	if (unlikely(err))
-		goto out;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	bend = au_fbend_dir(file);
-	for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) {
-		h_file = au_hf_dir(file, bindex);
-		if (!h_file || au_test_ro(sb, bindex, inode))
-			continue;
-
-		err = vfsub_fsync(h_file, &h_file->f_path, datasync);
-	}
-
-out:
-	return err;
-}
-
-/*
- * @file may be NULL
- */
-static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end,
-			  int datasync)
-{
-	int err;
-	struct dentry *dentry;
-	struct inode *inode;
-	struct super_block *sb;
-	struct mutex *mtx;
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-	mtx = &inode->i_mutex;
-	mutex_lock(mtx);
-	sb = dentry->d_sb;
-	si_noflush_read_lock(sb);
-	if (file)
-		err = au_do_fsync_dir(file, datasync);
-	else {
-		di_write_lock_child(dentry);
-		err = au_do_fsync_dir_no_file(dentry, datasync);
-	}
-	au_cpup_attr_timesizes(inode);
-	di_write_unlock(dentry);
-	if (file)
-		fi_write_unlock(file);
-
-	si_read_unlock(sb);
-	mutex_unlock(mtx);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_iterate(struct file *file, struct dir_context *ctx)
-{
-	int err;
-	struct dentry *dentry;
-	struct inode *inode, *h_inode;
-	struct super_block *sb;
-
-	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
-
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-
-	sb = dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
-	if (unlikely(err))
-		goto out;
-	err = au_alive_dir(dentry);
-	if (!err)
-		err = au_vdir_init(file);
-	di_downgrade_lock(dentry, AuLock_IR);
-	if (unlikely(err))
-		goto out_unlock;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	if (!au_test_nfsd()) {
-		err = au_vdir_fill_de(file, ctx);
-		fsstack_copy_attr_atime(inode, h_inode);
-	} else {
-		/*
-		 * nfsd filldir may call lookup_one_len(), vfs_getattr(),
-		 * encode_fh() and others.
-		 */
-		atomic_inc(&h_inode->i_count);
-		di_read_unlock(dentry, AuLock_IR);
-		si_read_unlock(sb);
-		err = au_vdir_fill_de(file, ctx);
-		fsstack_copy_attr_atime(inode, h_inode);
-		fi_write_unlock(file);
-		iput(h_inode);
-
-		AuTraceErr(err);
-		return err;
-	}
-
-out_unlock:
-	di_read_unlock(dentry, AuLock_IR);
-	fi_write_unlock(file);
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define AuTestEmpty_WHONLY	1
-#define AuTestEmpty_CALLED	(1 << 1)
-#define AuTestEmpty_SHWH	(1 << 2)
-#define au_ftest_testempty(flags, name)	((flags) & AuTestEmpty_##name)
-#define au_fset_testempty(flags, name) \
-	do { (flags) |= AuTestEmpty_##name; } while (0)
-#define au_fclr_testempty(flags, name) \
-	do { (flags) &= ~AuTestEmpty_##name; } while (0)
-
-#ifndef CONFIG_AUFS_SHWH
-#undef AuTestEmpty_SHWH
-#define AuTestEmpty_SHWH	0
-#endif
-
-struct test_empty_arg {
-	struct dir_context ctx;
-	struct au_nhash *whlist;
-	unsigned int flags;
-	int err;
-	aufs_bindex_t bindex;
-};
-
-static int test_empty_cb(struct dir_context *ctx, const char *__name,
-			 int namelen, loff_t offset __maybe_unused, u64 ino,
-			 unsigned int d_type)
-{
-	struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg,
-						  ctx);
-	char *name = (void *)__name;
-
-	arg->err = 0;
-	au_fset_testempty(arg->flags, CALLED);
-	/* smp_mb(); */
-	if (name[0] == '.'
-	    && (namelen == 1 || (name[1] == '.' && namelen == 2)))
-		goto out; /* success */
-
-	if (namelen <= AUFS_WH_PFX_LEN
-	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
-		if (au_ftest_testempty(arg->flags, WHONLY)
-		    && !au_nhash_test_known_wh(arg->whlist, name, namelen))
-			arg->err = -ENOTEMPTY;
-		goto out;
-	}
-
-	name += AUFS_WH_PFX_LEN;
-	namelen -= AUFS_WH_PFX_LEN;
-	if (!au_nhash_test_known_wh(arg->whlist, name, namelen))
-		arg->err = au_nhash_append_wh
-			(arg->whlist, name, namelen, ino, d_type, arg->bindex,
-			 au_ftest_testempty(arg->flags, SHWH));
-
-out:
-	/* smp_mb(); */
-	AuTraceErr(arg->err);
-	return arg->err;
-}
-
-static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
-{
-	int err;
-	struct file *h_file;
-
-	h_file = au_h_open(dentry, arg->bindex,
-			   O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE,
-			   /*file*/NULL, /*force_wr*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = 0;
-	if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE)
-	    && !file_inode(h_file)->i_nlink)
-		goto out_put;
-
-	do {
-		arg->err = 0;
-		au_fclr_testempty(arg->flags, CALLED);
-		/* smp_mb(); */
-		err = vfsub_iterate_dir(h_file, &arg->ctx);
-		if (err >= 0)
-			err = arg->err;
-	} while (!err && au_ftest_testempty(arg->flags, CALLED));
-
-out_put:
-	fput(h_file);
-	au_sbr_put(dentry->d_sb, arg->bindex);
-out:
-	return err;
-}
-
-struct do_test_empty_args {
-	int *errp;
-	struct dentry *dentry;
-	struct test_empty_arg *arg;
-};
-
-static void call_do_test_empty(void *args)
-{
-	struct do_test_empty_args *a = args;
-	*a->errp = do_test_empty(a->dentry, a->arg);
-}
-
-static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
-{
-	int err, wkq_err;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-
-	h_dentry = au_h_dptr(dentry, arg->bindex);
-	h_inode = d_inode(h_dentry);
-	/* todo: i_mode changes anytime? */
-	mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-	err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ);
-	mutex_unlock(&h_inode->i_mutex);
-	if (!err)
-		err = do_test_empty(dentry, arg);
-	else {
-		struct do_test_empty_args args = {
-			.errp	= &err,
-			.dentry	= dentry,
-			.arg	= arg
-		};
-		unsigned int flags = arg->flags;
-
-		wkq_err = au_wkq_wait(call_do_test_empty, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-		arg->flags = flags;
-	}
-
-	return err;
-}
-
-int au_test_empty_lower(struct dentry *dentry)
-{
-	int err;
-	unsigned int rdhash;
-	aufs_bindex_t bindex, bstart, btail;
-	struct au_nhash whlist;
-	struct test_empty_arg arg = {
-		.ctx = {
-			.actor = test_empty_cb
-		}
-	};
-	int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg);
-
-	SiMustAnyLock(dentry->d_sb);
-
-	rdhash = au_sbi(dentry->d_sb)->si_rdhash;
-	if (!rdhash)
-		rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry));
-	err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-
-	arg.flags = 0;
-	arg.whlist = &whlist;
-	bstart = au_dbstart(dentry);
-	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
-		au_fset_testempty(arg.flags, SHWH);
-	test_empty = do_test_empty;
-	if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1))
-		test_empty = sio_test_empty;
-	arg.bindex = bstart;
-	err = test_empty(dentry, &arg);
-	if (unlikely(err))
-		goto out_whlist;
-
-	au_fset_testempty(arg.flags, WHONLY);
-	btail = au_dbtaildir(dentry);
-	for (bindex = bstart + 1; !err && bindex <= btail; bindex++) {
-		struct dentry *h_dentry;
-
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry && d_is_positive(h_dentry)) {
-			arg.bindex = bindex;
-			err = test_empty(dentry, &arg);
-		}
-	}
-
-out_whlist:
-	au_nhash_wh_free(&whlist);
-out:
-	return err;
-}
-
-int au_test_empty(struct dentry *dentry, struct au_nhash *whlist)
-{
-	int err;
-	struct test_empty_arg arg = {
-		.ctx = {
-			.actor = test_empty_cb
-		}
-	};
-	aufs_bindex_t bindex, btail;
-
-	err = 0;
-	arg.whlist = whlist;
-	arg.flags = AuTestEmpty_WHONLY;
-	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
-		au_fset_testempty(arg.flags, SHWH);
-	btail = au_dbtaildir(dentry);
-	for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) {
-		struct dentry *h_dentry;
-
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry && d_is_positive(h_dentry)) {
-			arg.bindex = bindex;
-			err = sio_test_empty(dentry, &arg);
-		}
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-const struct file_operations aufs_dir_fop = {
-	.owner		= THIS_MODULE,
-	.llseek		= default_llseek,
-	.read		= generic_read_dir,
-	.iterate	= aufs_iterate,
-	.unlocked_ioctl	= aufs_ioctl_dir,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= aufs_compat_ioctl_dir,
-#endif
-	.open		= aufs_open_dir,
-	.release	= aufs_release_dir,
-	.flush		= aufs_flush_dir,
-	.fsync		= aufs_fsync_dir
-};
diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h
deleted file mode 100644
index e4acf732c..000000000
--- a/fs/aufs/dir.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * directory operations
- */
-
-#ifndef __AUFS_DIR_H__
-#define __AUFS_DIR_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-
-/* ---------------------------------------------------------------------- */
-
-/* need to be faster and smaller */
-
-struct au_nhash {
-	unsigned int		nh_num;
-	struct hlist_head	*nh_head;
-};
-
-struct au_vdir_destr {
-	unsigned char	len;
-	unsigned char	name[0];
-} __packed;
-
-struct au_vdir_dehstr {
-	struct hlist_node	hash;
-	struct au_vdir_destr	*str;
-} ____cacheline_aligned_in_smp;
-
-struct au_vdir_de {
-	ino_t			de_ino;
-	unsigned char		de_type;
-	/* caution: packed */
-	struct au_vdir_destr	de_str;
-} __packed;
-
-struct au_vdir_wh {
-	struct hlist_node	wh_hash;
-#ifdef CONFIG_AUFS_SHWH
-	ino_t			wh_ino;
-	aufs_bindex_t		wh_bindex;
-	unsigned char		wh_type;
-#else
-	aufs_bindex_t		wh_bindex;
-#endif
-	/* caution: packed */
-	struct au_vdir_destr	wh_str;
-} __packed;
-
-union au_vdir_deblk_p {
-	unsigned char		*deblk;
-	struct au_vdir_de	*de;
-};
-
-struct au_vdir {
-	unsigned char	**vd_deblk;
-	unsigned long	vd_nblk;
-	struct {
-		unsigned long		ul;
-		union au_vdir_deblk_p	p;
-	} vd_last;
-
-	unsigned long	vd_version;
-	unsigned int	vd_deblk_sz;
-	unsigned long	vd_jiffy;
-} ____cacheline_aligned_in_smp;
-
-/* ---------------------------------------------------------------------- */
-
-/* dir.c */
-extern const struct file_operations aufs_dir_fop;
-void au_add_nlink(struct inode *dir, struct inode *h_dir);
-void au_sub_nlink(struct inode *dir, struct inode *h_dir);
-loff_t au_dir_size(struct file *file, struct dentry *dentry);
-void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc);
-int au_test_empty_lower(struct dentry *dentry);
-int au_test_empty(struct dentry *dentry, struct au_nhash *whlist);
-
-/* vdir.c */
-unsigned int au_rdhash_est(loff_t sz);
-int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp);
-void au_nhash_wh_free(struct au_nhash *whlist);
-int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
-			    int limit);
-int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen);
-int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
-		       unsigned int d_type, aufs_bindex_t bindex,
-		       unsigned char shwh);
-void au_vdir_free(struct au_vdir *vdir);
-int au_vdir_init(struct file *file);
-int au_vdir_fill_de(struct file *file, struct dir_context *ctx);
-
-/* ioctl.c */
-long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg);
-
-#ifdef CONFIG_AUFS_RDU
-/* rdu.c */
-long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long au_rdu_compat_ioctl(struct file *file, unsigned int cmd,
-			 unsigned long arg);
-#endif
-#else
-AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file,
-       unsigned int cmd, unsigned long arg)
-#ifdef CONFIG_COMPAT
-AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file,
-       unsigned int cmd, unsigned long arg)
-#endif
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DIR_H__ */
diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c
deleted file mode 100644
index a04d02f91..000000000
--- a/fs/aufs/dynop.c
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * Copyright (C) 2010-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * dynamically customizable operations for regular files
- */
-
-#include "aufs.h"
-
-#define DyPrSym(key)	AuDbgSym(key->dk_op.dy_hop)
-
-/*
- * How large will these lists be?
- * Usually just a few elements, 20-30 at most for each, I guess.
- */
-static struct au_splhead dynop[AuDyLast];
-
-static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op)
-{
-	struct au_dykey *key, *tmp;
-	struct list_head *head;
-
-	key = NULL;
-	head = &spl->head;
-	rcu_read_lock();
-	list_for_each_entry_rcu(tmp, head, dk_list)
-		if (tmp->dk_op.dy_hop == h_op) {
-			key = tmp;
-			kref_get(&key->dk_kref);
-			break;
-		}
-	rcu_read_unlock();
-
-	return key;
-}
-
-static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key)
-{
-	struct au_dykey **k, *found;
-	const void *h_op = key->dk_op.dy_hop;
-	int i;
-
-	found = NULL;
-	k = br->br_dykey;
-	for (i = 0; i < AuBrDynOp; i++)
-		if (k[i]) {
-			if (k[i]->dk_op.dy_hop == h_op) {
-				found = k[i];
-				break;
-			}
-		} else
-			break;
-	if (!found) {
-		spin_lock(&br->br_dykey_lock);
-		for (; i < AuBrDynOp; i++)
-			if (k[i]) {
-				if (k[i]->dk_op.dy_hop == h_op) {
-					found = k[i];
-					break;
-				}
-			} else {
-				k[i] = key;
-				break;
-			}
-		spin_unlock(&br->br_dykey_lock);
-		BUG_ON(i == AuBrDynOp); /* expand the array */
-	}
-
-	return found;
-}
-
-/* kref_get() if @key is already added */
-static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key)
-{
-	struct au_dykey *tmp, *found;
-	struct list_head *head;
-	const void *h_op = key->dk_op.dy_hop;
-
-	found = NULL;
-	head = &spl->head;
-	spin_lock(&spl->spin);
-	list_for_each_entry(tmp, head, dk_list)
-		if (tmp->dk_op.dy_hop == h_op) {
-			kref_get(&tmp->dk_kref);
-			found = tmp;
-			break;
-		}
-	if (!found)
-		list_add_rcu(&key->dk_list, head);
-	spin_unlock(&spl->spin);
-
-	if (!found)
-		DyPrSym(key);
-	return found;
-}
-
-static void dy_free_rcu(struct rcu_head *rcu)
-{
-	struct au_dykey *key;
-
-	key = container_of(rcu, struct au_dykey, dk_rcu);
-	DyPrSym(key);
-	kfree(key);
-}
-
-static void dy_free(struct kref *kref)
-{
-	struct au_dykey *key;
-	struct au_splhead *spl;
-
-	key = container_of(kref, struct au_dykey, dk_kref);
-	spl = dynop + key->dk_op.dy_type;
-	au_spl_del_rcu(&key->dk_list, spl);
-	call_rcu(&key->dk_rcu, dy_free_rcu);
-}
-
-void au_dy_put(struct au_dykey *key)
-{
-	kref_put(&key->dk_kref, dy_free);
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define DyDbgSize(cnt, op)	AuDebugOn(cnt != sizeof(op)/sizeof(void *))
-
-#ifdef CONFIG_AUFS_DEBUG
-#define DyDbgDeclare(cnt)	unsigned int cnt = 0
-#define DyDbgInc(cnt)		do { cnt++; } while (0)
-#else
-#define DyDbgDeclare(cnt)	do {} while (0)
-#define DyDbgInc(cnt)		do {} while (0)
-#endif
-
-#define DySet(func, dst, src, h_op, h_sb) do {				\
-	DyDbgInc(cnt);							\
-	if (h_op->func) {						\
-		if (src.func)						\
-			dst.func = src.func;				\
-		else							\
-			AuDbg("%s %s\n", au_sbtype(h_sb), #func);	\
-	}								\
-} while (0)
-
-#define DySetForce(func, dst, src) do {		\
-	AuDebugOn(!src.func);			\
-	DyDbgInc(cnt);				\
-	dst.func = src.func;			\
-} while (0)
-
-#define DySetAop(func) \
-	DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb)
-#define DySetAopForce(func) \
-	DySetForce(func, dyaop->da_op, aufs_aop)
-
-static void dy_aop(struct au_dykey *key, const void *h_op,
-		   struct super_block *h_sb __maybe_unused)
-{
-	struct au_dyaop *dyaop = (void *)key;
-	const struct address_space_operations *h_aop = h_op;
-	DyDbgDeclare(cnt);
-
-	AuDbg("%s\n", au_sbtype(h_sb));
-
-	DySetAop(writepage);
-	DySetAopForce(readpage);	/* force */
-	DySetAop(writepages);
-	DySetAop(set_page_dirty);
-	DySetAop(readpages);
-	DySetAop(write_begin);
-	DySetAop(write_end);
-	DySetAop(bmap);
-	DySetAop(invalidatepage);
-	DySetAop(releasepage);
-	DySetAop(freepage);
-	/* this one will be changed according to an aufs mount option */
-	DySetAop(direct_IO);
-	DySetAop(migratepage);
-	DySetAop(launder_page);
-	DySetAop(is_partially_uptodate);
-	DySetAop(is_dirty_writeback);
-	DySetAop(error_remove_page);
-	DySetAop(swap_activate);
-	DySetAop(swap_deactivate);
-
-	DyDbgSize(cnt, *h_aop);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void dy_bug(struct kref *kref)
-{
-	BUG();
-}
-
-static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br)
-{
-	struct au_dykey *key, *old;
-	struct au_splhead *spl;
-	struct op {
-		unsigned int sz;
-		void (*set)(struct au_dykey *key, const void *h_op,
-			    struct super_block *h_sb __maybe_unused);
-	};
-	static const struct op a[] = {
-		[AuDy_AOP] = {
-			.sz	= sizeof(struct au_dyaop),
-			.set	= dy_aop
-		}
-	};
-	const struct op *p;
-
-	spl = dynop + op->dy_type;
-	key = dy_gfind_get(spl, op->dy_hop);
-	if (key)
-		goto out_add; /* success */
-
-	p = a + op->dy_type;
-	key = kzalloc(p->sz, GFP_NOFS);
-	if (unlikely(!key)) {
-		key = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	key->dk_op.dy_hop = op->dy_hop;
-	kref_init(&key->dk_kref);
-	p->set(key, op->dy_hop, au_br_sb(br));
-	old = dy_gadd(spl, key);
-	if (old) {
-		kfree(key);
-		key = old;
-	}
-
-out_add:
-	old = dy_bradd(br, key);
-	if (old)
-		/* its ref-count should never be zero here */
-		kref_put(&key->dk_kref, dy_bug);
-out:
-	return key;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * Aufs prohibits O_DIRECT by defaut even if the branch supports it.
- * This behaviour is necessary to return an error from open(O_DIRECT) instead
- * of the succeeding I/O. The dio mount option enables O_DIRECT and makes
- * open(O_DIRECT) always succeed, but the succeeding I/O may return an error.
- * See the aufs manual in detail.
- */
-static void dy_adx(struct au_dyaop *dyaop, int do_dx)
-{
-	if (!do_dx)
-		dyaop->da_op.direct_IO = NULL;
-	else
-		dyaop->da_op.direct_IO = aufs_aop.direct_IO;
-}
-
-static struct au_dyaop *dy_aget(struct au_branch *br,
-				const struct address_space_operations *h_aop,
-				int do_dx)
-{
-	struct au_dyaop *dyaop;
-	struct au_dynop op;
-
-	op.dy_type = AuDy_AOP;
-	op.dy_haop = h_aop;
-	dyaop = (void *)dy_get(&op, br);
-	if (IS_ERR(dyaop))
-		goto out;
-	dy_adx(dyaop, do_dx);
-
-out:
-	return dyaop;
-}
-
-int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
-		struct inode *h_inode)
-{
-	int err, do_dx;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_dyaop *dyaop;
-
-	AuDebugOn(!S_ISREG(h_inode->i_mode));
-	IiMustWriteLock(inode);
-
-	sb = inode->i_sb;
-	br = au_sbr(sb, bindex);
-	do_dx = !!au_opt_test(au_mntflags(sb), DIO);
-	dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx);
-	err = PTR_ERR(dyaop);
-	if (IS_ERR(dyaop))
-		/* unnecessary to call dy_fput() */
-		goto out;
-
-	err = 0;
-	inode->i_mapping->a_ops = &dyaop->da_op;
-
-out:
-	return err;
-}
-
-/*
- * Is it safe to replace a_ops during the inode/file is in operation?
- * Yes, I hope so.
- */
-int au_dy_irefresh(struct inode *inode)
-{
-	int err;
-	aufs_bindex_t bstart;
-	struct inode *h_inode;
-
-	err = 0;
-	if (S_ISREG(inode->i_mode)) {
-		bstart = au_ibstart(inode);
-		h_inode = au_h_iptr(inode, bstart);
-		err = au_dy_iaop(inode, bstart, h_inode);
-	}
-	return err;
-}
-
-void au_dy_arefresh(int do_dx)
-{
-	struct au_splhead *spl;
-	struct list_head *head;
-	struct au_dykey *key;
-
-	spl = dynop + AuDy_AOP;
-	head = &spl->head;
-	spin_lock(&spl->spin);
-	list_for_each_entry(key, head, dk_list)
-		dy_adx((void *)key, do_dx);
-	spin_unlock(&spl->spin);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void __init au_dy_init(void)
-{
-	int i;
-
-	/* make sure that 'struct au_dykey *' can be any type */
-	BUILD_BUG_ON(offsetof(struct au_dyaop, da_key));
-
-	for (i = 0; i < AuDyLast; i++)
-		au_spl_init(dynop + i);
-}
-
-void au_dy_fin(void)
-{
-	int i;
-
-	for (i = 0; i < AuDyLast; i++)
-		WARN_ON(!list_empty(&dynop[i].head));
-}
diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h
deleted file mode 100644
index 724dddab1..000000000
--- a/fs/aufs/dynop.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (C) 2010-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * dynamically customizable operations (for regular files only)
- */
-
-#ifndef __AUFS_DYNOP_H__
-#define __AUFS_DYNOP_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/kref.h>
-
-enum {AuDy_AOP, AuDyLast};
-
-struct au_dynop {
-	int						dy_type;
-	union {
-		const void				*dy_hop;
-		const struct address_space_operations	*dy_haop;
-	};
-};
-
-struct au_dykey {
-	union {
-		struct list_head	dk_list;
-		struct rcu_head		dk_rcu;
-	};
-	struct au_dynop		dk_op;
-
-	/*
-	 * during I am in the branch local array, kref is gotten. when the
-	 * branch is removed, kref is put.
-	 */
-	struct kref		dk_kref;
-};
-
-/* stop unioning since their sizes are very different from each other */
-struct au_dyaop {
-	struct au_dykey			da_key;
-	struct address_space_operations	da_op; /* not const */
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* dynop.c */
-struct au_branch;
-void au_dy_put(struct au_dykey *key);
-int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
-		struct inode *h_inode);
-int au_dy_irefresh(struct inode *inode);
-void au_dy_arefresh(int do_dio);
-
-void __init au_dy_init(void);
-void au_dy_fin(void);
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DYNOP_H__ */
diff --git a/fs/aufs/export.c b/fs/aufs/export.c
deleted file mode 100644
index c85b036b8..000000000
--- a/fs/aufs/export.c
+++ /dev/null
@@ -1,832 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * export via nfs
- */
-
-#include <linux/exportfs.h>
-#include <linux/fs_struct.h>
-#include <linux/namei.h>
-#include <linux/nsproxy.h>
-#include <linux/random.h>
-#include <linux/writeback.h>
-#include "../fs/mount.h"
-#include "aufs.h"
-
-union conv {
-#ifdef CONFIG_AUFS_INO_T_64
-	__u32 a[2];
-#else
-	__u32 a[1];
-#endif
-	ino_t ino;
-};
-
-static ino_t decode_ino(__u32 *a)
-{
-	union conv u;
-
-	BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a));
-	u.a[0] = a[0];
-#ifdef CONFIG_AUFS_INO_T_64
-	u.a[1] = a[1];
-#endif
-	return u.ino;
-}
-
-static void encode_ino(__u32 *a, ino_t ino)
-{
-	union conv u;
-
-	u.ino = ino;
-	a[0] = u.a[0];
-#ifdef CONFIG_AUFS_INO_T_64
-	a[1] = u.a[1];
-#endif
-}
-
-/* NFS file handle */
-enum {
-	Fh_br_id,
-	Fh_sigen,
-#ifdef CONFIG_AUFS_INO_T_64
-	/* support 64bit inode number */
-	Fh_ino1,
-	Fh_ino2,
-	Fh_dir_ino1,
-	Fh_dir_ino2,
-#else
-	Fh_ino1,
-	Fh_dir_ino1,
-#endif
-	Fh_igen,
-	Fh_h_type,
-	Fh_tail,
-
-	Fh_ino = Fh_ino1,
-	Fh_dir_ino = Fh_dir_ino1
-};
-
-static int au_test_anon(struct dentry *dentry)
-{
-	/* note: read d_flags without d_lock */
-	return !!(dentry->d_flags & DCACHE_DISCONNECTED);
-}
-
-int au_test_nfsd(void)
-{
-	int ret;
-	struct task_struct *tsk = current;
-	char comm[sizeof(tsk->comm)];
-
-	ret = 0;
-	if (tsk->flags & PF_KTHREAD) {
-		get_task_comm(comm, tsk);
-		ret = !strcmp(comm, "nfsd");
-	}
-
-	return ret;
-}
-
-/* ---------------------------------------------------------------------- */
-/* inode generation external table */
-
-void au_xigen_inc(struct inode *inode)
-{
-	loff_t pos;
-	ssize_t sz;
-	__u32 igen;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-	sb = inode->i_sb;
-	AuDebugOn(!au_opt_test(au_mntflags(sb), XINO));
-
-	sbinfo = au_sbi(sb);
-	pos = inode->i_ino;
-	pos *= sizeof(igen);
-	igen = inode->i_generation + 1;
-	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen,
-			 sizeof(igen), &pos);
-	if (sz == sizeof(igen))
-		return; /* success */
-
-	if (unlikely(sz >= 0))
-		AuIOErr("xigen error (%zd)\n", sz);
-}
-
-int au_xigen_new(struct inode *inode)
-{
-	int err;
-	loff_t pos;
-	ssize_t sz;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-
-	err = 0;
-	/* todo: dirty, at mount time */
-	if (inode->i_ino == AUFS_ROOT_INO)
-		goto out;
-	sb = inode->i_sb;
-	SiMustAnyLock(sb);
-	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
-		goto out;
-
-	err = -EFBIG;
-	pos = inode->i_ino;
-	if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) {
-		AuIOErr1("too large i%lld\n", pos);
-		goto out;
-	}
-	pos *= sizeof(inode->i_generation);
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	file = sbinfo->si_xigen;
-	BUG_ON(!file);
-
-	if (vfsub_f_size_read(file)
-	    < pos + sizeof(inode->i_generation)) {
-		inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next);
-		sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation,
-				 sizeof(inode->i_generation), &pos);
-	} else
-		sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation,
-				sizeof(inode->i_generation), &pos);
-	if (sz == sizeof(inode->i_generation))
-		goto out; /* success */
-
-	err = sz;
-	if (unlikely(sz >= 0)) {
-		err = -EIO;
-		AuIOErr("xigen error (%zd)\n", sz);
-	}
-
-out:
-	return err;
-}
-
-int au_xigen_set(struct super_block *sb, struct file *base)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	file = au_xino_create2(base, sbinfo->si_xigen);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-	err = 0;
-	if (sbinfo->si_xigen)
-		fput(sbinfo->si_xigen);
-	sbinfo->si_xigen = file;
-
-out:
-	return err;
-}
-
-void au_xigen_clr(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	if (sbinfo->si_xigen) {
-		fput(sbinfo->si_xigen);
-		sbinfo->si_xigen = NULL;
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino,
-				    ino_t dir_ino)
-{
-	struct dentry *dentry, *d;
-	struct inode *inode;
-	unsigned int sigen;
-
-	dentry = NULL;
-	inode = ilookup(sb, ino);
-	if (!inode)
-		goto out;
-
-	dentry = ERR_PTR(-ESTALE);
-	sigen = au_sigen(sb);
-	if (unlikely(is_bad_inode(inode)
-		     || IS_DEADDIR(inode)
-		     || sigen != au_iigen(inode, NULL)))
-		goto out_iput;
-
-	dentry = NULL;
-	if (!dir_ino || S_ISDIR(inode->i_mode))
-		dentry = d_find_alias(inode);
-	else {
-		spin_lock(&inode->i_lock);
-		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
-			spin_lock(&d->d_lock);
-			if (!au_test_anon(d)
-			    && d_inode(d->d_parent)->i_ino == dir_ino) {
-				dentry = dget_dlock(d);
-				spin_unlock(&d->d_lock);
-				break;
-			}
-			spin_unlock(&d->d_lock);
-		}
-		spin_unlock(&inode->i_lock);
-	}
-	if (unlikely(dentry && au_digen_test(dentry, sigen))) {
-		/* need to refresh */
-		dput(dentry);
-		dentry = NULL;
-	}
-
-out_iput:
-	iput(inode);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: dirty? */
-/* if exportfs_decode_fh() passed vfsmount*, we could be happy */
-
-struct au_compare_mnt_args {
-	/* input */
-	struct super_block *sb;
-
-	/* output */
-	struct vfsmount *mnt;
-};
-
-static int au_compare_mnt(struct vfsmount *mnt, void *arg)
-{
-	struct au_compare_mnt_args *a = arg;
-
-	if (mnt->mnt_sb != a->sb)
-		return 0;
-	a->mnt = mntget(mnt);
-	return 1;
-}
-
-static struct vfsmount *au_mnt_get(struct super_block *sb)
-{
-	int err;
-	struct path root;
-	struct au_compare_mnt_args args = {
-		.sb = sb
-	};
-
-	get_fs_root(current->fs, &root);
-	rcu_read_lock();
-	err = iterate_mounts(au_compare_mnt, &args, root.mnt);
-	rcu_read_unlock();
-	path_put(&root);
-	AuDebugOn(!err);
-	AuDebugOn(!args.mnt);
-	return args.mnt;
-}
-
-struct au_nfsd_si_lock {
-	unsigned int sigen;
-	aufs_bindex_t bindex, br_id;
-	unsigned char force_lock;
-};
-
-static int si_nfsd_read_lock(struct super_block *sb,
-			     struct au_nfsd_si_lock *nsi_lock)
-{
-	int err;
-	aufs_bindex_t bindex;
-
-	si_read_lock(sb, AuLock_FLUSH);
-
-	/* branch id may be wrapped around */
-	err = 0;
-	bindex = au_br_index(sb, nsi_lock->br_id);
-	if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb))
-		goto out; /* success */
-
-	err = -ESTALE;
-	bindex = -1;
-	if (!nsi_lock->force_lock)
-		si_read_unlock(sb);
-
-out:
-	nsi_lock->bindex = bindex;
-	return err;
-}
-
-struct find_name_by_ino {
-	struct dir_context ctx;
-	int called, found;
-	ino_t ino;
-	char *name;
-	int namelen;
-};
-
-static int
-find_name_by_ino(struct dir_context *ctx, const char *name, int namelen,
-		 loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino,
-						  ctx);
-
-	a->called++;
-	if (a->ino != ino)
-		return 0;
-
-	memcpy(a->name, name, namelen);
-	a->namelen = namelen;
-	a->found = 1;
-	return 1;
-}
-
-static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino,
-				     struct au_nfsd_si_lock *nsi_lock)
-{
-	struct dentry *dentry, *parent;
-	struct file *file;
-	struct inode *dir;
-	struct find_name_by_ino arg = {
-		.ctx = {
-			.actor = find_name_by_ino
-		}
-	};
-	int err;
-
-	parent = path->dentry;
-	if (nsi_lock)
-		si_read_unlock(parent->d_sb);
-	file = vfsub_dentry_open(path, au_dir_roflags);
-	dentry = (void *)file;
-	if (IS_ERR(file))
-		goto out;
-
-	dentry = ERR_PTR(-ENOMEM);
-	arg.name = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!arg.name))
-		goto out_file;
-	arg.ino = ino;
-	arg.found = 0;
-	do {
-		arg.called = 0;
-		/* smp_mb(); */
-		err = vfsub_iterate_dir(file, &arg.ctx);
-	} while (!err && !arg.found && arg.called);
-	dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_name;
-	/* instead of ENOENT */
-	dentry = ERR_PTR(-ESTALE);
-	if (!arg.found)
-		goto out_name;
-
-	/* do not call vfsub_lkup_one() */
-	dir = d_inode(parent);
-	mutex_lock(&dir->i_mutex);
-	dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen);
-	mutex_unlock(&dir->i_mutex);
-	AuTraceErrPtr(dentry);
-	if (IS_ERR(dentry))
-		goto out_name;
-	AuDebugOn(au_test_anon(dentry));
-	if (unlikely(d_really_is_negative(dentry))) {
-		dput(dentry);
-		dentry = ERR_PTR(-ENOENT);
-	}
-
-out_name:
-	free_page((unsigned long)arg.name);
-out_file:
-	fput(file);
-out:
-	if (unlikely(nsi_lock
-		     && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0))
-		if (!IS_ERR(dentry)) {
-			dput(dentry);
-			dentry = ERR_PTR(-ESTALE);
-		}
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino,
-					ino_t dir_ino,
-					struct au_nfsd_si_lock *nsi_lock)
-{
-	struct dentry *dentry;
-	struct path path;
-
-	if (dir_ino != AUFS_ROOT_INO) {
-		path.dentry = decode_by_ino(sb, dir_ino, 0);
-		dentry = path.dentry;
-		if (!path.dentry || IS_ERR(path.dentry))
-			goto out;
-		AuDebugOn(au_test_anon(path.dentry));
-	} else
-		path.dentry = dget(sb->s_root);
-
-	path.mnt = au_mnt_get(sb);
-	dentry = au_lkup_by_ino(&path, ino, nsi_lock);
-	path_put(&path);
-
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int h_acceptable(void *expv, struct dentry *dentry)
-{
-	return 1;
-}
-
-static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath,
-			   char *buf, int len, struct super_block *sb)
-{
-	char *p;
-	int n;
-	struct path path;
-
-	p = d_path(h_rootpath, buf, len);
-	if (IS_ERR(p))
-		goto out;
-	n = strlen(p);
-
-	path.mnt = h_rootpath->mnt;
-	path.dentry = h_parent;
-	p = d_path(&path, buf, len);
-	if (IS_ERR(p))
-		goto out;
-	if (n != 1)
-		p += n;
-
-	path.mnt = au_mnt_get(sb);
-	path.dentry = sb->s_root;
-	p = d_path(&path, buf, len - strlen(p));
-	mntput(path.mnt);
-	if (IS_ERR(p))
-		goto out;
-	if (n != 1)
-		p[strlen(p)] = '/';
-
-out:
-	AuTraceErrPtr(p);
-	return p;
-}
-
-static
-struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh,
-			      int fh_len, struct au_nfsd_si_lock *nsi_lock)
-{
-	struct dentry *dentry, *h_parent, *root;
-	struct super_block *h_sb;
-	char *pathname, *p;
-	struct vfsmount *h_mnt;
-	struct au_branch *br;
-	int err;
-	struct path path;
-
-	br = au_sbr(sb, nsi_lock->bindex);
-	h_mnt = au_br_mnt(br);
-	h_sb = h_mnt->mnt_sb;
-	/* todo: call lower fh_to_dentry()? fh_to_parent()? */
-	h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail),
-				      fh_len - Fh_tail, fh[Fh_h_type],
-				      h_acceptable, /*context*/NULL);
-	dentry = h_parent;
-	if (unlikely(!h_parent || IS_ERR(h_parent))) {
-		AuWarn1("%s decode_fh failed, %ld\n",
-			au_sbtype(h_sb), PTR_ERR(h_parent));
-		goto out;
-	}
-	dentry = NULL;
-	if (unlikely(au_test_anon(h_parent))) {
-		AuWarn1("%s decode_fh returned a disconnected dentry\n",
-			au_sbtype(h_sb));
-		goto out_h_parent;
-	}
-
-	dentry = ERR_PTR(-ENOMEM);
-	pathname = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!pathname))
-		goto out_h_parent;
-
-	root = sb->s_root;
-	path.mnt = h_mnt;
-	di_read_lock_parent(root, !AuLock_IR);
-	path.dentry = au_h_dptr(root, nsi_lock->bindex);
-	di_read_unlock(root, !AuLock_IR);
-	p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb);
-	dentry = (void *)p;
-	if (IS_ERR(p))
-		goto out_pathname;
-
-	si_read_unlock(sb);
-	err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
-	dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_relock;
-
-	dentry = ERR_PTR(-ENOENT);
-	AuDebugOn(au_test_anon(path.dentry));
-	if (unlikely(d_really_is_negative(path.dentry)))
-		goto out_path;
-
-	if (ino != d_inode(path.dentry)->i_ino)
-		dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL);
-	else
-		dentry = dget(path.dentry);
-
-out_path:
-	path_put(&path);
-out_relock:
-	if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0))
-		if (!IS_ERR(dentry)) {
-			dput(dentry);
-			dentry = ERR_PTR(-ESTALE);
-		}
-out_pathname:
-	free_page((unsigned long)pathname);
-out_h_parent:
-	dput(h_parent);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *
-aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
-		  int fh_type)
-{
-	struct dentry *dentry;
-	__u32 *fh = fid->raw;
-	struct au_branch *br;
-	ino_t ino, dir_ino;
-	struct au_nfsd_si_lock nsi_lock = {
-		.force_lock	= 0
-	};
-
-	dentry = ERR_PTR(-ESTALE);
-	/* it should never happen, but the file handle is unreliable */
-	if (unlikely(fh_len < Fh_tail))
-		goto out;
-	nsi_lock.sigen = fh[Fh_sigen];
-	nsi_lock.br_id = fh[Fh_br_id];
-
-	/* branch id may be wrapped around */
-	br = NULL;
-	if (unlikely(si_nfsd_read_lock(sb, &nsi_lock)))
-		goto out;
-	nsi_lock.force_lock = 1;
-
-	/* is this inode still cached? */
-	ino = decode_ino(fh + Fh_ino);
-	/* it should never happen */
-	if (unlikely(ino == AUFS_ROOT_INO))
-		goto out;
-
-	dir_ino = decode_ino(fh + Fh_dir_ino);
-	dentry = decode_by_ino(sb, ino, dir_ino);
-	if (IS_ERR(dentry))
-		goto out_unlock;
-	if (dentry)
-		goto accept;
-
-	/* is the parent dir cached? */
-	br = au_sbr(sb, nsi_lock.bindex);
-	atomic_inc(&br->br_count);
-	dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock);
-	if (IS_ERR(dentry))
-		goto out_unlock;
-	if (dentry)
-		goto accept;
-
-	/* lookup path */
-	dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock);
-	if (IS_ERR(dentry))
-		goto out_unlock;
-	if (unlikely(!dentry))
-		/* todo?: make it ESTALE */
-		goto out_unlock;
-
-accept:
-	if (!au_digen_test(dentry, au_sigen(sb))
-	    && d_inode(dentry)->i_generation == fh[Fh_igen])
-		goto out_unlock; /* success */
-
-	dput(dentry);
-	dentry = ERR_PTR(-ESTALE);
-out_unlock:
-	if (br)
-		atomic_dec(&br->br_count);
-	si_read_unlock(sb);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-#if 0 /* reserved for future use */
-/* support subtreecheck option */
-static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid,
-					int fh_len, int fh_type)
-{
-	struct dentry *parent;
-	__u32 *fh = fid->raw;
-	ino_t dir_ino;
-
-	dir_ino = decode_ino(fh + Fh_dir_ino);
-	parent = decode_by_ino(sb, dir_ino, 0);
-	if (IS_ERR(parent))
-		goto out;
-	if (!parent)
-		parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]),
-					dir_ino, fh, fh_len);
-
-out:
-	AuTraceErrPtr(parent);
-	return parent;
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
-			  struct inode *dir)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct super_block *sb, *h_sb;
-	struct dentry *dentry, *parent, *h_parent;
-	struct inode *h_dir;
-	struct au_branch *br;
-
-	err = -ENOSPC;
-	if (unlikely(*max_len <= Fh_tail)) {
-		AuWarn1("NFSv2 client (max_len %d)?\n", *max_len);
-		goto out;
-	}
-
-	err = FILEID_ROOT;
-	if (inode->i_ino == AUFS_ROOT_INO) {
-		AuDebugOn(inode->i_ino != AUFS_ROOT_INO);
-		goto out;
-	}
-
-	h_parent = NULL;
-	sb = inode->i_sb;
-	err = si_read_lock(sb, AuLock_FLUSH);
-	if (unlikely(err))
-		goto out;
-
-#ifdef CONFIG_AUFS_DEBUG
-	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
-		AuWarn1("NFS-exporting requires xino\n");
-#endif
-	err = -EIO;
-	parent = NULL;
-	ii_read_lock_child(inode);
-	bindex = au_ibstart(inode);
-	if (!dir) {
-		dentry = d_find_any_alias(inode);
-		if (unlikely(!dentry))
-			goto out_unlock;
-		AuDebugOn(au_test_anon(dentry));
-		parent = dget_parent(dentry);
-		dput(dentry);
-		if (unlikely(!parent))
-			goto out_unlock;
-		if (d_really_is_positive(parent))
-			dir = d_inode(parent);
-	}
-
-	ii_read_lock_parent(dir);
-	h_dir = au_h_iptr(dir, bindex);
-	ii_read_unlock(dir);
-	if (unlikely(!h_dir))
-		goto out_parent;
-	h_parent = d_find_any_alias(h_dir);
-	if (unlikely(!h_parent))
-		goto out_hparent;
-
-	err = -EPERM;
-	br = au_sbr(sb, bindex);
-	h_sb = au_br_sb(br);
-	if (unlikely(!h_sb->s_export_op)) {
-		AuErr1("%s branch is not exportable\n", au_sbtype(h_sb));
-		goto out_hparent;
-	}
-
-	fh[Fh_br_id] = br->br_id;
-	fh[Fh_sigen] = au_sigen(sb);
-	encode_ino(fh + Fh_ino, inode->i_ino);
-	encode_ino(fh + Fh_dir_ino, dir->i_ino);
-	fh[Fh_igen] = inode->i_generation;
-
-	*max_len -= Fh_tail;
-	fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail),
-					   max_len,
-					   /*connectable or subtreecheck*/0);
-	err = fh[Fh_h_type];
-	*max_len += Fh_tail;
-	/* todo: macros? */
-	if (err != FILEID_INVALID)
-		err = 99;
-	else
-		AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb));
-
-out_hparent:
-	dput(h_parent);
-out_parent:
-	dput(parent);
-out_unlock:
-	ii_read_unlock(inode);
-	si_read_unlock(sb);
-out:
-	if (unlikely(err < 0))
-		err = FILEID_INVALID;
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_commit_metadata(struct inode *inode)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct super_block *sb;
-	struct inode *h_inode;
-	int (*f)(struct inode *inode);
-
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-	ii_write_lock_child(inode);
-	bindex = au_ibstart(inode);
-	AuDebugOn(bindex < 0);
-	h_inode = au_h_iptr(inode, bindex);
-
-	f = h_inode->i_sb->s_export_op->commit_metadata;
-	if (f)
-		err = f(h_inode);
-	else {
-		struct writeback_control wbc = {
-			.sync_mode	= WB_SYNC_ALL,
-			.nr_to_write	= 0 /* metadata only */
-		};
-
-		err = sync_inode(h_inode, &wbc);
-	}
-
-	au_cpup_attr_timesizes(inode);
-	ii_write_unlock(inode);
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct export_operations aufs_export_op = {
-	.fh_to_dentry		= aufs_fh_to_dentry,
-	/* .fh_to_parent	= aufs_fh_to_parent, */
-	.encode_fh		= aufs_encode_fh,
-	.commit_metadata	= aufs_commit_metadata
-};
-
-void au_export_init(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-	__u32 u;
-
-	sb->s_export_op = &aufs_export_op;
-	sbinfo = au_sbi(sb);
-	sbinfo->si_xigen = NULL;
-	get_random_bytes(&u, sizeof(u));
-	BUILD_BUG_ON(sizeof(u) != sizeof(int));
-	atomic_set(&sbinfo->si_xigen_next, u);
-}
diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c
deleted file mode 100644
index 34037c708..000000000
--- a/fs/aufs/f_op.c
+++ /dev/null
@@ -1,738 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * file and vm operations
- */
-
-#include <linux/aio.h>
-#include <linux/fs_stack.h>
-#include <linux/mman.h>
-#include <linux/security.h>
-#include "aufs.h"
-
-int au_do_open_nondir(struct file *file, int flags, struct file *h_file)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct dentry *dentry;
-	struct au_finfo *finfo;
-	struct inode *h_inode;
-
-	FiMustWriteLock(file);
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	AuDebugOn(IS_ERR_OR_NULL(dentry));
-	finfo = au_fi(file);
-	memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
-	atomic_set(&finfo->fi_mmapped, 0);
-	bindex = au_dbstart(dentry);
-	if (!h_file)
-		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
-	else
-		get_file(h_file);
-	if (IS_ERR(h_file))
-		err = PTR_ERR(h_file);
-	else {
-		if ((flags & __O_TMPFILE)
-		    && !(flags & O_EXCL)) {
-			h_inode = file_inode(h_file);
-			spin_lock(&h_inode->i_lock);
-			h_inode->i_state |= I_LINKABLE;
-			spin_unlock(&h_inode->i_lock);
-		}
-		au_set_fbstart(file, bindex);
-		au_set_h_fptr(file, bindex, h_file);
-		au_update_figen(file);
-		/* todo: necessary? */
-		/* file->f_ra = h_file->f_ra; */
-	}
-
-	return err;
-}
-
-static int aufs_open_nondir(struct inode *inode __maybe_unused,
-			    struct file *file)
-{
-	int err;
-	struct super_block *sb;
-	struct au_do_open_args args = {
-		.open	= au_do_open_nondir
-	};
-
-	AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
-	      file, vfsub_file_flags(file), file->f_mode);
-
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	err = au_do_open(file, &args);
-	si_read_unlock(sb);
-	return err;
-}
-
-int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
-{
-	struct au_finfo *finfo;
-	aufs_bindex_t bindex;
-
-	finfo = au_fi(file);
-	au_sphl_del(&finfo->fi_hlist,
-		    &au_sbi(file->f_path.dentry->d_sb)->si_files);
-	bindex = finfo->fi_btop;
-	if (bindex >= 0)
-		au_set_h_fptr(file, bindex, NULL);
-
-	au_finfo_fin(file);
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_flush_nondir(struct file *file, fl_owner_t id)
-{
-	int err;
-	struct file *h_file;
-
-	err = 0;
-	h_file = au_hf_top(file);
-	if (h_file)
-		err = vfsub_flush(h_file, id);
-	return err;
-}
-
-static int aufs_flush_nondir(struct file *file, fl_owner_t id)
-{
-	return au_do_flush(file, id, au_do_flush_nondir);
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * read and write functions acquire [fdi]_rwsem once, but release before
- * mmap_sem. This is because to stop a race condition between mmap(2).
- * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping
- * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in
- * read functions after [fdi]_rwsem are released, but it should be harmless.
- */
-
-/* Callers should call au_read_post() or fput() in the end */
-struct file *au_read_pre(struct file *file, int keep_fi)
-{
-	struct file *h_file;
-	int err;
-
-	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
-	if (!err) {
-		di_read_unlock(file->f_path.dentry, AuLock_IR);
-		h_file = au_hf_top(file);
-		get_file(h_file);
-		if (!keep_fi)
-			fi_read_unlock(file);
-	} else
-		h_file = ERR_PTR(err);
-
-	return h_file;
-}
-
-static void au_read_post(struct inode *inode, struct file *h_file)
-{
-	/* update without lock, I don't think it a problem */
-	fsstack_copy_attr_atime(inode, file_inode(h_file));
-	fput(h_file);
-}
-
-struct au_write_pre {
-	blkcnt_t blks;
-	aufs_bindex_t bstart;
-};
-
-/*
- * return with iinfo is write-locked
- * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
- * end
- */
-static struct file *au_write_pre(struct file *file, int do_ready,
-				 struct au_write_pre *wpre)
-{
-	struct file *h_file;
-	struct dentry *dentry;
-	int err;
-	struct au_pin pin;
-
-	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
-	h_file = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	dentry = file->f_path.dentry;
-	if (do_ready) {
-		err = au_ready_to_write(file, -1, &pin);
-		if (unlikely(err)) {
-			h_file = ERR_PTR(err);
-			di_write_unlock(dentry);
-			goto out_fi;
-		}
-	}
-
-	di_downgrade_lock(dentry, /*flags*/0);
-	if (wpre)
-		wpre->bstart = au_fbstart(file);
-	h_file = au_hf_top(file);
-	get_file(h_file);
-	if (wpre)
-		wpre->blks = file_inode(h_file)->i_blocks;
-	if (do_ready)
-		au_unpin(&pin);
-	di_read_unlock(dentry, /*flags*/0);
-
-out_fi:
-	fi_write_unlock(file);
-out:
-	return h_file;
-}
-
-static void au_write_post(struct inode *inode, struct file *h_file,
-			  struct au_write_pre *wpre, ssize_t written)
-{
-	struct inode *h_inode;
-
-	au_cpup_attr_timesizes(inode);
-	AuDebugOn(au_ibstart(inode) != wpre->bstart);
-	h_inode = file_inode(h_file);
-	inode->i_mode = h_inode->i_mode;
-	ii_write_unlock(inode);
-	fput(h_file);
-
-	/* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
-	if (written > 0)
-		au_fhsm_wrote(inode->i_sb, wpre->bstart,
-			      /*force*/h_inode->i_blocks > wpre->blks);
-}
-
-static ssize_t aufs_read(struct file *file, char __user *buf, size_t count,
-			 loff_t *ppos)
-{
-	ssize_t err;
-	struct inode *inode;
-	struct file *h_file;
-	struct super_block *sb;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	/* filedata may be obsoleted by concurrent copyup, but no problem */
-	err = vfsub_read_u(h_file, buf, count, ppos);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	au_read_post(inode, h_file);
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-/*
- * todo: very ugly
- * it locks both of i_mutex and si_rwsem for read in safe.
- * if the plink maintenance mode continues forever (that is the problem),
- * may loop forever.
- */
-static void au_mtx_and_read_lock(struct inode *inode)
-{
-	int err;
-	struct super_block *sb = inode->i_sb;
-
-	while (1) {
-		mutex_lock(&inode->i_mutex);
-		err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-		if (!err)
-			break;
-		mutex_unlock(&inode->i_mutex);
-		si_read_lock(sb, AuLock_NOPLMW);
-		si_read_unlock(sb);
-	}
-}
-
-static ssize_t aufs_write(struct file *file, const char __user *ubuf,
-			  size_t count, loff_t *ppos)
-{
-	ssize_t err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-	char __user *buf = (char __user *)ubuf;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = vfsub_write_u(h_file, buf, count, ppos);
-	au_write_post(inode, h_file, &wpre, err);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio,
-			  struct iov_iter *iov_iter)
-{
-	ssize_t err;
-	struct file *file;
-	ssize_t (*iter)(struct kiocb *, struct iov_iter *);
-
-	err = security_file_permission(h_file, rw);
-	if (unlikely(err))
-		goto out;
-
-	err = -ENOSYS;
-	iter = NULL;
-	if (rw == MAY_READ)
-		iter = h_file->f_op->read_iter;
-	else if (rw == MAY_WRITE)
-		iter = h_file->f_op->write_iter;
-
-	file = kio->ki_filp;
-	kio->ki_filp = h_file;
-	if (iter) {
-		lockdep_off();
-		err = iter(kio, iov_iter);
-		lockdep_on();
-	} else
-		/* currently there is no such fs */
-		WARN_ON_ONCE(1);
-	kio->ki_filp = file;
-
-out:
-	return err;
-}
-
-static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter)
-{
-	ssize_t err;
-	struct file *file, *h_file;
-	struct inode *inode;
-	struct super_block *sb;
-
-	file = kio->ki_filp;
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = au_do_iter(h_file, MAY_READ, kio, iov_iter);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	au_read_post(inode, h_file);
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter)
-{
-	ssize_t err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *file, *h_file;
-
-	file = kio->ki_filp;
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter);
-	au_write_post(inode, h_file, &wpre, err);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-static ssize_t aufs_splice_read(struct file *file, loff_t *ppos,
-				struct pipe_inode_info *pipe, size_t len,
-				unsigned int flags)
-{
-	ssize_t err;
-	struct file *h_file;
-	struct inode *inode;
-	struct super_block *sb;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/1);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	if (au_test_loopback_kthread()) {
-		au_warn_loopback(h_file->f_path.dentry->d_sb);
-		if (file->f_mapping != h_file->f_mapping) {
-			file->f_mapping = h_file->f_mapping;
-			smp_mb(); /* unnecessary? */
-		}
-	}
-	fi_read_unlock(file);
-
-	err = vfsub_splice_to(h_file, ppos, pipe, len, flags);
-	/* todo: necessasry? */
-	/* file->f_ra = h_file->f_ra; */
-	au_read_post(inode, h_file);
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-static ssize_t
-aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos,
-		  size_t len, unsigned int flags)
-{
-	ssize_t err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = vfsub_splice_from(pipe, h_file, ppos, len, flags);
-	au_write_post(inode, h_file, &wpre, err);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-static long aufs_fallocate(struct file *file, int mode, loff_t offset,
-			   loff_t len)
-{
-	long err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	lockdep_off();
-	err = vfs_fallocate(h_file, mode, offset, len);
-	lockdep_on();
-	au_write_post(inode, h_file, &wpre, /*written*/1);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * The locking order around current->mmap_sem.
- * - in most and regular cases
- *   file I/O syscall -- aufs_read() or something
- *	-- si_rwsem for read -- mmap_sem
- *	(Note that [fdi]i_rwsem are released before mmap_sem).
- * - in mmap case
- *   mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
- * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for
- * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in
- * file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
- * It means that when aufs acquires si_rwsem for write, the process should never
- * acquire mmap_sem.
- *
- * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
- * problem either since any directory is not able to be mmap-ed.
- * The similar scenario is applied to aufs_readlink() too.
- */
-
-#if 0 /* stop calling security_file_mmap() */
-/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
-#define AuConv_VM_PROT(f, b)	_calc_vm_trans(f, VM_##b, PROT_##b)
-
-static unsigned long au_arch_prot_conv(unsigned long flags)
-{
-	/* currently ppc64 only */
-#ifdef CONFIG_PPC64
-	/* cf. linux/arch/powerpc/include/asm/mman.h */
-	AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
-	return AuConv_VM_PROT(flags, SAO);
-#else
-	AuDebugOn(arch_calc_vm_prot_bits(-1));
-	return 0;
-#endif
-}
-
-static unsigned long au_prot_conv(unsigned long flags)
-{
-	return AuConv_VM_PROT(flags, READ)
-		| AuConv_VM_PROT(flags, WRITE)
-		| AuConv_VM_PROT(flags, EXEC)
-		| au_arch_prot_conv(flags);
-}
-
-/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
-#define AuConv_VM_MAP(f, b)	_calc_vm_trans(f, VM_##b, MAP_##b)
-
-static unsigned long au_flag_conv(unsigned long flags)
-{
-	return AuConv_VM_MAP(flags, GROWSDOWN)
-		| AuConv_VM_MAP(flags, DENYWRITE)
-		| AuConv_VM_MAP(flags, LOCKED);
-}
-#endif
-
-static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int err;
-	const unsigned char wlock
-		= (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
-	struct super_block *sb;
-	struct file *h_file;
-	struct inode *inode;
-
-	AuDbgVmRegion(file, vma);
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	lockdep_off();
-	si_read_lock(sb, AuLock_NOPLMW);
-
-	h_file = au_write_pre(file, wlock, /*wpre*/NULL);
-	lockdep_on();
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = 0;
-	au_set_mmapped(file);
-	au_vm_file_reset(vma, h_file);
-	/*
-	 * we cannot call security_mmap_file() here since it may acquire
-	 * mmap_sem or i_mutex.
-	 *
-	 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
-	 *			 au_flag_conv(vma->vm_flags));
-	 */
-	if (!err)
-		err = h_file->f_op->mmap(h_file, vma);
-	if (!err) {
-		au_vm_prfile_set(vma, file);
-		fsstack_copy_attr_atime(inode, file_inode(h_file));
-		goto out_fput; /* success */
-	}
-	au_unset_mmapped(file);
-	au_vm_file_reset(vma, file);
-
-out_fput:
-	lockdep_off();
-	ii_write_unlock(inode);
-	lockdep_on();
-	fput(h_file);
-out:
-	lockdep_off();
-	si_read_unlock(sb);
-	lockdep_on();
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end,
-			     int datasync)
-{
-	int err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-
-	err = 0; /* -EBADF; */ /* posix? */
-	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out_unlock;
-
-	err = vfsub_fsync(h_file, &h_file->f_path, datasync);
-	au_write_post(inode, h_file, &wpre, /*written*/0);
-
-out_unlock:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-out:
-	return err;
-}
-
-/* no one supports this operation, currently */
-#if 0
-static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync)
-{
-	int err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *file, *h_file;
-
-	err = 0; /* -EBADF; */ /* posix? */
-	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out;
-
-	file = kio->ki_filp;
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out_unlock;
-
-	err = -ENOSYS;
-	h_file = au_hf_top(file);
-	if (h_file->f_op->aio_fsync) {
-		struct mutex *h_mtx;
-
-		h_mtx = &file_inode(h_file)->i_mutex;
-		if (!is_sync_kiocb(kio)) {
-			get_file(h_file);
-			fput(file);
-		}
-		kio->ki_filp = h_file;
-		err = h_file->f_op->aio_fsync(kio, datasync);
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		if (!err)
-			vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL);
-		/*ignore*/
-		mutex_unlock(h_mtx);
-	}
-	au_write_post(inode, h_file, &wpre, /*written*/0);
-
-out_unlock:
-	si_read_unlock(inode->sb);
-	mutex_unlock(&inode->i_mutex);
-out:
-	return err;
-}
-#endif
-
-static int aufs_fasync(int fd, struct file *file, int flag)
-{
-	int err;
-	struct file *h_file;
-	struct super_block *sb;
-
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	if (h_file->f_op->fasync)
-		err = h_file->f_op->fasync(fd, h_file, flag);
-	fput(h_file); /* instead of au_read_post() */
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* no one supports this operation, currently */
-#if 0
-static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset,
-			     size_t len, loff_t *pos, int more)
-{
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-const struct file_operations aufs_file_fop = {
-	.owner		= THIS_MODULE,
-
-	.llseek		= default_llseek,
-
-	.read		= aufs_read,
-	.write		= aufs_write,
-	.read_iter	= aufs_read_iter,
-	.write_iter	= aufs_write_iter,
-
-#ifdef CONFIG_AUFS_POLL
-	.poll		= aufs_poll,
-#endif
-	.unlocked_ioctl	= aufs_ioctl_nondir,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= aufs_compat_ioctl_nondir,
-#endif
-	.mmap		= aufs_mmap,
-	.open		= aufs_open_nondir,
-	.flush		= aufs_flush_nondir,
-	.release	= aufs_release_nondir,
-	.fsync		= aufs_fsync_nondir,
-	/* .aio_fsync	= aufs_aio_fsync_nondir, */
-	.fasync		= aufs_fasync,
-	/* .sendpage	= aufs_sendpage, */
-	.splice_write	= aufs_splice_write,
-	.splice_read	= aufs_splice_read,
-#if 0
-	.aio_splice_write = aufs_aio_splice_write,
-	.aio_splice_read  = aufs_aio_splice_read,
-#endif
-	.fallocate	= aufs_fallocate
-};
diff --git a/fs/aufs/fhsm.c b/fs/aufs/fhsm.c
deleted file mode 100644
index 7ca4d1759..000000000
--- a/fs/aufs/fhsm.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * Copyright (C) 2011-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-/*
- * File-based Hierarchy Storage Management
- */
-
-#include <linux/anon_inodes.h>
-#include <linux/poll.h>
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include "aufs.h"
-
-static aufs_bindex_t au_fhsm_bottom(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	AuDebugOn(!fhsm);
-	return fhsm->fhsm_bottom;
-}
-
-void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	AuDebugOn(!fhsm);
-	fhsm->fhsm_bottom = bindex;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br)
-{
-	struct au_br_fhsm *bf;
-
-	bf = br->br_fhsm;
-	MtxMustLock(&bf->bf_lock);
-
-	return !bf->bf_readable
-		|| time_after(jiffies,
-			      bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_fhsm_notify(struct super_block *sb, int val)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	if (au_fhsm_pid(fhsm)
-	    && atomic_read(&fhsm->fhsm_readable) != -1) {
-		atomic_set(&fhsm->fhsm_readable, val);
-		if (val)
-			wake_up(&fhsm->fhsm_wqh);
-	}
-}
-
-static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex,
-			struct aufs_stfs *rstfs, int do_lock, int do_notify)
-{
-	int err;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	br = au_sbr(sb, bindex);
-	AuDebugOn(au_br_rdonly(br));
-	bf = br->br_fhsm;
-	AuDebugOn(!bf);
-
-	if (do_lock)
-		mutex_lock(&bf->bf_lock);
-	else
-		MtxMustLock(&bf->bf_lock);
-
-	/* sb->s_root for NFS is unreliable */
-	err = au_br_stfs(br, &bf->bf_stfs);
-	if (unlikely(err)) {
-		AuErr1("FHSM failed (%d), b%d, ignored.\n", bindex, err);
-		goto out;
-	}
-
-	bf->bf_jiffy = jiffies;
-	bf->bf_readable = 1;
-	if (do_notify)
-		au_fhsm_notify(sb, /*val*/1);
-	if (rstfs)
-		*rstfs = bf->bf_stfs;
-
-out:
-	if (do_lock)
-		mutex_unlock(&bf->bf_lock);
-	au_fhsm_notify(sb, /*val*/1);
-
-	return err;
-}
-
-void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	AuDbg("b%d, force %d\n", bindex, force);
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	if (!au_ftest_si(sbinfo, FHSM)
-	    || fhsm->fhsm_bottom == bindex)
-		return;
-
-	br = au_sbr(sb, bindex);
-	bf = br->br_fhsm;
-	AuDebugOn(!bf);
-	mutex_lock(&bf->bf_lock);
-	if (force
-	    || au_fhsm_pid(fhsm)
-	    || au_fhsm_test_jiffy(sbinfo, br))
-		err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0,
-				  /*do_notify*/1);
-	mutex_unlock(&bf->bf_lock);
-}
-
-void au_fhsm_wrote_all(struct super_block *sb, int force)
-{
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-
-	/* exclude the bottom */
-	bend = au_fhsm_bottom(sb);
-	for (bindex = 0; bindex < bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (au_br_fhsm(br->br_perm))
-			au_fhsm_wrote(sb, bindex, force);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-static unsigned int au_fhsm_poll(struct file *file,
-				 struct poll_table_struct *wait)
-{
-	unsigned int mask;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	mask = 0;
-	sbinfo = file->private_data;
-	fhsm = &sbinfo->si_fhsm;
-	poll_wait(file, &fhsm->fhsm_wqh, wait);
-	if (atomic_read(&fhsm->fhsm_readable))
-		mask = POLLIN /* | POLLRDNORM */;
-
-	AuTraceErr((int)mask);
-	return mask;
-}
-
-static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr,
-			      struct aufs_stfs *stfs, __s16 brid)
-{
-	int err;
-
-	err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs));
-	if (!err)
-		err = __put_user(brid, &stbr->brid);
-	if (unlikely(err))
-		err = -EFAULT;
-
-	return err;
-}
-
-static ssize_t au_fhsm_do_read(struct super_block *sb,
-			       struct aufs_stbr __user *stbr, size_t count)
-{
-	ssize_t err;
-	int nstbr;
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	/* except the bottom branch */
-	err = 0;
-	nstbr = 0;
-	bend = au_fhsm_bottom(sb);
-	for (bindex = 0; !err && bindex < bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (!au_br_fhsm(br->br_perm))
-			continue;
-
-		bf = br->br_fhsm;
-		mutex_lock(&bf->bf_lock);
-		if (bf->bf_readable) {
-			err = -EFAULT;
-			if (count >= sizeof(*stbr))
-				err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs,
-							  br->br_id);
-			if (!err) {
-				bf->bf_readable = 0;
-				count -= sizeof(*stbr);
-				nstbr++;
-			}
-		}
-		mutex_unlock(&bf->bf_lock);
-	}
-	if (!err)
-		err = sizeof(*stbr) * nstbr;
-
-	return err;
-}
-
-static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count,
-			   loff_t *pos)
-{
-	ssize_t err;
-	int readable;
-	aufs_bindex_t nfhsm, bindex, bend;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-	struct au_branch *br;
-	struct super_block *sb;
-
-	err = 0;
-	sbinfo = file->private_data;
-	fhsm = &sbinfo->si_fhsm;
-need_data:
-	spin_lock_irq(&fhsm->fhsm_wqh.lock);
-	if (!atomic_read(&fhsm->fhsm_readable)) {
-		if (vfsub_file_flags(file) & O_NONBLOCK)
-			err = -EAGAIN;
-		else
-			err = wait_event_interruptible_locked_irq
-				(fhsm->fhsm_wqh,
-				 atomic_read(&fhsm->fhsm_readable));
-	}
-	spin_unlock_irq(&fhsm->fhsm_wqh.lock);
-	if (unlikely(err))
-		goto out;
-
-	/* sb may already be dead */
-	au_rw_read_lock(&sbinfo->si_rwsem);
-	readable = atomic_read(&fhsm->fhsm_readable);
-	if (readable > 0) {
-		sb = sbinfo->si_sb;
-		AuDebugOn(!sb);
-		/* exclude the bottom branch */
-		nfhsm = 0;
-		bend = au_fhsm_bottom(sb);
-		for (bindex = 0; bindex < bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (au_br_fhsm(br->br_perm))
-				nfhsm++;
-		}
-		err = -EMSGSIZE;
-		if (nfhsm * sizeof(struct aufs_stbr) <= count) {
-			atomic_set(&fhsm->fhsm_readable, 0);
-			err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf,
-					     count);
-		}
-	}
-	au_rw_read_unlock(&sbinfo->si_rwsem);
-	if (!readable)
-		goto need_data;
-
-out:
-	return err;
-}
-
-static int au_fhsm_release(struct inode *inode, struct file *file)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	/* sb may already be dead */
-	sbinfo = file->private_data;
-	fhsm = &sbinfo->si_fhsm;
-	spin_lock(&fhsm->fhsm_spin);
-	fhsm->fhsm_pid = 0;
-	spin_unlock(&fhsm->fhsm_spin);
-	kobject_put(&sbinfo->si_kobj);
-
-	return 0;
-}
-
-static const struct file_operations au_fhsm_fops = {
-	.owner		= THIS_MODULE,
-	.llseek		= noop_llseek,
-	.read		= au_fhsm_read,
-	.poll		= au_fhsm_poll,
-	.release	= au_fhsm_release
-};
-
-int au_fhsm_fd(struct super_block *sb, int oflags)
-{
-	int err, fd;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	err = -EPERM;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = -EINVAL;
-	if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK)))
-		goto out;
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	spin_lock(&fhsm->fhsm_spin);
-	if (!fhsm->fhsm_pid)
-		fhsm->fhsm_pid = current->pid;
-	else
-		err = -EBUSY;
-	spin_unlock(&fhsm->fhsm_spin);
-	if (unlikely(err))
-		goto out;
-
-	oflags |= O_RDONLY;
-	/* oflags |= FMODE_NONOTIFY; */
-	fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags);
-	err = fd;
-	if (unlikely(fd < 0))
-		goto out_pid;
-
-	/* succeed reglardless 'fhsm' status */
-	kobject_get(&sbinfo->si_kobj);
-	si_noflush_read_lock(sb);
-	if (au_ftest_si(sbinfo, FHSM))
-		au_fhsm_wrote_all(sb, /*force*/0);
-	si_read_unlock(sb);
-	goto out; /* success */
-
-out_pid:
-	spin_lock(&fhsm->fhsm_spin);
-	fhsm->fhsm_pid = 0;
-	spin_unlock(&fhsm->fhsm_spin);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_fhsm_br_alloc(struct au_branch *br)
-{
-	int err;
-
-	err = 0;
-	br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS);
-	if (br->br_fhsm)
-		au_br_fhsm_init(br->br_fhsm);
-	else
-		err = -ENOMEM;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_fhsm_fin(struct super_block *sb)
-{
-	au_fhsm_notify(sb, /*val*/-1);
-}
-
-void au_fhsm_init(struct au_sbinfo *sbinfo)
-{
-	struct au_fhsm *fhsm;
-
-	fhsm = &sbinfo->si_fhsm;
-	spin_lock_init(&fhsm->fhsm_spin);
-	init_waitqueue_head(&fhsm->fhsm_wqh);
-	atomic_set(&fhsm->fhsm_readable, 0);
-	fhsm->fhsm_expire
-		= msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC);
-	fhsm->fhsm_bottom = -1;
-}
-
-void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec)
-{
-	sbinfo->si_fhsm.fhsm_expire
-		= msecs_to_jiffies(sec * MSEC_PER_SEC);
-}
-
-void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo)
-{
-	unsigned int u;
-
-	if (!au_ftest_si(sbinfo, FHSM))
-		return;
-
-	u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC;
-	if (u != AUFS_FHSM_CACHE_DEF_SEC)
-		seq_printf(seq, ",fhsm_sec=%u", u);
-}
diff --git a/fs/aufs/file.c b/fs/aufs/file.c
deleted file mode 100644
index d3a566e4f..000000000
--- a/fs/aufs/file.c
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * handling file/dir, and address_space operation
- */
-
-#ifdef CONFIG_AUFS_DEBUG
-#include <linux/migrate.h>
-#endif
-#include <linux/pagemap.h>
-#include "aufs.h"
-
-/* drop flags for writing */
-unsigned int au_file_roflags(unsigned int flags)
-{
-	flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC);
-	flags |= O_RDONLY | O_NOATIME;
-	return flags;
-}
-
-/* common functions to regular file and dir */
-struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
-		       struct file *file, int force_wr)
-{
-	struct file *h_file;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct path h_path;
-	int err;
-
-	/* a race condition can happen between open and unlink/rmdir */
-	h_file = ERR_PTR(-ENOENT);
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry)))
-		goto out;
-	h_inode = d_inode(h_dentry);
-	spin_lock(&h_dentry->d_lock);
-	err = (!d_unhashed(dentry) && d_unlinked(h_dentry))
-		/* || !d_inode(dentry)->i_nlink */
-		;
-	spin_unlock(&h_dentry->d_lock);
-	if (unlikely(err))
-		goto out;
-
-	sb = dentry->d_sb;
-	br = au_sbr(sb, bindex);
-	err = au_br_test_oflag(flags, br);
-	h_file = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	/* drop flags for writing */
-	if (au_test_ro(sb, bindex, d_inode(dentry))) {
-		if (force_wr && !(flags & O_WRONLY))
-			force_wr = 0;
-		flags = au_file_roflags(flags);
-		if (force_wr) {
-			h_file = ERR_PTR(-EROFS);
-			flags = au_file_roflags(flags);
-			if (unlikely(vfsub_native_ro(h_inode)
-				     || IS_APPEND(h_inode)))
-				goto out;
-			flags &= ~O_ACCMODE;
-			flags |= O_WRONLY;
-		}
-	}
-	flags &= ~O_CREAT;
-	atomic_inc(&br->br_count);
-	h_path.dentry = h_dentry;
-	h_path.mnt = au_br_mnt(br);
-	h_file = vfsub_dentry_open(&h_path, flags);
-	if (IS_ERR(h_file))
-		goto out_br;
-
-	if (flags & __FMODE_EXEC) {
-		err = deny_write_access(h_file);
-		if (unlikely(err)) {
-			fput(h_file);
-			h_file = ERR_PTR(err);
-			goto out_br;
-		}
-	}
-	fsnotify_open(h_file);
-	goto out; /* success */
-
-out_br:
-	atomic_dec(&br->br_count);
-out:
-	return h_file;
-}
-
-static int au_cmoo(struct dentry *dentry)
-{
-	int err, cmoo;
-	unsigned int udba;
-	struct path h_path;
-	struct au_pin pin;
-	struct au_cp_generic cpg = {
-		.dentry	= dentry,
-		.bdst	= -1,
-		.bsrc	= -1,
-		.len	= -1,
-		.pin	= &pin,
-		.flags	= AuCpup_DTIME | AuCpup_HOPEN
-	};
-	struct inode *delegated;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-	pid_t pid;
-	struct au_branch *br;
-	struct dentry *parent;
-	struct au_hinode *hdir;
-
-	DiMustWriteLock(dentry);
-	IiMustWriteLock(d_inode(dentry));
-
-	err = 0;
-	if (IS_ROOT(dentry))
-		goto out;
-	cpg.bsrc = au_dbstart(dentry);
-	if (!cpg.bsrc)
-		goto out;
-
-	sb = dentry->d_sb;
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	pid = au_fhsm_pid(fhsm);
-	if (pid
-	    && (current->pid == pid
-		|| current->real_parent->pid == pid))
-		goto out;
-
-	br = au_sbr(sb, cpg.bsrc);
-	cmoo = au_br_cmoo(br->br_perm);
-	if (!cmoo)
-		goto out;
-	if (!d_is_reg(dentry))
-		cmoo &= AuBrAttr_COO_ALL;
-	if (!cmoo)
-		goto out;
-
-	parent = dget_parent(dentry);
-	di_write_lock_parent(parent);
-	err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1);
-	cpg.bdst = err;
-	if (unlikely(err < 0)) {
-		err = 0;	/* there is no upper writable branch */
-		goto out_dgrade;
-	}
-	AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst);
-
-	/* do not respect the coo attrib for the target branch */
-	err = au_cpup_dirs(dentry, cpg.bdst);
-	if (unlikely(err))
-		goto out_dgrade;
-
-	di_downgrade_lock(parent, AuLock_IR);
-	udba = au_opt_udba(sb);
-	err = au_pin(&pin, dentry, cpg.bdst, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out_parent;
-
-	err = au_sio_cpup_simple(&cpg);
-	au_unpin(&pin);
-	if (unlikely(err))
-		goto out_parent;
-	if (!(cmoo & AuBrWAttr_MOO))
-		goto out_parent; /* success */
-
-	err = au_pin(&pin, dentry, cpg.bsrc, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out_parent;
-
-	h_path.mnt = au_br_mnt(br);
-	h_path.dentry = au_h_dptr(dentry, cpg.bsrc);
-	hdir = au_hi(d_inode(parent), cpg.bsrc);
-	delegated = NULL;
-	err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1);
-	au_unpin(&pin);
-	/* todo: keep h_dentry or not? */
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	if (unlikely(err)) {
-		pr_err("unlink %pd after coo failed (%d), ignored\n",
-		       dentry, err);
-		err = 0;
-	}
-	goto out_parent; /* success */
-
-out_dgrade:
-	di_downgrade_lock(parent, AuLock_IR);
-out_parent:
-	di_read_unlock(parent, AuLock_IR);
-	dput(parent);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_do_open(struct file *file, struct au_do_open_args *args)
-{
-	int err, no_lock = args->no_lock;
-	struct dentry *dentry;
-	struct au_finfo *finfo;
-
-	if (!no_lock)
-		err = au_finfo_init(file, args->fidir);
-	else {
-		lockdep_off();
-		err = au_finfo_init(file, args->fidir);
-		lockdep_on();
-	}
-	if (unlikely(err))
-		goto out;
-
-	dentry = file->f_path.dentry;
-	AuDebugOn(IS_ERR_OR_NULL(dentry));
-	if (!no_lock) {
-		di_write_lock_child(dentry);
-		err = au_cmoo(dentry);
-		di_downgrade_lock(dentry, AuLock_IR);
-		if (!err)
-			err = args->open(file, vfsub_file_flags(file), NULL);
-		di_read_unlock(dentry, AuLock_IR);
-	} else {
-		err = au_cmoo(dentry);
-		if (!err)
-			err = args->open(file, vfsub_file_flags(file),
-					 args->h_file);
-		if (!err && au_fbstart(file) != au_dbstart(dentry))
-			/*
-			 * cmoo happens after h_file was opened.
-			 * need to refresh file later.
-			 */
-			atomic_dec(&au_fi(file)->fi_generation);
-	}
-
-	finfo = au_fi(file);
-	if (!err) {
-		finfo->fi_file = file;
-		au_sphl_add(&finfo->fi_hlist,
-			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
-	}
-	if (!no_lock)
-		fi_write_unlock(file);
-	else {
-		lockdep_off();
-		fi_write_unlock(file);
-		lockdep_on();
-	}
-	if (unlikely(err)) {
-		finfo->fi_hdir = NULL;
-		au_finfo_fin(file);
-	}
-
-out:
-	return err;
-}
-
-int au_reopen_nondir(struct file *file)
-{
-	int err;
-	aufs_bindex_t bstart;
-	struct dentry *dentry;
-	struct file *h_file, *h_file_tmp;
-
-	dentry = file->f_path.dentry;
-	bstart = au_dbstart(dentry);
-	h_file_tmp = NULL;
-	if (au_fbstart(file) == bstart) {
-		h_file = au_hf_top(file);
-		if (file->f_mode == h_file->f_mode)
-			return 0; /* success */
-		h_file_tmp = h_file;
-		get_file(h_file_tmp);
-		au_set_h_fptr(file, bstart, NULL);
-	}
-	AuDebugOn(au_fi(file)->fi_hdir);
-	/*
-	 * it can happen
-	 * file exists on both of rw and ro
-	 * open --> dbstart and fbstart are both 0
-	 * prepend a branch as rw, "rw" become ro
-	 * remove rw/file
-	 * delete the top branch, "rw" becomes rw again
-	 *	--> dbstart is 1, fbstart is still 0
-	 * write --> fbstart is 0 but dbstart is 1
-	 */
-	/* AuDebugOn(au_fbstart(file) < bstart); */
-
-	h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC,
-			   file, /*force_wr*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file)) {
-		if (h_file_tmp) {
-			atomic_inc(&au_sbr(dentry->d_sb, bstart)->br_count);
-			au_set_h_fptr(file, bstart, h_file_tmp);
-			h_file_tmp = NULL;
-		}
-		goto out; /* todo: close all? */
-	}
-
-	err = 0;
-	au_set_fbstart(file, bstart);
-	au_set_h_fptr(file, bstart, h_file);
-	au_update_figen(file);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-
-out:
-	if (h_file_tmp)
-		fput(h_file_tmp);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_reopen_wh(struct file *file, aufs_bindex_t btgt,
-			struct dentry *hi_wh)
-{
-	int err;
-	aufs_bindex_t bstart;
-	struct au_dinfo *dinfo;
-	struct dentry *h_dentry;
-	struct au_hdentry *hdp;
-
-	dinfo = au_di(file->f_path.dentry);
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	bstart = dinfo->di_bstart;
-	dinfo->di_bstart = btgt;
-	hdp = dinfo->di_hdentry;
-	h_dentry = hdp[0 + btgt].hd_dentry;
-	hdp[0 + btgt].hd_dentry = hi_wh;
-	err = au_reopen_nondir(file);
-	hdp[0 + btgt].hd_dentry = h_dentry;
-	dinfo->di_bstart = bstart;
-
-	return err;
-}
-
-static int au_ready_to_write_wh(struct file *file, loff_t len,
-				aufs_bindex_t bcpup, struct au_pin *pin)
-{
-	int err;
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry, *hi_wh;
-	struct au_cp_generic cpg = {
-		.dentry	= file->f_path.dentry,
-		.bdst	= bcpup,
-		.bsrc	= -1,
-		.len	= len,
-		.pin	= pin
-	};
-
-	au_update_dbstart(cpg.dentry);
-	inode = d_inode(cpg.dentry);
-	h_inode = NULL;
-	if (au_dbstart(cpg.dentry) <= bcpup
-	    && au_dbend(cpg.dentry) >= bcpup) {
-		h_dentry = au_h_dptr(cpg.dentry, bcpup);
-		if (h_dentry && d_is_positive(h_dentry))
-			h_inode = d_inode(h_dentry);
-	}
-	hi_wh = au_hi_wh(inode, bcpup);
-	if (!hi_wh && !h_inode)
-		err = au_sio_cpup_wh(&cpg, file);
-	else
-		/* already copied-up after unlink */
-		err = au_reopen_wh(file, bcpup, hi_wh);
-
-	if (!err
-	    && (inode->i_nlink > 1
-		|| (inode->i_state & I_LINKABLE))
-	    && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK))
-		au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup));
-
-	return err;
-}
-
-/*
- * prepare the @file for writing.
- */
-int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin)
-{
-	int err;
-	aufs_bindex_t dbstart;
-	struct dentry *parent;
-	struct inode *inode;
-	struct super_block *sb;
-	struct file *h_file;
-	struct au_cp_generic cpg = {
-		.dentry	= file->f_path.dentry,
-		.bdst	= -1,
-		.bsrc	= -1,
-		.len	= len,
-		.pin	= pin,
-		.flags	= AuCpup_DTIME
-	};
-
-	sb = cpg.dentry->d_sb;
-	inode = d_inode(cpg.dentry);
-	cpg.bsrc = au_fbstart(file);
-	err = au_test_ro(sb, cpg.bsrc, inode);
-	if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) {
-		err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE,
-			     /*flags*/0);
-		goto out;
-	}
-
-	/* need to cpup or reopen */
-	parent = dget_parent(cpg.dentry);
-	di_write_lock_parent(parent);
-	err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
-	cpg.bdst = err;
-	if (unlikely(err < 0))
-		goto out_dgrade;
-	err = 0;
-
-	if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) {
-		err = au_cpup_dirs(cpg.dentry, cpg.bdst);
-		if (unlikely(err))
-			goto out_dgrade;
-	}
-
-	err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out_dgrade;
-
-	dbstart = au_dbstart(cpg.dentry);
-	if (dbstart <= cpg.bdst)
-		cpg.bsrc = cpg.bdst;
-
-	if (dbstart <= cpg.bdst		/* just reopen */
-	    || !d_unhashed(cpg.dentry)	/* copyup and reopen */
-		) {
-		h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0);
-		if (IS_ERR(h_file))
-			err = PTR_ERR(h_file);
-		else {
-			di_downgrade_lock(parent, AuLock_IR);
-			if (dbstart > cpg.bdst)
-				err = au_sio_cpup_simple(&cpg);
-			if (!err)
-				err = au_reopen_nondir(file);
-			au_h_open_post(cpg.dentry, cpg.bsrc, h_file);
-		}
-	} else {			/* copyup as wh and reopen */
-		/*
-		 * since writable hfsplus branch is not supported,
-		 * h_open_pre/post() are unnecessary.
-		 */
-		err = au_ready_to_write_wh(file, len, cpg.bdst, pin);
-		di_downgrade_lock(parent, AuLock_IR);
-	}
-
-	if (!err) {
-		au_pin_set_parent_lflag(pin, /*lflag*/0);
-		goto out_dput; /* success */
-	}
-	au_unpin(pin);
-	goto out_unlock;
-
-out_dgrade:
-	di_downgrade_lock(parent, AuLock_IR);
-out_unlock:
-	di_read_unlock(parent, AuLock_IR);
-out_dput:
-	dput(parent);
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_do_flush(struct file *file, fl_owner_t id,
-		int (*flush)(struct file *file, fl_owner_t id))
-{
-	int err;
-	struct super_block *sb;
-	struct inode *inode;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_noflush_read_lock(sb);
-	fi_read_lock(file);
-	ii_read_lock_child(inode);
-
-	err = flush(file, id);
-	au_cpup_attr_timesizes(inode);
-
-	ii_read_unlock(inode);
-	fi_read_unlock(file);
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_file_refresh_by_inode(struct file *file, int *need_reopen)
-{
-	int err;
-	struct au_pin pin;
-	struct au_finfo *finfo;
-	struct dentry *parent, *hi_wh;
-	struct inode *inode;
-	struct super_block *sb;
-	struct au_cp_generic cpg = {
-		.dentry	= file->f_path.dentry,
-		.bdst	= -1,
-		.bsrc	= -1,
-		.len	= -1,
-		.pin	= &pin,
-		.flags	= AuCpup_DTIME
-	};
-
-	FiMustWriteLock(file);
-
-	err = 0;
-	finfo = au_fi(file);
-	sb = cpg.dentry->d_sb;
-	inode = d_inode(cpg.dentry);
-	cpg.bdst = au_ibstart(inode);
-	if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry))
-		goto out;
-
-	parent = dget_parent(cpg.dentry);
-	if (au_test_ro(sb, cpg.bdst, inode)) {
-		di_read_lock_parent(parent, !AuLock_IR);
-		err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
-		cpg.bdst = err;
-		di_read_unlock(parent, !AuLock_IR);
-		if (unlikely(err < 0))
-			goto out_parent;
-		err = 0;
-	}
-
-	di_read_lock_parent(parent, AuLock_IR);
-	hi_wh = au_hi_wh(inode, cpg.bdst);
-	if (!S_ISDIR(inode->i_mode)
-	    && au_opt_test(au_mntflags(sb), PLINK)
-	    && au_plink_test(inode)
-	    && !d_unhashed(cpg.dentry)
-	    && cpg.bdst < au_dbstart(cpg.dentry)) {
-		err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst);
-		if (unlikely(err))
-			goto out_unlock;
-
-		/* always superio. */
-		err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
-			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-		if (!err) {
-			err = au_sio_cpup_simple(&cpg);
-			au_unpin(&pin);
-		}
-	} else if (hi_wh) {
-		/* already copied-up after unlink */
-		err = au_reopen_wh(file, cpg.bdst, hi_wh);
-		*need_reopen = 0;
-	}
-
-out_unlock:
-	di_read_unlock(parent, AuLock_IR);
-out_parent:
-	dput(parent);
-out:
-	return err;
-}
-
-static void au_do_refresh_dir(struct file *file)
-{
-	aufs_bindex_t bindex, bend, new_bindex, brid;
-	struct au_hfile *p, tmp, *q;
-	struct au_finfo *finfo;
-	struct super_block *sb;
-	struct au_fidir *fidir;
-
-	FiMustWriteLock(file);
-
-	sb = file->f_path.dentry->d_sb;
-	finfo = au_fi(file);
-	fidir = finfo->fi_hdir;
-	AuDebugOn(!fidir);
-	p = fidir->fd_hfile + finfo->fi_btop;
-	brid = p->hf_br->br_id;
-	bend = fidir->fd_bbot;
-	for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) {
-		if (!p->hf_file)
-			continue;
-
-		new_bindex = au_br_index(sb, p->hf_br->br_id);
-		if (new_bindex == bindex)
-			continue;
-		if (new_bindex < 0) {
-			au_set_h_fptr(file, bindex, NULL);
-			continue;
-		}
-
-		/* swap two lower inode, and loop again */
-		q = fidir->fd_hfile + new_bindex;
-		tmp = *q;
-		*q = *p;
-		*p = tmp;
-		if (tmp.hf_file) {
-			bindex--;
-			p--;
-		}
-	}
-
-	p = fidir->fd_hfile;
-	if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) {
-		bend = au_sbend(sb);
-		for (finfo->fi_btop = 0; finfo->fi_btop <= bend;
-		     finfo->fi_btop++, p++)
-			if (p->hf_file) {
-				if (file_inode(p->hf_file))
-					break;
-				au_hfput(p, file);
-			}
-	} else {
-		bend = au_br_index(sb, brid);
-		for (finfo->fi_btop = 0; finfo->fi_btop < bend;
-		     finfo->fi_btop++, p++)
-			if (p->hf_file)
-				au_hfput(p, file);
-		bend = au_sbend(sb);
-	}
-
-	p = fidir->fd_hfile + bend;
-	for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop;
-	     fidir->fd_bbot--, p--)
-		if (p->hf_file) {
-			if (file_inode(p->hf_file))
-				break;
-			au_hfput(p, file);
-		}
-	AuDebugOn(fidir->fd_bbot < finfo->fi_btop);
-}
-
-/*
- * after branch manipulating, refresh the file.
- */
-static int refresh_file(struct file *file, int (*reopen)(struct file *file))
-{
-	int err, need_reopen;
-	aufs_bindex_t bend, bindex;
-	struct dentry *dentry;
-	struct au_finfo *finfo;
-	struct au_hfile *hfile;
-
-	dentry = file->f_path.dentry;
-	finfo = au_fi(file);
-	if (!finfo->fi_hdir) {
-		hfile = &finfo->fi_htop;
-		AuDebugOn(!hfile->hf_file);
-		bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id);
-		AuDebugOn(bindex < 0);
-		if (bindex != finfo->fi_btop)
-			au_set_fbstart(file, bindex);
-	} else {
-		err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1);
-		if (unlikely(err))
-			goto out;
-		au_do_refresh_dir(file);
-	}
-
-	err = 0;
-	need_reopen = 1;
-	if (!au_test_mmapped(file))
-		err = au_file_refresh_by_inode(file, &need_reopen);
-	if (!err && need_reopen && !d_unlinked(dentry))
-		err = reopen(file);
-	if (!err) {
-		au_update_figen(file);
-		goto out; /* success */
-	}
-
-	/* error, close all lower files */
-	if (finfo->fi_hdir) {
-		bend = au_fbend_dir(file);
-		for (bindex = au_fbstart(file); bindex <= bend; bindex++)
-			au_set_h_fptr(file, bindex, NULL);
-	}
-
-out:
-	return err;
-}
-
-/* common function to regular file and dir */
-int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
-			  int wlock)
-{
-	int err;
-	unsigned int sigen, figen;
-	aufs_bindex_t bstart;
-	unsigned char pseudo_link;
-	struct dentry *dentry;
-	struct inode *inode;
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-	sigen = au_sigen(dentry->d_sb);
-	fi_write_lock(file);
-	figen = au_figen(file);
-	di_write_lock_child(dentry);
-	bstart = au_dbstart(dentry);
-	pseudo_link = (bstart != au_ibstart(inode));
-	if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) {
-		if (!wlock) {
-			di_downgrade_lock(dentry, AuLock_IR);
-			fi_downgrade_lock(file);
-		}
-		goto out; /* success */
-	}
-
-	AuDbg("sigen %d, figen %d\n", sigen, figen);
-	if (au_digen_test(dentry, sigen)) {
-		err = au_reval_dpath(dentry, sigen);
-		AuDebugOn(!err && au_digen_test(dentry, sigen));
-	}
-
-	if (!err)
-		err = refresh_file(file, reopen);
-	if (!err) {
-		if (!wlock) {
-			di_downgrade_lock(dentry, AuLock_IR);
-			fi_downgrade_lock(file);
-		}
-	} else {
-		di_write_unlock(dentry);
-		fi_write_unlock(file);
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* cf. aufs_nopage() */
-/* for madvise(2) */
-static int aufs_readpage(struct file *file __maybe_unused, struct page *page)
-{
-	unlock_page(page);
-	return 0;
-}
-
-/* it will never be called, but necessary to support O_DIRECT */
-static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
-{ BUG(); return 0; }
-
-/* they will never be called. */
-#ifdef CONFIG_AUFS_DEBUG
-static int aufs_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
-{ AuUnsupport(); return 0; }
-static int aufs_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned len, unsigned copied,
-			  struct page *page, void *fsdata)
-{ AuUnsupport(); return 0; }
-static int aufs_writepage(struct page *page, struct writeback_control *wbc)
-{ AuUnsupport(); return 0; }
-
-static int aufs_set_page_dirty(struct page *page)
-{ AuUnsupport(); return 0; }
-static void aufs_invalidatepage(struct page *page, unsigned int offset,
-				unsigned int length)
-{ AuUnsupport(); }
-static int aufs_releasepage(struct page *page, gfp_t gfp)
-{ AuUnsupport(); return 0; }
-static int aufs_migratepage(struct address_space *mapping, struct page *newpage,
-			    struct page *page, enum migrate_mode mode)
-{ AuUnsupport(); return 0; }
-static int aufs_launder_page(struct page *page)
-{ AuUnsupport(); return 0; }
-static int aufs_is_partially_uptodate(struct page *page,
-				      unsigned long from,
-				      unsigned long count)
-{ AuUnsupport(); return 0; }
-static void aufs_is_dirty_writeback(struct page *page, bool *dirty,
-				    bool *writeback)
-{ AuUnsupport(); }
-static int aufs_error_remove_page(struct address_space *mapping,
-				  struct page *page)
-{ AuUnsupport(); return 0; }
-static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file,
-			      sector_t *span)
-{ AuUnsupport(); return 0; }
-static void aufs_swap_deactivate(struct file *file)
-{ AuUnsupport(); }
-#endif /* CONFIG_AUFS_DEBUG */
-
-const struct address_space_operations aufs_aop = {
-	.readpage		= aufs_readpage,
-	.direct_IO		= aufs_direct_IO,
-#ifdef CONFIG_AUFS_DEBUG
-	.writepage		= aufs_writepage,
-	/* no writepages, because of writepage */
-	.set_page_dirty		= aufs_set_page_dirty,
-	/* no readpages, because of readpage */
-	.write_begin		= aufs_write_begin,
-	.write_end		= aufs_write_end,
-	/* no bmap, no block device */
-	.invalidatepage		= aufs_invalidatepage,
-	.releasepage		= aufs_releasepage,
-	.migratepage		= aufs_migratepage,
-	.launder_page		= aufs_launder_page,
-	.is_partially_uptodate	= aufs_is_partially_uptodate,
-	.is_dirty_writeback	= aufs_is_dirty_writeback,
-	.error_remove_page	= aufs_error_remove_page,
-	.swap_activate		= aufs_swap_activate,
-	.swap_deactivate	= aufs_swap_deactivate
-#endif /* CONFIG_AUFS_DEBUG */
-};
diff --git a/fs/aufs/file.h b/fs/aufs/file.h
deleted file mode 100644
index 10ede169f..000000000
--- a/fs/aufs/file.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * file operations
- */
-
-#ifndef __AUFS_FILE_H__
-#define __AUFS_FILE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include "rwsem.h"
-
-struct au_branch;
-struct au_hfile {
-	struct file		*hf_file;
-	struct au_branch	*hf_br;
-};
-
-struct au_vdir;
-struct au_fidir {
-	aufs_bindex_t		fd_bbot;
-	aufs_bindex_t		fd_nent;
-	struct au_vdir		*fd_vdir_cache;
-	struct au_hfile		fd_hfile[];
-};
-
-static inline int au_fidir_sz(int nent)
-{
-	AuDebugOn(nent < 0);
-	return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent;
-}
-
-struct au_finfo {
-	atomic_t		fi_generation;
-
-	struct au_rwsem		fi_rwsem;
-	aufs_bindex_t		fi_btop;
-
-	/* do not union them */
-	struct {				/* for non-dir */
-		struct au_hfile			fi_htop;
-		atomic_t			fi_mmapped;
-	};
-	struct au_fidir		*fi_hdir;	/* for dir only */
-
-	struct hlist_node	fi_hlist;
-	struct file		*fi_file;	/* very ugly */
-} ____cacheline_aligned_in_smp;
-
-/* ---------------------------------------------------------------------- */
-
-/* file.c */
-extern const struct address_space_operations aufs_aop;
-unsigned int au_file_roflags(unsigned int flags);
-struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
-		       struct file *file, int force_wr);
-struct au_do_open_args {
-	int		no_lock;
-	int		(*open)(struct file *file, int flags,
-				struct file *h_file);
-	struct au_fidir	*fidir;
-	struct file	*h_file;
-};
-int au_do_open(struct file *file, struct au_do_open_args *args);
-int au_reopen_nondir(struct file *file);
-struct au_pin;
-int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin);
-int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
-			  int wlock);
-int au_do_flush(struct file *file, fl_owner_t id,
-		int (*flush)(struct file *file, fl_owner_t id));
-
-/* poll.c */
-#ifdef CONFIG_AUFS_POLL
-unsigned int aufs_poll(struct file *file, poll_table *wait);
-#endif
-
-#ifdef CONFIG_AUFS_BR_HFSPLUS
-/* hfsplus.c */
-struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
-			   int force_wr);
-void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
-		    struct file *h_file);
-#else
-AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry,
-       aufs_bindex_t bindex, int force_wr)
-AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex,
-	   struct file *h_file);
-#endif
-
-/* f_op.c */
-extern const struct file_operations aufs_file_fop;
-int au_do_open_nondir(struct file *file, int flags, struct file *h_file);
-int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file);
-struct file *au_read_pre(struct file *file, int keep_fi);
-
-/* finfo.c */
-void au_hfput(struct au_hfile *hf, struct file *file);
-void au_set_h_fptr(struct file *file, aufs_bindex_t bindex,
-		   struct file *h_file);
-
-void au_update_figen(struct file *file);
-struct au_fidir *au_fidir_alloc(struct super_block *sb);
-int au_fidir_realloc(struct au_finfo *finfo, int nbr);
-
-void au_fi_init_once(void *_fi);
-void au_finfo_fin(struct file *file);
-int au_finfo_init(struct file *file, struct au_fidir *fidir);
-
-/* ioctl.c */
-long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
-			   unsigned long arg);
-long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
-			      unsigned long arg);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_finfo *au_fi(struct file *file)
-{
-	return file->private_data;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * fi_read_lock, fi_write_lock,
- * fi_read_unlock, fi_write_unlock, fi_downgrade_lock
- */
-AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem);
-
-#define FiMustNoWaiters(f)	AuRwMustNoWaiters(&au_fi(f)->fi_rwsem)
-#define FiMustAnyLock(f)	AuRwMustAnyLock(&au_fi(f)->fi_rwsem)
-#define FiMustWriteLock(f)	AuRwMustWriteLock(&au_fi(f)->fi_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: hard/soft set? */
-static inline aufs_bindex_t au_fbstart(struct file *file)
-{
-	FiMustAnyLock(file);
-	return au_fi(file)->fi_btop;
-}
-
-static inline aufs_bindex_t au_fbend_dir(struct file *file)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_hdir->fd_bbot;
-}
-
-static inline struct au_vdir *au_fvdir_cache(struct file *file)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_hdir->fd_vdir_cache;
-}
-
-static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex)
-{
-	FiMustWriteLock(file);
-	au_fi(file)->fi_btop = bindex;
-}
-
-static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex)
-{
-	FiMustWriteLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	au_fi(file)->fi_hdir->fd_bbot = bindex;
-}
-
-static inline void au_set_fvdir_cache(struct file *file,
-				      struct au_vdir *vdir_cache)
-{
-	FiMustWriteLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache;
-}
-
-static inline struct file *au_hf_top(struct file *file)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_htop.hf_file;
-}
-
-static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file;
-}
-
-/* todo: memory barrier? */
-static inline unsigned int au_figen(struct file *f)
-{
-	return atomic_read(&au_fi(f)->fi_generation);
-}
-
-static inline void au_set_mmapped(struct file *f)
-{
-	if (atomic_inc_return(&au_fi(f)->fi_mmapped))
-		return;
-	pr_warn("fi_mmapped wrapped around\n");
-	while (!atomic_inc_return(&au_fi(f)->fi_mmapped))
-		;
-}
-
-static inline void au_unset_mmapped(struct file *f)
-{
-	atomic_dec(&au_fi(f)->fi_mmapped);
-}
-
-static inline int au_test_mmapped(struct file *f)
-{
-	return atomic_read(&au_fi(f)->fi_mmapped);
-}
-
-/* customize vma->vm_file */
-
-static inline void au_do_vm_file_reset(struct vm_area_struct *vma,
-				       struct file *file)
-{
-	struct file *f;
-
-	f = vma->vm_file;
-	get_file(file);
-	vma->vm_file = file;
-	fput(f);
-}
-
-#ifdef CONFIG_MMU
-#define AuDbgVmRegion(file, vma) do {} while (0)
-
-static inline void au_vm_file_reset(struct vm_area_struct *vma,
-				    struct file *file)
-{
-	au_do_vm_file_reset(vma, file);
-}
-#else
-#define AuDbgVmRegion(file, vma) \
-	AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file))
-
-static inline void au_vm_file_reset(struct vm_area_struct *vma,
-				    struct file *file)
-{
-	struct file *f;
-
-	au_do_vm_file_reset(vma, file);
-	f = vma->vm_region->vm_file;
-	get_file(file);
-	vma->vm_region->vm_file = file;
-	fput(f);
-}
-#endif /* CONFIG_MMU */
-
-/* handle vma->vm_prfile */
-static inline void au_vm_prfile_set(struct vm_area_struct *vma,
-				    struct file *file)
-{
-	get_file(file);
-	vma->vm_prfile = file;
-#ifndef CONFIG_MMU
-	get_file(file);
-	vma->vm_region->vm_prfile = file;
-#endif
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_FILE_H__ */
diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c
deleted file mode 100644
index 7ac467cf7..000000000
--- a/fs/aufs/finfo.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * file private data
- */
-
-#include "aufs.h"
-
-void au_hfput(struct au_hfile *hf, struct file *file)
-{
-	/* todo: direct access f_flags */
-	if (vfsub_file_flags(file) & __FMODE_EXEC)
-		allow_write_access(hf->hf_file);
-	fput(hf->hf_file);
-	hf->hf_file = NULL;
-	atomic_dec(&hf->hf_br->br_count);
-	hf->hf_br = NULL;
-}
-
-void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val)
-{
-	struct au_finfo *finfo = au_fi(file);
-	struct au_hfile *hf;
-	struct au_fidir *fidir;
-
-	fidir = finfo->fi_hdir;
-	if (!fidir) {
-		AuDebugOn(finfo->fi_btop != bindex);
-		hf = &finfo->fi_htop;
-	} else
-		hf = fidir->fd_hfile + bindex;
-
-	if (hf && hf->hf_file)
-		au_hfput(hf, file);
-	if (val) {
-		FiMustWriteLock(file);
-		AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry));
-		hf->hf_file = val;
-		hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex);
-	}
-}
-
-void au_update_figen(struct file *file)
-{
-	atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry));
-	/* smp_mb(); */ /* atomic_set */
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_fidir *au_fidir_alloc(struct super_block *sb)
-{
-	struct au_fidir *fidir;
-	int nbr;
-
-	nbr = au_sbend(sb) + 1;
-	if (nbr < 2)
-		nbr = 2; /* initial allocate for 2 branches */
-	fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS);
-	if (fidir) {
-		fidir->fd_bbot = -1;
-		fidir->fd_nent = nbr;
-		fidir->fd_vdir_cache = NULL;
-	}
-
-	return fidir;
-}
-
-int au_fidir_realloc(struct au_finfo *finfo, int nbr)
-{
-	int err;
-	struct au_fidir *fidir, *p;
-
-	AuRwMustWriteLock(&finfo->fi_rwsem);
-	fidir = finfo->fi_hdir;
-	AuDebugOn(!fidir);
-
-	err = -ENOMEM;
-	p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr),
-			 GFP_NOFS);
-	if (p) {
-		p->fd_nent = nbr;
-		finfo->fi_hdir = p;
-		err = 0;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_finfo_fin(struct file *file)
-{
-	struct au_finfo *finfo;
-
-	au_nfiles_dec(file->f_path.dentry->d_sb);
-
-	finfo = au_fi(file);
-	AuDebugOn(finfo->fi_hdir);
-	AuRwDestroy(&finfo->fi_rwsem);
-	au_cache_free_finfo(finfo);
-}
-
-void au_fi_init_once(void *_finfo)
-{
-	struct au_finfo *finfo = _finfo;
-	static struct lock_class_key aufs_fi;
-
-	au_rw_init(&finfo->fi_rwsem);
-	au_rw_class(&finfo->fi_rwsem, &aufs_fi);
-}
-
-int au_finfo_init(struct file *file, struct au_fidir *fidir)
-{
-	int err;
-	struct au_finfo *finfo;
-	struct dentry *dentry;
-
-	err = -ENOMEM;
-	dentry = file->f_path.dentry;
-	finfo = au_cache_alloc_finfo();
-	if (unlikely(!finfo))
-		goto out;
-
-	err = 0;
-	au_nfiles_inc(dentry->d_sb);
-	/* verbose coding for lock class name */
-	if (!fidir)
-		au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcNonDir_FIINFO);
-	else
-		au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcDir_FIINFO);
-	au_rw_write_lock(&finfo->fi_rwsem);
-	finfo->fi_btop = -1;
-	finfo->fi_hdir = fidir;
-	atomic_set(&finfo->fi_generation, au_digen(dentry));
-	/* smp_mb(); */ /* atomic_set */
-
-	file->private_data = finfo;
-
-out:
-	return err;
-}
diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h
deleted file mode 100644
index 8b6878b4e..000000000
--- a/fs/aufs/fstype.h
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * judging filesystem type
- */
-
-#ifndef __AUFS_FSTYPE_H__
-#define __AUFS_FSTYPE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/magic.h>
-#include <linux/romfs_fs.h>
-#include <linux/nfs_fs.h>
-
-static inline int au_test_aufs(struct super_block *sb)
-{
-	return sb->s_magic == AUFS_SUPER_MAGIC;
-}
-
-static inline const char *au_sbtype(struct super_block *sb)
-{
-	return sb->s_type->name;
-}
-
-static inline int au_test_iso9660(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE)
-	return sb->s_magic == ISOFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_romfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE)
-	return sb->s_magic == ROMFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_cramfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE)
-	return sb->s_magic == CRAMFS_MAGIC;
-#endif
-	return 0;
-}
-
-static inline int au_test_nfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE)
-	return sb->s_magic == NFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_fuse(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
-	return sb->s_magic == FUSE_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_xfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE)
-	return sb->s_magic == XFS_SB_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_tmpfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_TMPFS
-	return sb->s_magic == TMPFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE)
-	return !strcmp(au_sbtype(sb), "ecryptfs");
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_ramfs(struct super_block *sb)
-{
-	return sb->s_magic == RAMFS_MAGIC;
-}
-
-static inline int au_test_ubifs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE)
-	return sb->s_magic == UBIFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_procfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_PROC_FS
-	return sb->s_magic == PROC_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_sysfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_SYSFS
-	return sb->s_magic == SYSFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_configfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE)
-	return sb->s_magic == CONFIGFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_minix(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE)
-	return sb->s_magic == MINIX3_SUPER_MAGIC
-		|| sb->s_magic == MINIX2_SUPER_MAGIC
-		|| sb->s_magic == MINIX2_SUPER_MAGIC2
-		|| sb->s_magic == MINIX_SUPER_MAGIC
-		|| sb->s_magic == MINIX_SUPER_MAGIC2;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_fat(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE)
-	return sb->s_magic == MSDOS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_msdos(struct super_block *sb)
-{
-	return au_test_fat(sb);
-}
-
-static inline int au_test_vfat(struct super_block *sb)
-{
-	return au_test_fat(sb);
-}
-
-static inline int au_test_securityfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_SECURITYFS
-	return sb->s_magic == SECURITYFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_squashfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE)
-	return sb->s_magic == SQUASHFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_btrfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE)
-	return sb->s_magic == BTRFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_xenfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE)
-	return sb->s_magic == XENFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_debugfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_DEBUG_FS
-	return sb->s_magic == DEBUGFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_nilfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE)
-	return sb->s_magic == NILFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_hfsplus(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE)
-	return sb->s_magic == HFSPLUS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * they can't be an aufs branch.
- */
-static inline int au_test_fs_unsuppoted(struct super_block *sb)
-{
-	return
-#ifndef CONFIG_AUFS_BR_RAMFS
-		au_test_ramfs(sb) ||
-#endif
-		au_test_procfs(sb)
-		|| au_test_sysfs(sb)
-		|| au_test_configfs(sb)
-		|| au_test_debugfs(sb)
-		|| au_test_securityfs(sb)
-		|| au_test_xenfs(sb)
-		|| au_test_ecryptfs(sb)
-		/* || !strcmp(au_sbtype(sb), "unionfs") */
-		|| au_test_aufs(sb); /* will be supported in next version */
-}
-
-static inline int au_test_fs_remote(struct super_block *sb)
-{
-	return !au_test_tmpfs(sb)
-#ifdef CONFIG_AUFS_BR_RAMFS
-		&& !au_test_ramfs(sb)
-#endif
-		&& !(sb->s_type->fs_flags & FS_REQUIRES_DEV);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * Note: these functions (below) are created after reading ->getattr() in all
- * filesystems under linux/fs. it means we have to do so in every update...
- */
-
-/*
- * some filesystems require getattr to refresh the inode attributes before
- * referencing.
- * in most cases, we can rely on the inode attribute in NFS (or every remote fs)
- * and leave the work for d_revalidate()
- */
-static inline int au_test_fs_refresh_iattr(struct super_block *sb)
-{
-	return au_test_nfs(sb)
-		|| au_test_fuse(sb)
-		/* || au_test_btrfs(sb) */	/* untested */
-		;
-}
-
-/*
- * filesystems which don't maintain i_size or i_blocks.
- */
-static inline int au_test_fs_bad_iattr_size(struct super_block *sb)
-{
-	return au_test_xfs(sb)
-		|| au_test_btrfs(sb)
-		|| au_test_ubifs(sb)
-		|| au_test_hfsplus(sb)	/* maintained, but incorrect */
-		/* || au_test_minix(sb) */	/* untested */
-		;
-}
-
-/*
- * filesystems which don't store the correct value in some of their inode
- * attributes.
- */
-static inline int au_test_fs_bad_iattr(struct super_block *sb)
-{
-	return au_test_fs_bad_iattr_size(sb)
-		|| au_test_fat(sb)
-		|| au_test_msdos(sb)
-		|| au_test_vfat(sb);
-}
-
-/* they don't check i_nlink in link(2) */
-static inline int au_test_fs_no_limit_nlink(struct super_block *sb)
-{
-	return au_test_tmpfs(sb)
-#ifdef CONFIG_AUFS_BR_RAMFS
-		|| au_test_ramfs(sb)
-#endif
-		|| au_test_ubifs(sb)
-		|| au_test_hfsplus(sb);
-}
-
-/*
- * filesystems which sets S_NOATIME and S_NOCMTIME.
- */
-static inline int au_test_fs_notime(struct super_block *sb)
-{
-	return au_test_nfs(sb)
-		|| au_test_fuse(sb)
-		|| au_test_ubifs(sb)
-		;
-}
-
-/* temporary support for i#1 in cramfs */
-static inline int au_test_fs_unique_ino(struct inode *inode)
-{
-	if (au_test_cramfs(inode->i_sb))
-		return inode->i_ino != 1;
-	return 1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * the filesystem where the xino files placed must support i/o after unlink and
- * maintain i_size and i_blocks.
- */
-static inline int au_test_fs_bad_xino(struct super_block *sb)
-{
-	return au_test_fs_remote(sb)
-		|| au_test_fs_bad_iattr_size(sb)
-		/* don't want unnecessary work for xino */
-		|| au_test_aufs(sb)
-		|| au_test_ecryptfs(sb)
-		|| au_test_nilfs(sb);
-}
-
-static inline int au_test_fs_trunc_xino(struct super_block *sb)
-{
-	return au_test_tmpfs(sb)
-		|| au_test_ramfs(sb);
-}
-
-/*
- * test if the @sb is real-readonly.
- */
-static inline int au_test_fs_rr(struct super_block *sb)
-{
-	return au_test_squashfs(sb)
-		|| au_test_iso9660(sb)
-		|| au_test_cramfs(sb)
-		|| au_test_romfs(sb);
-}
-
-/*
- * test if the @inode is nfs with 'noacl' option
- * NFS always sets MS_POSIXACL regardless its mount option 'noacl.'
- */
-static inline int au_test_nfs_noacl(struct inode *inode)
-{
-	return au_test_nfs(inode->i_sb)
-		/* && IS_POSIXACL(inode) */
-		&& !nfs_server_capable(inode, NFS_CAP_ACLS);
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_FSTYPE_H__ */
diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c
deleted file mode 100644
index dc4f6c3c9..000000000
--- a/fs/aufs/hfsnotify.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * fsnotify for the lower directories
- */
-
-#include "aufs.h"
-
-/* FS_IN_IGNORED is unnecessary */
-static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE
-				 | FS_CREATE | FS_EVENT_ON_CHILD);
-static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq);
-static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0);
-
-static void au_hfsn_free_mark(struct fsnotify_mark *mark)
-{
-	struct au_hnotify *hn = container_of(mark, struct au_hnotify,
-					     hn_mark);
-	AuDbg("here\n");
-	au_cache_free_hnotify(hn);
-	smp_mb__before_atomic();
-	if (atomic64_dec_and_test(&au_hfsn_ifree))
-		wake_up(&au_hfsn_wq);
-}
-
-static int au_hfsn_alloc(struct au_hinode *hinode)
-{
-	int err;
-	struct au_hnotify *hn;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct fsnotify_mark *mark;
-	aufs_bindex_t bindex;
-
-	hn = hinode->hi_notify;
-	sb = hn->hn_aufs_inode->i_sb;
-	bindex = au_br_index(sb, hinode->hi_id);
-	br = au_sbr(sb, bindex);
-	AuDebugOn(!br->br_hfsn);
-
-	mark = &hn->hn_mark;
-	fsnotify_init_mark(mark, au_hfsn_free_mark);
-	mark->mask = AuHfsnMask;
-	/*
-	 * by udba rename or rmdir, aufs assign a new inode to the known
-	 * h_inode, so specify 1 to allow dups.
-	 */
-	lockdep_off();
-	err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode,
-				 /*mnt*/NULL, /*allow_dups*/1);
-	/* even if err */
-	fsnotify_put_mark(mark);
-	lockdep_on();
-
-	return err;
-}
-
-static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn)
-{
-	struct fsnotify_mark *mark;
-	unsigned long long ull;
-	struct fsnotify_group *group;
-
-	ull = atomic64_inc_return(&au_hfsn_ifree);
-	BUG_ON(!ull);
-
-	mark = &hn->hn_mark;
-	spin_lock(&mark->lock);
-	group = mark->group;
-	fsnotify_get_group(group);
-	spin_unlock(&mark->lock);
-	lockdep_off();
-	fsnotify_destroy_mark(mark, group);
-	fsnotify_put_group(group);
-	lockdep_on();
-
-	/* free hn by myself */
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_ctl(struct au_hinode *hinode, int do_set)
-{
-	struct fsnotify_mark *mark;
-
-	mark = &hinode->hi_notify->hn_mark;
-	spin_lock(&mark->lock);
-	if (do_set) {
-		AuDebugOn(mark->mask & AuHfsnMask);
-		mark->mask |= AuHfsnMask;
-	} else {
-		AuDebugOn(!(mark->mask & AuHfsnMask));
-		mark->mask &= ~AuHfsnMask;
-	}
-	spin_unlock(&mark->lock);
-	/* fsnotify_recalc_inode_mask(hinode->hi_inode); */
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* #define AuDbgHnotify */
-#ifdef AuDbgHnotify
-static char *au_hfsn_name(u32 mask)
-{
-#ifdef CONFIG_AUFS_DEBUG
-#define test_ret(flag)				\
-	do {					\
-		if (mask & flag)		\
-			return #flag;		\
-	} while (0)
-	test_ret(FS_ACCESS);
-	test_ret(FS_MODIFY);
-	test_ret(FS_ATTRIB);
-	test_ret(FS_CLOSE_WRITE);
-	test_ret(FS_CLOSE_NOWRITE);
-	test_ret(FS_OPEN);
-	test_ret(FS_MOVED_FROM);
-	test_ret(FS_MOVED_TO);
-	test_ret(FS_CREATE);
-	test_ret(FS_DELETE);
-	test_ret(FS_DELETE_SELF);
-	test_ret(FS_MOVE_SELF);
-	test_ret(FS_UNMOUNT);
-	test_ret(FS_Q_OVERFLOW);
-	test_ret(FS_IN_IGNORED);
-	test_ret(FS_ISDIR);
-	test_ret(FS_IN_ONESHOT);
-	test_ret(FS_EVENT_ON_CHILD);
-	return "";
-#undef test_ret
-#else
-	return "??";
-#endif
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_free_group(struct fsnotify_group *group)
-{
-	struct au_br_hfsnotify *hfsn = group->private;
-
-	AuDbg("here\n");
-	kfree(hfsn);
-}
-
-static int au_hfsn_handle_event(struct fsnotify_group *group,
-				struct inode *inode,
-				struct fsnotify_mark *inode_mark,
-				struct fsnotify_mark *vfsmount_mark,
-				u32 mask, void *data, int data_type,
-				const unsigned char *file_name, u32 cookie)
-{
-	int err;
-	struct au_hnotify *hnotify;
-	struct inode *h_dir, *h_inode;
-	struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name));
-
-	AuDebugOn(data_type != FSNOTIFY_EVENT_INODE);
-
-	err = 0;
-	/* if FS_UNMOUNT happens, there must be another bug */
-	AuDebugOn(mask & FS_UNMOUNT);
-	if (mask & (FS_IN_IGNORED | FS_UNMOUNT))
-		goto out;
-
-	h_dir = inode;
-	h_inode = NULL;
-#ifdef AuDbgHnotify
-	au_debug_on();
-	if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1
-	    || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) {
-		AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n",
-		      h_dir->i_ino, mask, au_hfsn_name(mask),
-		      AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0);
-		/* WARN_ON(1); */
-	}
-	au_debug_off();
-#endif
-
-	AuDebugOn(!inode_mark);
-	hnotify = container_of(inode_mark, struct au_hnotify, hn_mark);
-	err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode);
-
-out:
-	return err;
-}
-
-static struct fsnotify_ops au_hfsn_ops = {
-	.handle_event		= au_hfsn_handle_event,
-	.free_group_priv	= au_hfsn_free_group
-};
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_fin_br(struct au_branch *br)
-{
-	struct au_br_hfsnotify *hfsn;
-
-	hfsn = br->br_hfsn;
-	if (hfsn) {
-		lockdep_off();
-		fsnotify_put_group(hfsn->hfsn_group);
-		lockdep_on();
-	}
-}
-
-static int au_hfsn_init_br(struct au_branch *br, int perm)
-{
-	int err;
-	struct fsnotify_group *group;
-	struct au_br_hfsnotify *hfsn;
-
-	err = 0;
-	br->br_hfsn = NULL;
-	if (!au_br_hnotifyable(perm))
-		goto out;
-
-	err = -ENOMEM;
-	hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS);
-	if (unlikely(!hfsn))
-		goto out;
-
-	err = 0;
-	group = fsnotify_alloc_group(&au_hfsn_ops);
-	if (IS_ERR(group)) {
-		err = PTR_ERR(group);
-		pr_err("fsnotify_alloc_group() failed, %d\n", err);
-		goto out_hfsn;
-	}
-
-	group->private = hfsn;
-	hfsn->hfsn_group = group;
-	br->br_hfsn = hfsn;
-	goto out; /* success */
-
-out_hfsn:
-	kfree(hfsn);
-out:
-	return err;
-}
-
-static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm)
-{
-	int err;
-
-	err = 0;
-	if (!br->br_hfsn)
-		err = au_hfsn_init_br(br, perm);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_fin(void)
-{
-	AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree));
-	wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree));
-}
-
-const struct au_hnotify_op au_hnotify_op = {
-	.ctl		= au_hfsn_ctl,
-	.alloc		= au_hfsn_alloc,
-	.free		= au_hfsn_free,
-
-	.fin		= au_hfsn_fin,
-
-	.reset_br	= au_hfsn_reset_br,
-	.fin_br		= au_hfsn_fin_br,
-	.init_br	= au_hfsn_init_br
-};
diff --git a/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c
deleted file mode 100644
index ced054639..000000000
--- a/fs/aufs/hfsplus.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2010-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * special support for filesystems which aqucires an inode mutex
- * at final closing a file, eg, hfsplus.
- *
- * This trick is very simple and stupid, just to open the file before really
- * neceeary open to tell hfsplus that this is not the final closing.
- * The caller should call au_h_open_pre() after acquiring the inode mutex,
- * and au_h_open_post() after releasing it.
- */
-
-#include "aufs.h"
-
-struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
-			   int force_wr)
-{
-	struct file *h_file;
-	struct dentry *h_dentry;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	AuDebugOn(!h_dentry);
-	AuDebugOn(d_is_negative(h_dentry));
-
-	h_file = NULL;
-	if (au_test_hfsplus(h_dentry->d_sb)
-	    && d_is_reg(h_dentry))
-		h_file = au_h_open(dentry, bindex,
-				   O_RDONLY | O_NOATIME | O_LARGEFILE,
-				   /*file*/NULL, force_wr);
-	return h_file;
-}
-
-void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
-		    struct file *h_file)
-{
-	if (h_file) {
-		fput(h_file);
-		au_sbr_put(dentry->d_sb, bindex);
-	}
-}
diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c
deleted file mode 100644
index 643bc57c5..000000000
--- a/fs/aufs/hnotify.c
+++ /dev/null
@@ -1,710 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * abstraction to notify the direct changes on lower directories
- */
-
-#include "aufs.h"
-
-int au_hn_alloc(struct au_hinode *hinode, struct inode *inode)
-{
-	int err;
-	struct au_hnotify *hn;
-
-	err = -ENOMEM;
-	hn = au_cache_alloc_hnotify();
-	if (hn) {
-		hn->hn_aufs_inode = inode;
-		hinode->hi_notify = hn;
-		err = au_hnotify_op.alloc(hinode);
-		AuTraceErr(err);
-		if (unlikely(err)) {
-			hinode->hi_notify = NULL;
-			au_cache_free_hnotify(hn);
-			/*
-			 * The upper dir was removed by udba, but the same named
-			 * dir left. In this case, aufs assignes a new inode
-			 * number and set the monitor again.
-			 * For the lower dir, the old monitnor is still left.
-			 */
-			if (err == -EEXIST)
-				err = 0;
-		}
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-void au_hn_free(struct au_hinode *hinode)
-{
-	struct au_hnotify *hn;
-
-	hn = hinode->hi_notify;
-	if (hn) {
-		hinode->hi_notify = NULL;
-		if (au_hnotify_op.free(hinode, hn))
-			au_cache_free_hnotify(hn);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_hn_ctl(struct au_hinode *hinode, int do_set)
-{
-	if (hinode->hi_notify)
-		au_hnotify_op.ctl(hinode, do_set);
-}
-
-void au_hn_reset(struct inode *inode, unsigned int flags)
-{
-	aufs_bindex_t bindex, bend;
-	struct inode *hi;
-	struct dentry *iwhdentry;
-
-	bend = au_ibend(inode);
-	for (bindex = au_ibstart(inode); bindex <= bend; bindex++) {
-		hi = au_h_iptr(inode, bindex);
-		if (!hi)
-			continue;
-
-		/* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */
-		iwhdentry = au_hi_wh(inode, bindex);
-		if (iwhdentry)
-			dget(iwhdentry);
-		au_igrab(hi);
-		au_set_h_iptr(inode, bindex, NULL, 0);
-		au_set_h_iptr(inode, bindex, au_igrab(hi),
-			      flags & ~AuHi_XINO);
-		iput(hi);
-		dput(iwhdentry);
-		/* mutex_unlock(&hi->i_mutex); */
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int hn_xino(struct inode *inode, struct inode *h_inode)
-{
-	int err;
-	aufs_bindex_t bindex, bend, bfound, bstart;
-	struct inode *h_i;
-
-	err = 0;
-	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
-		pr_warn("branch root dir was changed\n");
-		goto out;
-	}
-
-	bfound = -1;
-	bend = au_ibend(inode);
-	bstart = au_ibstart(inode);
-#if 0 /* reserved for future use */
-	if (bindex == bend) {
-		/* keep this ino in rename case */
-		goto out;
-	}
-#endif
-	for (bindex = bstart; bindex <= bend; bindex++)
-		if (au_h_iptr(inode, bindex) == h_inode) {
-			bfound = bindex;
-			break;
-		}
-	if (bfound < 0)
-		goto out;
-
-	for (bindex = bstart; bindex <= bend; bindex++) {
-		h_i = au_h_iptr(inode, bindex);
-		if (!h_i)
-			continue;
-
-		err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0);
-		/* ignore this error */
-		/* bad action? */
-	}
-
-	/* children inode number will be broken */
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int hn_gen_tree(struct dentry *dentry)
-{
-	int err, i, j, ndentry;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, dentry, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	for (i = 0; i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		ndentry = dpage->ndentry;
-		for (j = 0; j < ndentry; j++) {
-			struct dentry *d;
-
-			d = dentries[j];
-			if (IS_ROOT(d))
-				continue;
-
-			au_digen_dec(d);
-			if (d_really_is_positive(d))
-				/* todo: reset children xino?
-				   cached children only? */
-				au_iigen_dec(d_inode(d));
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-
-#if 0
-	/* discard children */
-	dentry_unhash(dentry);
-	dput(dentry);
-#endif
-out:
-	return err;
-}
-
-/*
- * return 0 if processed.
- */
-static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode,
-			   const unsigned int isdir)
-{
-	int err;
-	struct dentry *d;
-	struct qstr *dname;
-
-	err = 1;
-	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
-		pr_warn("branch root dir was changed\n");
-		err = 0;
-		goto out;
-	}
-
-	if (!isdir) {
-		AuDebugOn(!name);
-		au_iigen_dec(inode);
-		spin_lock(&inode->i_lock);
-		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
-			spin_lock(&d->d_lock);
-			dname = &d->d_name;
-			if (dname->len != nlen
-			    && memcmp(dname->name, name, nlen)) {
-				spin_unlock(&d->d_lock);
-				continue;
-			}
-			err = 0;
-			au_digen_dec(d);
-			spin_unlock(&d->d_lock);
-			break;
-		}
-		spin_unlock(&inode->i_lock);
-	} else {
-		au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR);
-		d = d_find_any_alias(inode);
-		if (!d) {
-			au_iigen_dec(inode);
-			goto out;
-		}
-
-		spin_lock(&d->d_lock);
-		dname = &d->d_name;
-		if (dname->len == nlen && !memcmp(dname->name, name, nlen)) {
-			spin_unlock(&d->d_lock);
-			err = hn_gen_tree(d);
-			spin_lock(&d->d_lock);
-		}
-		spin_unlock(&d->d_lock);
-		dput(d);
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir)
-{
-	int err;
-
-	if (IS_ROOT(dentry)) {
-		pr_warn("branch root dir was changed\n");
-		return 0;
-	}
-
-	err = 0;
-	if (!isdir) {
-		au_digen_dec(dentry);
-		if (d_really_is_positive(dentry))
-			au_iigen_dec(d_inode(dentry));
-	} else {
-		au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR);
-		if (d_really_is_positive(dentry))
-			err = hn_gen_tree(dentry);
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* hnotify job flags */
-#define AuHnJob_XINO0		1
-#define AuHnJob_GEN		(1 << 1)
-#define AuHnJob_DIRENT		(1 << 2)
-#define AuHnJob_ISDIR		(1 << 3)
-#define AuHnJob_TRYXINO0	(1 << 4)
-#define AuHnJob_MNTPNT		(1 << 5)
-#define au_ftest_hnjob(flags, name)	((flags) & AuHnJob_##name)
-#define au_fset_hnjob(flags, name) \
-	do { (flags) |= AuHnJob_##name; } while (0)
-#define au_fclr_hnjob(flags, name) \
-	do { (flags) &= ~AuHnJob_##name; } while (0)
-
-enum {
-	AuHn_CHILD,
-	AuHn_PARENT,
-	AuHnLast
-};
-
-struct au_hnotify_args {
-	struct inode *h_dir, *dir, *h_child_inode;
-	u32 mask;
-	unsigned int flags[AuHnLast];
-	unsigned int h_child_nlen;
-	char h_child_name[];
-};
-
-struct hn_job_args {
-	unsigned int flags;
-	struct inode *inode, *h_inode, *dir, *h_dir;
-	struct dentry *dentry;
-	char *h_name;
-	int h_nlen;
-};
-
-static int hn_job(struct hn_job_args *a)
-{
-	const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR);
-	int e;
-
-	/* reset xino */
-	if (au_ftest_hnjob(a->flags, XINO0) && a->inode)
-		hn_xino(a->inode, a->h_inode); /* ignore this error */
-
-	if (au_ftest_hnjob(a->flags, TRYXINO0)
-	    && a->inode
-	    && a->h_inode) {
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-		if (!a->h_inode->i_nlink
-		    && !(a->h_inode->i_state & I_LINKABLE))
-			hn_xino(a->inode, a->h_inode); /* ignore this error */
-		mutex_unlock(&a->h_inode->i_mutex);
-	}
-
-	/* make the generation obsolete */
-	if (au_ftest_hnjob(a->flags, GEN)) {
-		e = -1;
-		if (a->inode)
-			e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode,
-					      isdir);
-		if (e && a->dentry)
-			hn_gen_by_name(a->dentry, isdir);
-		/* ignore this error */
-	}
-
-	/* make dir entries obsolete */
-	if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) {
-		struct au_vdir *vdir;
-
-		vdir = au_ivdir(a->inode);
-		if (vdir)
-			vdir->vd_jiffy = 0;
-		/* IMustLock(a->inode); */
-		/* a->inode->i_version++; */
-	}
-
-	/* can do nothing but warn */
-	if (au_ftest_hnjob(a->flags, MNTPNT)
-	    && a->dentry
-	    && d_mountpoint(a->dentry))
-		pr_warn("mount-point %pd is removed or renamed\n", a->dentry);
-
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen,
-					   struct inode *dir)
-{
-	struct dentry *dentry, *d, *parent;
-	struct qstr *dname;
-
-	parent = d_find_any_alias(dir);
-	if (!parent)
-		return NULL;
-
-	dentry = NULL;
-	spin_lock(&parent->d_lock);
-	list_for_each_entry(d, &parent->d_subdirs, d_child) {
-		/* AuDbg("%pd\n", d); */
-		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		dname = &d->d_name;
-		if (dname->len != nlen || memcmp(dname->name, name, nlen))
-			goto cont_unlock;
-		if (au_di(d))
-			au_digen_dec(d);
-		else
-			goto cont_unlock;
-		if (au_dcount(d) > 0) {
-			dentry = dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			break;
-		}
-
-cont_unlock:
-		spin_unlock(&d->d_lock);
-	}
-	spin_unlock(&parent->d_lock);
-	dput(parent);
-
-	if (dentry)
-		di_write_lock_child(dentry);
-
-	return dentry;
-}
-
-static struct inode *lookup_wlock_by_ino(struct super_block *sb,
-					 aufs_bindex_t bindex, ino_t h_ino)
-{
-	struct inode *inode;
-	ino_t ino;
-	int err;
-
-	inode = NULL;
-	err = au_xino_read(sb, bindex, h_ino, &ino);
-	if (!err && ino)
-		inode = ilookup(sb, ino);
-	if (!inode)
-		goto out;
-
-	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
-		pr_warn("wrong root branch\n");
-		iput(inode);
-		inode = NULL;
-		goto out;
-	}
-
-	ii_write_lock_child(inode);
-
-out:
-	return inode;
-}
-
-static void au_hn_bh(void *_args)
-{
-	struct au_hnotify_args *a = _args;
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend, bfound;
-	unsigned char xino, try_iput;
-	int err;
-	struct inode *inode;
-	ino_t h_ino;
-	struct hn_job_args args;
-	struct dentry *dentry;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(!_args);
-	AuDebugOn(!a->h_dir);
-	AuDebugOn(!a->dir);
-	AuDebugOn(!a->mask);
-	AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n",
-	      a->mask, a->dir->i_ino, a->h_dir->i_ino,
-	      a->h_child_inode ? a->h_child_inode->i_ino : 0);
-
-	inode = NULL;
-	dentry = NULL;
-	/*
-	 * do not lock a->dir->i_mutex here
-	 * because of d_revalidate() may cause a deadlock.
-	 */
-	sb = a->dir->i_sb;
-	AuDebugOn(!sb);
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!sbinfo);
-	si_write_lock(sb, AuLock_NOPLMW);
-
-	ii_read_lock_parent(a->dir);
-	bfound = -1;
-	bend = au_ibend(a->dir);
-	for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++)
-		if (au_h_iptr(a->dir, bindex) == a->h_dir) {
-			bfound = bindex;
-			break;
-		}
-	ii_read_unlock(a->dir);
-	if (unlikely(bfound < 0))
-		goto out;
-
-	xino = !!au_opt_test(au_mntflags(sb), XINO);
-	h_ino = 0;
-	if (a->h_child_inode)
-		h_ino = a->h_child_inode->i_ino;
-
-	if (a->h_child_nlen
-	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN)
-		|| au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT)))
-		dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen,
-					      a->dir);
-	try_iput = 0;
-	if (dentry && d_really_is_positive(dentry))
-		inode = d_inode(dentry);
-	if (xino && !inode && h_ino
-	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0)
-		|| au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0)
-		|| au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) {
-		inode = lookup_wlock_by_ino(sb, bfound, h_ino);
-		try_iput = 1;
-	    }
-
-	args.flags = a->flags[AuHn_CHILD];
-	args.dentry = dentry;
-	args.inode = inode;
-	args.h_inode = a->h_child_inode;
-	args.dir = a->dir;
-	args.h_dir = a->h_dir;
-	args.h_name = a->h_child_name;
-	args.h_nlen = a->h_child_nlen;
-	err = hn_job(&args);
-	if (dentry) {
-		if (au_di(dentry))
-			di_write_unlock(dentry);
-		dput(dentry);
-	}
-	if (inode && try_iput) {
-		ii_write_unlock(inode);
-		iput(inode);
-	}
-
-	ii_write_lock_parent(a->dir);
-	args.flags = a->flags[AuHn_PARENT];
-	args.dentry = NULL;
-	args.inode = a->dir;
-	args.h_inode = a->h_dir;
-	args.dir = NULL;
-	args.h_dir = NULL;
-	args.h_name = NULL;
-	args.h_nlen = 0;
-	err = hn_job(&args);
-	ii_write_unlock(a->dir);
-
-out:
-	iput(a->h_child_inode);
-	iput(a->h_dir);
-	iput(a->dir);
-	si_write_unlock(sb);
-	au_nwt_done(&sbinfo->si_nowait);
-	kfree(a);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
-	       struct qstr *h_child_qstr, struct inode *h_child_inode)
-{
-	int err, len;
-	unsigned int flags[AuHnLast], f;
-	unsigned char isdir, isroot, wh;
-	struct inode *dir;
-	struct au_hnotify_args *args;
-	char *p, *h_child_name;
-
-	err = 0;
-	AuDebugOn(!hnotify || !hnotify->hn_aufs_inode);
-	dir = igrab(hnotify->hn_aufs_inode);
-	if (!dir)
-		goto out;
-
-	isroot = (dir->i_ino == AUFS_ROOT_INO);
-	wh = 0;
-	h_child_name = (void *)h_child_qstr->name;
-	len = h_child_qstr->len;
-	if (h_child_name) {
-		if (len > AUFS_WH_PFX_LEN
-		    && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
-			h_child_name += AUFS_WH_PFX_LEN;
-			len -= AUFS_WH_PFX_LEN;
-			wh = 1;
-		}
-	}
-
-	isdir = 0;
-	if (h_child_inode)
-		isdir = !!S_ISDIR(h_child_inode->i_mode);
-	flags[AuHn_PARENT] = AuHnJob_ISDIR;
-	flags[AuHn_CHILD] = 0;
-	if (isdir)
-		flags[AuHn_CHILD] = AuHnJob_ISDIR;
-	au_fset_hnjob(flags[AuHn_PARENT], DIRENT);
-	au_fset_hnjob(flags[AuHn_CHILD], GEN);
-	switch (mask & FS_EVENTS_POSS_ON_CHILD) {
-	case FS_MOVED_FROM:
-	case FS_MOVED_TO:
-		au_fset_hnjob(flags[AuHn_CHILD], XINO0);
-		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
-		/*FALLTHROUGH*/
-	case FS_CREATE:
-		AuDebugOn(!h_child_name);
-		break;
-
-	case FS_DELETE:
-		/*
-		 * aufs never be able to get this child inode.
-		 * revalidation should be in d_revalidate()
-		 * by checking i_nlink, i_generation or d_unhashed().
-		 */
-		AuDebugOn(!h_child_name);
-		au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0);
-		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
-		break;
-
-	default:
-		AuDebugOn(1);
-	}
-
-	if (wh)
-		h_child_inode = NULL;
-
-	err = -ENOMEM;
-	/* iput() and kfree() will be called in au_hnotify() */
-	args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS);
-	if (unlikely(!args)) {
-		AuErr1("no memory\n");
-		iput(dir);
-		goto out;
-	}
-	args->flags[AuHn_PARENT] = flags[AuHn_PARENT];
-	args->flags[AuHn_CHILD] = flags[AuHn_CHILD];
-	args->mask = mask;
-	args->dir = dir;
-	args->h_dir = igrab(h_dir);
-	if (h_child_inode)
-		h_child_inode = igrab(h_child_inode); /* can be NULL */
-	args->h_child_inode = h_child_inode;
-	args->h_child_nlen = len;
-	if (len) {
-		p = (void *)args;
-		p += sizeof(*args);
-		memcpy(p, h_child_name, len);
-		p[len] = 0;
-	}
-
-	/* NFS fires the event for silly-renamed one from kworker */
-	f = 0;
-	if (!dir->i_nlink
-	    || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE)))
-		f = AuWkq_NEST;
-	err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f);
-	if (unlikely(err)) {
-		pr_err("wkq %d\n", err);
-		iput(args->h_child_inode);
-		iput(args->h_dir);
-		iput(args->dir);
-		kfree(args);
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm)
-{
-	int err;
-
-	AuDebugOn(!(udba & AuOptMask_UDBA));
-
-	err = 0;
-	if (au_hnotify_op.reset_br)
-		err = au_hnotify_op.reset_br(udba, br, perm);
-
-	return err;
-}
-
-int au_hnotify_init_br(struct au_branch *br, int perm)
-{
-	int err;
-
-	err = 0;
-	if (au_hnotify_op.init_br)
-		err = au_hnotify_op.init_br(br, perm);
-
-	return err;
-}
-
-void au_hnotify_fin_br(struct au_branch *br)
-{
-	if (au_hnotify_op.fin_br)
-		au_hnotify_op.fin_br(br);
-}
-
-static void au_hn_destroy_cache(void)
-{
-	kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]);
-	au_cachep[AuCache_HNOTIFY] = NULL;
-}
-
-int __init au_hnotify_init(void)
-{
-	int err;
-
-	err = -ENOMEM;
-	au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify);
-	if (au_cachep[AuCache_HNOTIFY]) {
-		err = 0;
-		if (au_hnotify_op.init)
-			err = au_hnotify_op.init();
-		if (unlikely(err))
-			au_hn_destroy_cache();
-	}
-	AuTraceErr(err);
-	return err;
-}
-
-void au_hnotify_fin(void)
-{
-	if (au_hnotify_op.fin)
-		au_hnotify_op.fin();
-	/* cf. au_cache_fin() */
-	if (au_cachep[AuCache_HNOTIFY])
-		au_hn_destroy_cache();
-}
diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c
deleted file mode 100644
index 17364e4f1..000000000
--- a/fs/aufs/i_op.c
+++ /dev/null
@@ -1,1447 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode operations (except add/del/rename)
- */
-
-#include <linux/device_cgroup.h>
-#include <linux/fs_stack.h>
-#include <linux/mm.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include "aufs.h"
-
-static int h_permission(struct inode *h_inode, int mask,
-			struct vfsmount *h_mnt, int brperm)
-{
-	int err;
-	const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
-
-	err = -EACCES;
-	if ((write_mask && IS_IMMUTABLE(h_inode))
-	    || ((mask & MAY_EXEC)
-		&& S_ISREG(h_inode->i_mode)
-		&& ((h_mnt->mnt_flags & MNT_NOEXEC)
-		    || !(h_inode->i_mode & S_IXUGO))))
-		goto out;
-
-	/*
-	 * - skip the lower fs test in the case of write to ro branch.
-	 * - nfs dir permission write check is optimized, but a policy for
-	 *   link/rename requires a real check.
-	 * - nfs always sets MS_POSIXACL regardless its mount option 'noacl.'
-	 *   in this case, generic_permission() returns -EOPNOTSUPP.
-	 */
-	if ((write_mask && !au_br_writable(brperm))
-	    || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode)
-		&& write_mask && !(mask & MAY_READ))
-	    || !h_inode->i_op->permission) {
-		/* AuLabel(generic_permission); */
-		/* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */
-		err = generic_permission(h_inode, mask);
-		if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode))
-			err = h_inode->i_op->permission(h_inode, mask);
-		AuTraceErr(err);
-	} else {
-		/* AuLabel(h_inode->permission); */
-		err = h_inode->i_op->permission(h_inode, mask);
-		AuTraceErr(err);
-	}
-
-	if (!err)
-		err = devcgroup_inode_permission(h_inode, mask);
-	if (!err)
-		err = security_inode_permission(h_inode, mask);
-
-#if 0
-	if (!err) {
-		/* todo: do we need to call ima_path_check()? */
-		struct path h_path = {
-			.dentry	=
-			.mnt	= h_mnt
-		};
-		err = ima_path_check(&h_path,
-				     mask & (MAY_READ | MAY_WRITE | MAY_EXEC),
-				     IMA_COUNT_LEAVE);
-	}
-#endif
-
-out:
-	return err;
-}
-
-static int aufs_permission(struct inode *inode, int mask)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	const unsigned char isdir = !!S_ISDIR(inode->i_mode),
-		write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
-	struct inode *h_inode;
-	struct super_block *sb;
-	struct au_branch *br;
-
-	/* todo: support rcu-walk? */
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	ii_read_lock_child(inode);
-#if 0
-	err = au_iigen_test(inode, au_sigen(sb));
-	if (unlikely(err))
-		goto out;
-#endif
-
-	if (!isdir
-	    || write_mask
-	    || au_opt_test(au_mntflags(sb), DIRPERM1)) {
-		err = au_busy_or_stale();
-		h_inode = au_h_iptr(inode, au_ibstart(inode));
-		if (unlikely(!h_inode
-			     || (h_inode->i_mode & S_IFMT)
-			     != (inode->i_mode & S_IFMT)))
-			goto out;
-
-		err = 0;
-		bindex = au_ibstart(inode);
-		br = au_sbr(sb, bindex);
-		err = h_permission(h_inode, mask, au_br_mnt(br), br->br_perm);
-		if (write_mask
-		    && !err
-		    && !special_file(h_inode->i_mode)) {
-			/* test whether the upper writable branch exists */
-			err = -EROFS;
-			for (; bindex >= 0; bindex--)
-				if (!au_br_rdonly(au_sbr(sb, bindex))) {
-					err = 0;
-					break;
-				}
-		}
-		goto out;
-	}
-
-	/* non-write to dir */
-	err = 0;
-	bend = au_ibend(inode);
-	for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) {
-		h_inode = au_h_iptr(inode, bindex);
-		if (h_inode) {
-			err = au_busy_or_stale();
-			if (unlikely(!S_ISDIR(h_inode->i_mode)))
-				break;
-
-			br = au_sbr(sb, bindex);
-			err = h_permission(h_inode, mask, au_br_mnt(br),
-					   br->br_perm);
-		}
-	}
-
-out:
-	ii_read_unlock(inode);
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry,
-				  unsigned int flags)
-{
-	struct dentry *ret, *parent;
-	struct inode *inode;
-	struct super_block *sb;
-	int err, npositive;
-
-	IMustLock(dir);
-
-	/* todo: support rcu-walk? */
-	ret = ERR_PTR(-ECHILD);
-	if (flags & LOOKUP_RCU)
-		goto out;
-
-	ret = ERR_PTR(-ENAMETOOLONG);
-	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
-		goto out;
-
-	sb = dir->i_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	ret = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	err = au_di_init(dentry);
-	ret = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_si;
-
-	inode = NULL;
-	npositive = 0; /* suppress a warning */
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_read_lock_parent(parent, AuLock_IR);
-	err = au_alive_dir(parent);
-	if (!err)
-		err = au_digen_test(parent, au_sigen(sb));
-	if (!err) {
-		npositive = au_lkup_dentry(dentry, au_dbstart(parent),
-					   /*type*/0);
-		err = npositive;
-	}
-	di_read_unlock(parent, AuLock_IR);
-	ret = ERR_PTR(err);
-	if (unlikely(err < 0))
-		goto out_unlock;
-
-	if (npositive) {
-		inode = au_new_inode(dentry, /*must_new*/0);
-		if (IS_ERR(inode)) {
-			ret = (void *)inode;
-			inode = NULL;
-			goto out_unlock;
-		}
-	}
-
-	if (inode)
-		atomic_inc(&inode->i_count);
-	ret = d_splice_alias(inode, dentry);
-#if 0
-	if (unlikely(d_need_lookup(dentry))) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
-		spin_unlock(&dentry->d_lock);
-	} else
-#endif
-	if (inode) {
-		if (!IS_ERR(ret)) {
-			iput(inode);
-			if (ret && ret != dentry)
-				ii_write_unlock(inode);
-		} else {
-			ii_write_unlock(inode);
-			iput(inode);
-			inode = NULL;
-		}
-	}
-
-out_unlock:
-	di_write_unlock(dentry);
-	if (inode) {
-		/* verbose coding for lock class name */
-		if (unlikely(S_ISLNK(inode->i_mode)))
-			au_rw_class(&au_di(dentry)->di_rwsem,
-				    au_lc_key + AuLcSymlink_DIINFO);
-		else if (unlikely(S_ISDIR(inode->i_mode)))
-			au_rw_class(&au_di(dentry)->di_rwsem,
-				    au_lc_key + AuLcDir_DIINFO);
-		else /* likely */
-			au_rw_class(&au_di(dentry)->di_rwsem,
-				    au_lc_key + AuLcNonDir_DIINFO);
-	}
-out_si:
-	si_read_unlock(sb);
-out:
-	return ret;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct aopen_node {
-	struct hlist_node hlist;
-	struct file *file, *h_file;
-};
-
-static int au_do_aopen(struct inode *inode, struct file *file)
-{
-	struct au_sphlhead *aopen;
-	struct aopen_node *node;
-	struct au_do_open_args args = {
-		.no_lock	= 1,
-		.open		= au_do_open_nondir
-	};
-
-	aopen = &au_sbi(inode->i_sb)->si_aopen;
-	spin_lock(&aopen->spin);
-	hlist_for_each_entry(node, &aopen->head, hlist)
-		if (node->file == file) {
-			args.h_file = node->h_file;
-			break;
-		}
-	spin_unlock(&aopen->spin);
-	/* AuDebugOn(!args.h_file); */
-
-	return au_do_open(file, &args);
-}
-
-static int aufs_atomic_open(struct inode *dir, struct dentry *dentry,
-			    struct file *file, unsigned int open_flag,
-			    umode_t create_mode, int *opened)
-{
-	int err, h_opened = *opened;
-	struct dentry *parent;
-	struct dentry *d;
-	struct au_sphlhead *aopen;
-	struct vfsub_aopen_args args = {
-		.open_flag	= open_flag,
-		.create_mode	= create_mode,
-		.opened		= &h_opened
-	};
-	struct aopen_node aopen_node = {
-		.file	= file
-	};
-
-	IMustLock(dir);
-	AuDbg("open_flag 0x%x\n", open_flag);
-	AuDbgDentry(dentry);
-
-	err = 0;
-	if (!au_di(dentry)) {
-		d = aufs_lookup(dir, dentry, /*flags*/0);
-		if (IS_ERR(d)) {
-			err = PTR_ERR(d);
-			goto out;
-		} else if (d) {
-			/*
-			 * obsoleted dentry found.
-			 * another error will be returned later.
-			 */
-			d_drop(d);
-			dput(d);
-			AuDbgDentry(d);
-		}
-		AuDbgDentry(dentry);
-	}
-
-	if (d_is_positive(dentry)
-	    || d_unhashed(dentry)
-	    || d_unlinked(dentry)
-	    || !(open_flag & O_CREAT))
-		goto out_no_open;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
-	if (unlikely(err))
-		goto out;
-
-	parent = dentry->d_parent;	/* dir is locked */
-	di_write_lock_parent(parent);
-	err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0);
-	if (unlikely(err))
-		goto out_unlock;
-
-	AuDbgDentry(dentry);
-	if (d_is_positive(dentry))
-		goto out_unlock;
-
-	args.file = get_empty_filp();
-	err = PTR_ERR(args.file);
-	if (IS_ERR(args.file))
-		goto out_unlock;
-
-	args.file->f_flags = file->f_flags;
-	err = au_aopen_or_create(dir, dentry, &args);
-	AuTraceErr(err);
-	AuDbgFile(args.file);
-	if (unlikely(err < 0)) {
-		if (h_opened & FILE_OPENED)
-			fput(args.file);
-		else
-			put_filp(args.file);
-		goto out_unlock;
-	}
-
-	/* some filesystems don't set FILE_CREATED while succeeded? */
-	*opened |= FILE_CREATED;
-	if (h_opened & FILE_OPENED)
-		aopen_node.h_file = args.file;
-	else {
-		put_filp(args.file);
-		args.file = NULL;
-	}
-	aopen = &au_sbi(dir->i_sb)->si_aopen;
-	au_sphl_add(&aopen_node.hlist, aopen);
-	err = finish_open(file, dentry, au_do_aopen, opened);
-	au_sphl_del(&aopen_node.hlist, aopen);
-	AuTraceErr(err);
-	AuDbgFile(file);
-	if (aopen_node.h_file)
-		fput(aopen_node.h_file);
-
-out_unlock:
-	di_write_unlock(parent);
-	aufs_read_unlock(dentry, AuLock_DW);
-	AuDbgDentry(dentry);
-	if (unlikely(err))
-		goto out;
-out_no_open:
-	if (!err && !(*opened & FILE_CREATED)) {
-		AuLabel(out_no_open);
-		dget(dentry);
-		err = finish_no_open(file, dentry);
-	}
-out:
-	AuDbg("%pd%s%s\n", dentry,
-	      (*opened & FILE_CREATED) ? " created" : "",
-	      (*opened & FILE_OPENED) ? " opened" : "");
-	AuTraceErr(err);
-	return err;
-}
-
-
-/* ---------------------------------------------------------------------- */
-
-static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent,
-			  const unsigned char add_entry, aufs_bindex_t bcpup,
-			  aufs_bindex_t bstart)
-{
-	int err;
-	struct dentry *h_parent;
-	struct inode *h_dir;
-
-	if (add_entry)
-		IMustLock(d_inode(parent));
-	else
-		di_write_lock_parent(parent);
-
-	err = 0;
-	if (!au_h_dptr(parent, bcpup)) {
-		if (bstart > bcpup)
-			err = au_cpup_dirs(dentry, bcpup);
-		else if (bstart < bcpup)
-			err = au_cpdown_dirs(dentry, bcpup);
-		else
-			BUG();
-	}
-	if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) {
-		h_parent = au_h_dptr(parent, bcpup);
-		h_dir = d_inode(h_parent);
-		mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
-		err = au_lkup_neg(dentry, bcpup, /*wh*/0);
-		/* todo: no unlock here */
-		mutex_unlock(&h_dir->i_mutex);
-
-		AuDbg("bcpup %d\n", bcpup);
-		if (!err) {
-			if (d_really_is_negative(dentry))
-				au_set_h_dptr(dentry, bstart, NULL);
-			au_update_dbrange(dentry, /*do_put_zero*/0);
-		}
-	}
-
-	if (!add_entry)
-		di_write_unlock(parent);
-	if (!err)
-		err = bcpup; /* success */
-
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * decide the branch and the parent dir where we will create a new entry.
- * returns new bindex or an error.
- * copyup the parent dir if needed.
- */
-int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
-	      struct au_wr_dir_args *args)
-{
-	int err;
-	unsigned int flags;
-	aufs_bindex_t bcpup, bstart, src_bstart;
-	const unsigned char add_entry
-		= au_ftest_wrdir(args->flags, ADD_ENTRY)
-		| au_ftest_wrdir(args->flags, TMPFILE);
-	struct super_block *sb;
-	struct dentry *parent;
-	struct au_sbinfo *sbinfo;
-
-	sb = dentry->d_sb;
-	sbinfo = au_sbi(sb);
-	parent = dget_parent(dentry);
-	bstart = au_dbstart(dentry);
-	bcpup = bstart;
-	if (args->force_btgt < 0) {
-		if (src_dentry) {
-			src_bstart = au_dbstart(src_dentry);
-			if (src_bstart < bstart)
-				bcpup = src_bstart;
-		} else if (add_entry) {
-			flags = 0;
-			if (au_ftest_wrdir(args->flags, ISDIR))
-				au_fset_wbr(flags, DIR);
-			err = AuWbrCreate(sbinfo, dentry, flags);
-			bcpup = err;
-		}
-
-		if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) {
-			if (add_entry)
-				err = AuWbrCopyup(sbinfo, dentry);
-			else {
-				if (!IS_ROOT(dentry)) {
-					di_read_lock_parent(parent, !AuLock_IR);
-					err = AuWbrCopyup(sbinfo, dentry);
-					di_read_unlock(parent, !AuLock_IR);
-				} else
-					err = AuWbrCopyup(sbinfo, dentry);
-			}
-			bcpup = err;
-			if (unlikely(err < 0))
-				goto out;
-		}
-	} else {
-		bcpup = args->force_btgt;
-		AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry)));
-	}
-
-	AuDbg("bstart %d, bcpup %d\n", bstart, bcpup);
-	err = bcpup;
-	if (bcpup == bstart)
-		goto out; /* success */
-
-	/* copyup the new parent into the branch we process */
-	err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart);
-	if (err >= 0) {
-		if (d_really_is_negative(dentry)) {
-			au_set_h_dptr(dentry, bstart, NULL);
-			au_set_dbstart(dentry, bcpup);
-			au_set_dbend(dentry, bcpup);
-		}
-		AuDebugOn(add_entry
-			  && !au_ftest_wrdir(args->flags, TMPFILE)
-			  && !au_h_dptr(dentry, bcpup));
-	}
-
-out:
-	dput(parent);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_pin_hdir_unlock(struct au_pin *p)
-{
-	if (p->hdir)
-		au_hn_imtx_unlock(p->hdir);
-}
-
-int au_pin_hdir_lock(struct au_pin *p)
-{
-	int err;
-
-	err = 0;
-	if (!p->hdir)
-		goto out;
-
-	/* even if an error happens later, keep this lock */
-	au_hn_imtx_lock_nested(p->hdir, p->lsc_hi);
-
-	err = -EBUSY;
-	if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent)))
-		goto out;
-
-	err = 0;
-	if (p->h_dentry)
-		err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode,
-				  p->h_parent, p->br);
-
-out:
-	return err;
-}
-
-int au_pin_hdir_relock(struct au_pin *p)
-{
-	int err, i;
-	struct inode *h_i;
-	struct dentry *h_d[] = {
-		p->h_dentry,
-		p->h_parent
-	};
-
-	err = au_pin_hdir_lock(p);
-	if (unlikely(err))
-		goto out;
-
-	for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) {
-		if (!h_d[i])
-			continue;
-		if (d_is_positive(h_d[i])) {
-			h_i = d_inode(h_d[i]);
-			err = !h_i->i_nlink;
-		}
-	}
-
-out:
-	return err;
-}
-
-void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task)
-{
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
-	p->hdir->hi_inode->i_mutex.owner = task;
-#endif
-}
-
-void au_pin_hdir_acquire_nest(struct au_pin *p)
-{
-	if (p->hdir) {
-		mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map,
-				   p->lsc_hi, 0, NULL, _RET_IP_);
-		au_pin_hdir_set_owner(p, current);
-	}
-}
-
-void au_pin_hdir_release(struct au_pin *p)
-{
-	if (p->hdir) {
-		au_pin_hdir_set_owner(p, p->task);
-		mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_);
-	}
-}
-
-struct dentry *au_pinned_h_parent(struct au_pin *pin)
-{
-	if (pin && pin->parent)
-		return au_h_dptr(pin->parent, pin->bindex);
-	return NULL;
-}
-
-void au_unpin(struct au_pin *p)
-{
-	if (p->hdir)
-		au_pin_hdir_unlock(p);
-	if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE))
-		vfsub_mnt_drop_write(p->h_mnt);
-	if (!p->hdir)
-		return;
-
-	if (!au_ftest_pin(p->flags, DI_LOCKED))
-		di_read_unlock(p->parent, AuLock_IR);
-	iput(p->hdir->hi_inode);
-	dput(p->parent);
-	p->parent = NULL;
-	p->hdir = NULL;
-	p->h_mnt = NULL;
-	/* do not clear p->task */
-}
-
-int au_do_pin(struct au_pin *p)
-{
-	int err;
-	struct super_block *sb;
-	struct inode *h_dir;
-
-	err = 0;
-	sb = p->dentry->d_sb;
-	p->br = au_sbr(sb, p->bindex);
-	if (IS_ROOT(p->dentry)) {
-		if (au_ftest_pin(p->flags, MNT_WRITE)) {
-			p->h_mnt = au_br_mnt(p->br);
-			err = vfsub_mnt_want_write(p->h_mnt);
-			if (unlikely(err)) {
-				au_fclr_pin(p->flags, MNT_WRITE);
-				goto out_err;
-			}
-		}
-		goto out;
-	}
-
-	p->h_dentry = NULL;
-	if (p->bindex <= au_dbend(p->dentry))
-		p->h_dentry = au_h_dptr(p->dentry, p->bindex);
-
-	p->parent = dget_parent(p->dentry);
-	if (!au_ftest_pin(p->flags, DI_LOCKED))
-		di_read_lock(p->parent, AuLock_IR, p->lsc_di);
-
-	h_dir = NULL;
-	p->h_parent = au_h_dptr(p->parent, p->bindex);
-	p->hdir = au_hi(d_inode(p->parent), p->bindex);
-	if (p->hdir)
-		h_dir = p->hdir->hi_inode;
-
-	/*
-	 * udba case, or
-	 * if DI_LOCKED is not set, then p->parent may be different
-	 * and h_parent can be NULL.
-	 */
-	if (unlikely(!p->hdir || !h_dir || !p->h_parent)) {
-		err = -EBUSY;
-		if (!au_ftest_pin(p->flags, DI_LOCKED))
-			di_read_unlock(p->parent, AuLock_IR);
-		dput(p->parent);
-		p->parent = NULL;
-		goto out_err;
-	}
-
-	if (au_ftest_pin(p->flags, MNT_WRITE)) {
-		p->h_mnt = au_br_mnt(p->br);
-		err = vfsub_mnt_want_write(p->h_mnt);
-		if (unlikely(err)) {
-			au_fclr_pin(p->flags, MNT_WRITE);
-			if (!au_ftest_pin(p->flags, DI_LOCKED))
-				di_read_unlock(p->parent, AuLock_IR);
-			dput(p->parent);
-			p->parent = NULL;
-			goto out_err;
-		}
-	}
-
-	au_igrab(h_dir);
-	err = au_pin_hdir_lock(p);
-	if (!err)
-		goto out; /* success */
-
-	au_unpin(p);
-
-out_err:
-	pr_err("err %d\n", err);
-	err = au_busy_or_stale();
-out:
-	return err;
-}
-
-void au_pin_init(struct au_pin *p, struct dentry *dentry,
-		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
-		 unsigned int udba, unsigned char flags)
-{
-	p->dentry = dentry;
-	p->udba = udba;
-	p->lsc_di = lsc_di;
-	p->lsc_hi = lsc_hi;
-	p->flags = flags;
-	p->bindex = bindex;
-
-	p->parent = NULL;
-	p->hdir = NULL;
-	p->h_mnt = NULL;
-
-	p->h_dentry = NULL;
-	p->h_parent = NULL;
-	p->br = NULL;
-	p->task = current;
-}
-
-int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
-	   unsigned int udba, unsigned char flags)
-{
-	au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2,
-		    udba, flags);
-	return au_do_pin(pin);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * ->setattr() and ->getattr() are called in various cases.
- * chmod, stat: dentry is revalidated.
- * fchmod, fstat: file and dentry are not revalidated, additionally they may be
- *		  unhashed.
- * for ->setattr(), ia->ia_file is passed from ftruncate only.
- */
-/* todo: consolidate with do_refresh() and simple_reval_dpath() */
-int au_reval_for_attr(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-	struct dentry *parent;
-
-	err = 0;
-	if (au_digen_test(dentry, sigen)) {
-		parent = dget_parent(dentry);
-		di_read_lock_parent(parent, AuLock_IR);
-		err = au_refresh_dentry(dentry, parent);
-		di_read_unlock(parent, AuLock_IR);
-		dput(parent);
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
-		     struct au_icpup_args *a)
-{
-	int err;
-	loff_t sz;
-	aufs_bindex_t bstart, ibstart;
-	struct dentry *hi_wh, *parent;
-	struct inode *inode;
-	struct au_wr_dir_args wr_dir_args = {
-		.force_btgt	= -1,
-		.flags		= 0
-	};
-
-	if (d_is_dir(dentry))
-		au_fset_wrdir(wr_dir_args.flags, ISDIR);
-	/* plink or hi_wh() case */
-	bstart = au_dbstart(dentry);
-	inode = d_inode(dentry);
-	ibstart = au_ibstart(inode);
-	if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode))
-		wr_dir_args.force_btgt = ibstart;
-	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
-	if (unlikely(err < 0))
-		goto out;
-	a->btgt = err;
-	if (err != bstart)
-		au_fset_icpup(a->flags, DID_CPUP);
-
-	err = 0;
-	a->pin_flags = AuPin_MNT_WRITE;
-	parent = NULL;
-	if (!IS_ROOT(dentry)) {
-		au_fset_pin(a->pin_flags, DI_LOCKED);
-		parent = dget_parent(dentry);
-		di_write_lock_parent(parent);
-	}
-
-	err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags);
-	if (unlikely(err))
-		goto out_parent;
-
-	a->h_path.dentry = au_h_dptr(dentry, bstart);
-	sz = -1;
-	a->h_inode = d_inode(a->h_path.dentry);
-	if (ia && (ia->ia_valid & ATTR_SIZE)) {
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-		if (ia->ia_size < i_size_read(a->h_inode))
-			sz = ia->ia_size;
-		mutex_unlock(&a->h_inode->i_mutex);
-	}
-
-	hi_wh = NULL;
-	if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) {
-		hi_wh = au_hi_wh(inode, a->btgt);
-		if (!hi_wh) {
-			struct au_cp_generic cpg = {
-				.dentry	= dentry,
-				.bdst	= a->btgt,
-				.bsrc	= -1,
-				.len	= sz,
-				.pin	= &a->pin
-			};
-			err = au_sio_cpup_wh(&cpg, /*file*/NULL);
-			if (unlikely(err))
-				goto out_unlock;
-			hi_wh = au_hi_wh(inode, a->btgt);
-			/* todo: revalidate hi_wh? */
-		}
-	}
-
-	if (parent) {
-		au_pin_set_parent_lflag(&a->pin, /*lflag*/0);
-		di_downgrade_lock(parent, AuLock_IR);
-		dput(parent);
-		parent = NULL;
-	}
-	if (!au_ftest_icpup(a->flags, DID_CPUP))
-		goto out; /* success */
-
-	if (!d_unhashed(dentry)) {
-		struct au_cp_generic cpg = {
-			.dentry	= dentry,
-			.bdst	= a->btgt,
-			.bsrc	= bstart,
-			.len	= sz,
-			.pin	= &a->pin,
-			.flags	= AuCpup_DTIME | AuCpup_HOPEN
-		};
-		err = au_sio_cpup_simple(&cpg);
-		if (!err)
-			a->h_path.dentry = au_h_dptr(dentry, a->btgt);
-	} else if (!hi_wh)
-		a->h_path.dentry = au_h_dptr(dentry, a->btgt);
-	else
-		a->h_path.dentry = hi_wh; /* do not dget here */
-
-out_unlock:
-	a->h_inode = d_inode(a->h_path.dentry);
-	if (!err)
-		goto out; /* success */
-	au_unpin(&a->pin);
-out_parent:
-	if (parent) {
-		di_write_unlock(parent);
-		dput(parent);
-	}
-out:
-	if (!err)
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-	return err;
-}
-
-static int aufs_setattr(struct dentry *dentry, struct iattr *ia)
-{
-	int err;
-	struct inode *inode, *delegated;
-	struct super_block *sb;
-	struct file *file;
-	struct au_icpup_args *a;
-
-	inode = d_inode(dentry);
-	IMustLock(inode);
-
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
-		ia->ia_valid &= ~ATTR_MODE;
-
-	file = NULL;
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_kfree;
-
-	if (ia->ia_valid & ATTR_FILE) {
-		/* currently ftruncate(2) only */
-		AuDebugOn(!d_is_reg(dentry));
-		file = ia->ia_file;
-		err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
-		if (unlikely(err))
-			goto out_si;
-		ia->ia_file = au_hf_top(file);
-		a->udba = AuOpt_UDBA_NONE;
-	} else {
-		/* fchmod() doesn't pass ia_file */
-		a->udba = au_opt_udba(sb);
-		di_write_lock_child(dentry);
-		/* no d_unlinked(), to set UDBA_NONE for root */
-		if (d_unhashed(dentry))
-			a->udba = AuOpt_UDBA_NONE;
-		if (a->udba != AuOpt_UDBA_NONE) {
-			AuDebugOn(IS_ROOT(dentry));
-			err = au_reval_for_attr(dentry, au_sigen(sb));
-			if (unlikely(err))
-				goto out_dentry;
-		}
-	}
-
-	err = au_pin_and_icpup(dentry, ia, a);
-	if (unlikely(err < 0))
-		goto out_dentry;
-	if (au_ftest_icpup(a->flags, DID_CPUP)) {
-		ia->ia_file = NULL;
-		ia->ia_valid &= ~ATTR_FILE;
-	}
-
-	a->h_path.mnt = au_sbr_mnt(sb, a->btgt);
-	if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME))
-	    == (ATTR_MODE | ATTR_CTIME)) {
-		err = security_path_chmod(&a->h_path, ia->ia_mode);
-		if (unlikely(err))
-			goto out_unlock;
-	} else if ((ia->ia_valid & (ATTR_UID | ATTR_GID))
-		   && (ia->ia_valid & ATTR_CTIME)) {
-		err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid);
-		if (unlikely(err))
-			goto out_unlock;
-	}
-
-	if (ia->ia_valid & ATTR_SIZE) {
-		struct file *f;
-
-		if (ia->ia_size < i_size_read(inode))
-			/* unmap only */
-			truncate_setsize(inode, ia->ia_size);
-
-		f = NULL;
-		if (ia->ia_valid & ATTR_FILE)
-			f = ia->ia_file;
-		mutex_unlock(&a->h_inode->i_mutex);
-		err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f);
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-	} else {
-		delegated = NULL;
-		while (1) {
-			err = vfsub_notify_change(&a->h_path, ia, &delegated);
-			if (delegated) {
-				err = break_deleg_wait(&delegated);
-				if (!err)
-					continue;
-			}
-			break;
-		}
-	}
-	if (!err)
-		au_cpup_attr_changeable(inode);
-
-out_unlock:
-	mutex_unlock(&a->h_inode->i_mutex);
-	au_unpin(&a->pin);
-	if (unlikely(err))
-		au_update_dbstart(dentry);
-out_dentry:
-	di_write_unlock(dentry);
-	if (file) {
-		fi_write_unlock(file);
-		ia->ia_file = file;
-		ia->ia_valid |= ATTR_FILE;
-	}
-out_si:
-	si_read_unlock(sb);
-out_kfree:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
-static int au_h_path_to_set_attr(struct dentry *dentry,
-				 struct au_icpup_args *a, struct path *h_path)
-{
-	int err;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	a->udba = au_opt_udba(sb);
-	/* no d_unlinked(), to set UDBA_NONE for root */
-	if (d_unhashed(dentry))
-		a->udba = AuOpt_UDBA_NONE;
-	if (a->udba != AuOpt_UDBA_NONE) {
-		AuDebugOn(IS_ROOT(dentry));
-		err = au_reval_for_attr(dentry, au_sigen(sb));
-		if (unlikely(err))
-			goto out;
-	}
-	err = au_pin_and_icpup(dentry, /*ia*/NULL, a);
-	if (unlikely(err < 0))
-		goto out;
-
-	h_path->dentry = a->h_path.dentry;
-	h_path->mnt = au_sbr_mnt(sb, a->btgt);
-
-out:
-	return err;
-}
-
-ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg)
-{
-	int err;
-	struct path h_path;
-	struct super_block *sb;
-	struct au_icpup_args *a;
-	struct inode *inode, *h_inode;
-
-	inode = d_inode(dentry);
-	IMustLock(inode);
-
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_kfree;
-
-	h_path.dentry = NULL;	/* silence gcc */
-	di_write_lock_child(dentry);
-	err = au_h_path_to_set_attr(dentry, a, &h_path);
-	if (unlikely(err))
-		goto out_di;
-
-	mutex_unlock(&a->h_inode->i_mutex);
-	switch (arg->type) {
-	case AU_XATTR_SET:
-		err = vfsub_setxattr(h_path.dentry,
-				     arg->u.set.name, arg->u.set.value,
-				     arg->u.set.size, arg->u.set.flags);
-		break;
-	case AU_XATTR_REMOVE:
-		err = vfsub_removexattr(h_path.dentry, arg->u.remove.name);
-		break;
-	case AU_ACL_SET:
-		err = -EOPNOTSUPP;
-		h_inode = d_inode(h_path.dentry);
-		if (h_inode->i_op->set_acl)
-			err = h_inode->i_op->set_acl(h_inode,
-						     arg->u.acl_set.acl,
-						     arg->u.acl_set.type);
-		break;
-	}
-	if (!err)
-		au_cpup_attr_timesizes(inode);
-
-	au_unpin(&a->pin);
-	if (unlikely(err))
-		au_update_dbstart(dentry);
-
-out_di:
-	di_write_unlock(dentry);
-	si_read_unlock(sb);
-out_kfree:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-#endif
-
-static void au_refresh_iattr(struct inode *inode, struct kstat *st,
-			     unsigned int nlink)
-{
-	unsigned int n;
-
-	inode->i_mode = st->mode;
-	/* don't i_[ug]id_write() here */
-	inode->i_uid = st->uid;
-	inode->i_gid = st->gid;
-	inode->i_atime = st->atime;
-	inode->i_mtime = st->mtime;
-	inode->i_ctime = st->ctime;
-
-	au_cpup_attr_nlink(inode, /*force*/0);
-	if (S_ISDIR(inode->i_mode)) {
-		n = inode->i_nlink;
-		n -= nlink;
-		n += st->nlink;
-		smp_mb(); /* for i_nlink */
-		/* 0 can happen */
-		set_nlink(inode, n);
-	}
-
-	spin_lock(&inode->i_lock);
-	inode->i_blocks = st->blocks;
-	i_size_write(inode, st->size);
-	spin_unlock(&inode->i_lock);
-}
-
-/*
- * common routine for aufs_getattr() and aufs_getxattr().
- * returns zero or negative (an error).
- * @dentry will be read-locked in success.
- */
-int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path)
-{
-	int err;
-	unsigned int mnt_flags, sigen;
-	unsigned char udba_none;
-	aufs_bindex_t bindex;
-	struct super_block *sb, *h_sb;
-	struct inode *inode;
-
-	h_path->mnt = NULL;
-	h_path->dentry = NULL;
-
-	err = 0;
-	sb = dentry->d_sb;
-	mnt_flags = au_mntflags(sb);
-	udba_none = !!au_opt_test(mnt_flags, UDBA_NONE);
-
-	/* support fstat(2) */
-	if (!d_unlinked(dentry) && !udba_none) {
-		sigen = au_sigen(sb);
-		err = au_digen_test(dentry, sigen);
-		if (!err) {
-			di_read_lock_child(dentry, AuLock_IR);
-			err = au_dbrange_test(dentry);
-			if (unlikely(err)) {
-				di_read_unlock(dentry, AuLock_IR);
-				goto out;
-			}
-		} else {
-			AuDebugOn(IS_ROOT(dentry));
-			di_write_lock_child(dentry);
-			err = au_dbrange_test(dentry);
-			if (!err)
-				err = au_reval_for_attr(dentry, sigen);
-			if (!err)
-				di_downgrade_lock(dentry, AuLock_IR);
-			else {
-				di_write_unlock(dentry);
-				goto out;
-			}
-		}
-	} else
-		di_read_lock_child(dentry, AuLock_IR);
-
-	inode = d_inode(dentry);
-	bindex = au_ibstart(inode);
-	h_path->mnt = au_sbr_mnt(sb, bindex);
-	h_sb = h_path->mnt->mnt_sb;
-	if (!force
-	    && !au_test_fs_bad_iattr(h_sb)
-	    && udba_none)
-		goto out; /* success */
-
-	if (au_dbstart(dentry) == bindex)
-		h_path->dentry = au_h_dptr(dentry, bindex);
-	else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) {
-		h_path->dentry = au_plink_lkup(inode, bindex);
-		if (IS_ERR(h_path->dentry))
-			/* pretending success */
-			h_path->dentry = NULL;
-		else
-			dput(h_path->dentry);
-	}
-
-out:
-	return err;
-}
-
-static int aufs_getattr(struct vfsmount *mnt __maybe_unused,
-			struct dentry *dentry, struct kstat *st)
-{
-	int err;
-	unsigned char positive;
-	struct path h_path;
-	struct inode *inode;
-	struct super_block *sb;
-
-	inode = d_inode(dentry);
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out;
-	err = au_h_path_getattr(dentry, /*force*/0, &h_path);
-	if (unlikely(err))
-		goto out_si;
-	if (unlikely(!h_path.dentry))
-		/* illegally overlapped or something */
-		goto out_fill; /* pretending success */
-
-	positive = d_is_positive(h_path.dentry);
-	if (positive)
-		err = vfs_getattr(&h_path, st);
-	if (!err) {
-		if (positive)
-			au_refresh_iattr(inode, st,
-					 d_inode(h_path.dentry)->i_nlink);
-		goto out_fill; /* success */
-	}
-	AuTraceErr(err);
-	goto out_di;
-
-out_fill:
-	generic_fillattr(inode, st);
-out_di:
-	di_read_unlock(dentry, AuLock_IR);
-out_si:
-	si_read_unlock(sb);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int h_readlink(struct dentry *dentry, int bindex, char __user *buf,
-		      int bufsiz)
-{
-	int err;
-	struct super_block *sb;
-	struct dentry *h_dentry;
-	struct inode *inode, *h_inode;
-
-	err = -EINVAL;
-	h_dentry = au_h_dptr(dentry, bindex);
-	h_inode = d_inode(h_dentry);
-	if (unlikely(!h_inode->i_op->readlink))
-		goto out;
-
-	err = security_inode_readlink(h_dentry);
-	if (unlikely(err))
-		goto out;
-
-	sb = dentry->d_sb;
-	inode = d_inode(dentry);
-	if (!au_test_ro(sb, bindex, inode)) {
-		vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry);
-		fsstack_copy_attr_atime(inode, h_inode);
-	}
-	err = h_inode->i_op->readlink(h_dentry, buf, bufsiz);
-
-out:
-	return err;
-}
-
-static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
-{
-	int err;
-
-	err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN);
-	if (unlikely(err))
-		goto out;
-	err = au_d_hashed_positive(dentry);
-	if (!err)
-		err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz);
-	aufs_read_unlock(dentry, AuLock_IR);
-
-out:
-	return err;
-}
-
-static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	int err;
-	mm_segment_t old_fs;
-	union {
-		char *k;
-		char __user *u;
-	} buf;
-
-	err = -ENOMEM;
-	buf.k = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!buf.k))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN);
-	if (unlikely(err))
-		goto out_name;
-
-	err = au_d_hashed_positive(dentry);
-	if (!err) {
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		err = h_readlink(dentry, au_dbstart(dentry), buf.u, PATH_MAX);
-		set_fs(old_fs);
-	}
-	aufs_read_unlock(dentry, AuLock_IR);
-
-	if (err >= 0) {
-		buf.k[err] = 0;
-		/* will be freed by put_link */
-		nd_set_link(nd, buf.k);
-		return NULL; /* success */
-	}
-
-out_name:
-	free_page((unsigned long)buf.k);
-out:
-	AuTraceErr(err);
-	return ERR_PTR(err);
-}
-
-static void aufs_put_link(struct dentry *dentry __maybe_unused,
-			  struct nameidata *nd, void *cookie __maybe_unused)
-{
-	char *p;
-
-	p = nd_get_link(nd);
-	if (!IS_ERR_OR_NULL(p))
-		free_page((unsigned long)p);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags)
-{
-	int err;
-	struct super_block *sb;
-	struct inode *h_inode;
-
-	sb = inode->i_sb;
-	/* mmap_sem might be acquired already, cf. aufs_mmap() */
-	lockdep_off();
-	si_read_lock(sb, AuLock_FLUSH);
-	ii_write_lock_child(inode);
-	lockdep_on();
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	err = vfsub_update_time(h_inode, ts, flags);
-	lockdep_off();
-	if (!err)
-		au_cpup_attr_timesizes(inode);
-	ii_write_unlock(inode);
-	si_read_unlock(sb);
-	lockdep_on();
-
-	if (!err && (flags & S_VERSION))
-		inode_inc_iversion(inode);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct inode_operations aufs_symlink_iop = {
-	.permission	= aufs_permission,
-#ifdef CONFIG_FS_POSIX_ACL
-	.get_acl	= aufs_get_acl,
-	.set_acl	= aufs_set_acl, /* unsupport for symlink? */
-#endif
-
-	.setattr	= aufs_setattr,
-	.getattr	= aufs_getattr,
-
-#ifdef CONFIG_AUFS_XATTR
-	.setxattr	= aufs_setxattr,
-	.getxattr	= aufs_getxattr,
-	.listxattr	= aufs_listxattr,
-	.removexattr	= aufs_removexattr,
-#endif
-
-	.readlink	= aufs_readlink,
-	.follow_link	= aufs_follow_link,
-	.put_link	= aufs_put_link,
-
-	/* .update_time	= aufs_update_time */
-};
-
-struct inode_operations aufs_dir_iop = {
-	.create		= aufs_create,
-	.lookup		= aufs_lookup,
-	.link		= aufs_link,
-	.unlink		= aufs_unlink,
-	.symlink	= aufs_symlink,
-	.mkdir		= aufs_mkdir,
-	.rmdir		= aufs_rmdir,
-	.mknod		= aufs_mknod,
-	.rename		= aufs_rename,
-
-	.permission	= aufs_permission,
-#ifdef CONFIG_FS_POSIX_ACL
-	.get_acl	= aufs_get_acl,
-	.set_acl	= aufs_set_acl,
-#endif
-
-	.setattr	= aufs_setattr,
-	.getattr	= aufs_getattr,
-
-#ifdef CONFIG_AUFS_XATTR
-	.setxattr	= aufs_setxattr,
-	.getxattr	= aufs_getxattr,
-	.listxattr	= aufs_listxattr,
-	.removexattr	= aufs_removexattr,
-#endif
-
-	.update_time	= aufs_update_time,
-	.atomic_open	= aufs_atomic_open,
-	.tmpfile	= aufs_tmpfile
-};
-
-struct inode_operations aufs_iop = {
-	.permission	= aufs_permission,
-#ifdef CONFIG_FS_POSIX_ACL
-	.get_acl	= aufs_get_acl,
-	.set_acl	= aufs_set_acl,
-#endif
-
-	.setattr	= aufs_setattr,
-	.getattr	= aufs_getattr,
-
-#ifdef CONFIG_AUFS_XATTR
-	.setxattr	= aufs_setxattr,
-	.getxattr	= aufs_getxattr,
-	.listxattr	= aufs_listxattr,
-	.removexattr	= aufs_removexattr,
-#endif
-
-	.update_time	= aufs_update_time
-};
diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c
deleted file mode 100644
index 209b26864..000000000
--- a/fs/aufs/i_op_add.c
+++ /dev/null
@@ -1,932 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode operations (add entry)
- */
-
-#include "aufs.h"
-
-/*
- * final procedure of adding a new entry, except link(2).
- * remove whiteout, instantiate, copyup the parent dir's times and size
- * and update version.
- * if it failed, re-create the removed whiteout.
- */
-static int epilog(struct inode *dir, aufs_bindex_t bindex,
-		  struct dentry *wh_dentry, struct dentry *dentry)
-{
-	int err, rerr;
-	aufs_bindex_t bwh;
-	struct path h_path;
-	struct super_block *sb;
-	struct inode *inode, *h_dir;
-	struct dentry *wh;
-
-	bwh = -1;
-	sb = dir->i_sb;
-	if (wh_dentry) {
-		h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
-		IMustLock(h_dir);
-		AuDebugOn(au_h_iptr(dir, bindex) != h_dir);
-		bwh = au_dbwh(dentry);
-		h_path.dentry = wh_dentry;
-		h_path.mnt = au_sbr_mnt(sb, bindex);
-		err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path,
-					  dentry);
-		if (unlikely(err))
-			goto out;
-	}
-
-	inode = au_new_inode(dentry, /*must_new*/1);
-	if (!IS_ERR(inode)) {
-		d_instantiate(dentry, inode);
-		dir = d_inode(dentry->d_parent); /* dir inode is locked */
-		IMustLock(dir);
-		au_dir_ts(dir, bindex);
-		dir->i_version++;
-		au_fhsm_wrote(sb, bindex, /*force*/0);
-		return 0; /* success */
-	}
-
-	err = PTR_ERR(inode);
-	if (!wh_dentry)
-		goto out;
-
-	/* revert */
-	/* dir inode is locked */
-	wh = au_wh_create(dentry, bwh, wh_dentry->d_parent);
-	rerr = PTR_ERR(wh);
-	if (IS_ERR(wh)) {
-		AuIOErr("%pd reverting whiteout failed(%d, %d)\n",
-			dentry, err, rerr);
-		err = -EIO;
-	} else
-		dput(wh);
-
-out:
-	return err;
-}
-
-static int au_d_may_add(struct dentry *dentry)
-{
-	int err;
-
-	err = 0;
-	if (unlikely(d_unhashed(dentry)))
-		err = -ENOENT;
-	if (unlikely(d_really_is_positive(dentry)))
-		err = -EEXIST;
-	return err;
-}
-
-/*
- * simple tests for the adding inode operations.
- * following the checks in vfs, plus the parent-child relationship.
- */
-int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir)
-{
-	int err;
-	umode_t h_mode;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-
-	err = -ENAMETOOLONG;
-	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
-		goto out;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (d_really_is_negative(dentry)) {
-		err = -EEXIST;
-		if (unlikely(d_is_positive(h_dentry)))
-			goto out;
-	} else {
-		/* rename(2) case */
-		err = -EIO;
-		if (unlikely(d_is_negative(h_dentry)))
-			goto out;
-		h_inode = d_inode(h_dentry);
-		if (unlikely(!h_inode->i_nlink))
-			goto out;
-
-		h_mode = h_inode->i_mode;
-		if (!isdir) {
-			err = -EISDIR;
-			if (unlikely(S_ISDIR(h_mode)))
-				goto out;
-		} else if (unlikely(!S_ISDIR(h_mode))) {
-			err = -ENOTDIR;
-			goto out;
-		}
-	}
-
-	err = 0;
-	/* expected parent dir is locked */
-	if (unlikely(h_parent != h_dentry->d_parent))
-		err = -EIO;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * initial procedure of adding a new entry.
- * prepare writable branch and the parent dir, lock it,
- * and lookup whiteout for the new entry.
- */
-static struct dentry*
-lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt,
-		  struct dentry *src_dentry, struct au_pin *pin,
-		  struct au_wr_dir_args *wr_dir_args)
-{
-	struct dentry *wh_dentry, *h_parent;
-	struct super_block *sb;
-	struct au_branch *br;
-	int err;
-	unsigned int udba;
-	aufs_bindex_t bcpup;
-
-	AuDbg("%pd\n", dentry);
-
-	err = au_wr_dir(dentry, src_dentry, wr_dir_args);
-	bcpup = err;
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err < 0))
-		goto out;
-
-	sb = dentry->d_sb;
-	udba = au_opt_udba(sb);
-	err = au_pin(pin, dentry, bcpup, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	h_parent = au_pinned_h_parent(pin);
-	if (udba != AuOpt_UDBA_NONE
-	    && au_dbstart(dentry) == bcpup)
-		err = au_may_add(dentry, bcpup, h_parent,
-				 au_ftest_wrdir(wr_dir_args->flags, ISDIR));
-	else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
-		err = -ENAMETOOLONG;
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_unpin;
-
-	br = au_sbr(sb, bcpup);
-	if (dt) {
-		struct path tmp = {
-			.dentry	= h_parent,
-			.mnt	= au_br_mnt(br)
-		};
-		au_dtime_store(dt, au_pinned_parent(pin), &tmp);
-	}
-
-	wh_dentry = NULL;
-	if (bcpup != au_dbwh(dentry))
-		goto out; /* success */
-
-	/*
-	 * ENAMETOOLONG here means that if we allowed create such name, then it
-	 * would not be able to removed in the future. So we don't allow such
-	 * name here and we don't handle ENAMETOOLONG differently here.
-	 */
-	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
-
-out_unpin:
-	if (IS_ERR(wh_dentry))
-		au_unpin(pin);
-out:
-	return wh_dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-enum { Mknod, Symlink, Creat };
-struct simple_arg {
-	int type;
-	union {
-		struct {
-			umode_t			mode;
-			bool			want_excl;
-			bool			try_aopen;
-			struct vfsub_aopen_args	*aopen;
-		} c;
-		struct {
-			const char *symname;
-		} s;
-		struct {
-			umode_t mode;
-			dev_t dev;
-		} m;
-	} u;
-};
-
-static int add_simple(struct inode *dir, struct dentry *dentry,
-		      struct simple_arg *arg)
-{
-	int err, rerr;
-	aufs_bindex_t bstart;
-	unsigned char created;
-	const unsigned char try_aopen
-		= (arg->type == Creat && arg->u.c.try_aopen);
-	struct dentry *wh_dentry, *parent;
-	struct inode *h_dir;
-	struct super_block *sb;
-	struct au_branch *br;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct au_pin pin;
-		struct path h_path;
-		struct au_wr_dir_args wr_dir_args;
-	} *a;
-
-	AuDbg("%pd\n", dentry);
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-	a->wr_dir_args.force_btgt = -1;
-	a->wr_dir_args.flags = AuWrDir_ADD_ENTRY;
-
-	parent = dentry->d_parent; /* dir inode is locked */
-	if (!try_aopen) {
-		err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
-		if (unlikely(err))
-			goto out_free;
-	}
-	err = au_d_may_add(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	if (!try_aopen)
-		di_write_lock_parent(parent);
-	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
-				      &a->pin, &a->wr_dir_args);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	bstart = au_dbstart(dentry);
-	sb = dentry->d_sb;
-	br = au_sbr(sb, bstart);
-	a->h_path.dentry = au_h_dptr(dentry, bstart);
-	a->h_path.mnt = au_br_mnt(br);
-	h_dir = au_pinned_h_dir(&a->pin);
-	switch (arg->type) {
-	case Creat:
-		err = 0;
-		if (!try_aopen || !h_dir->i_op->atomic_open)
-			err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode,
-					   arg->u.c.want_excl);
-		else
-			err = vfsub_atomic_open(h_dir, a->h_path.dentry,
-						arg->u.c.aopen, br);
-		break;
-	case Symlink:
-		err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname);
-		break;
-	case Mknod:
-		err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode,
-				  arg->u.m.dev);
-		break;
-	default:
-		BUG();
-	}
-	created = !err;
-	if (!err)
-		err = epilog(dir, bstart, wh_dentry, dentry);
-
-	/* revert */
-	if (unlikely(created && err && d_is_positive(a->h_path.dentry))) {
-		/* no delegation since it is just created */
-		rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL,
-				    /*force*/0);
-		if (rerr) {
-			AuIOErr("%pd revert failure(%d, %d)\n",
-				dentry, err, rerr);
-			err = -EIO;
-		}
-		au_dtime_revert(&a->dt);
-	}
-
-	if (!err && try_aopen && !h_dir->i_op->atomic_open)
-		*arg->u.c.aopen->opened |= FILE_CREATED;
-
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-
-out_parent:
-	if (!try_aopen)
-		di_write_unlock(parent);
-out_unlock:
-	if (unlikely(err)) {
-		au_update_dbstart(dentry);
-		d_drop(dentry);
-	}
-	if (!try_aopen)
-		aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	return err;
-}
-
-int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-	       dev_t dev)
-{
-	struct simple_arg arg = {
-		.type = Mknod,
-		.u.m = {
-			.mode	= mode,
-			.dev	= dev
-		}
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-	struct simple_arg arg = {
-		.type = Symlink,
-		.u.s.symname = symname
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool want_excl)
-{
-	struct simple_arg arg = {
-		.type = Creat,
-		.u.c = {
-			.mode		= mode,
-			.want_excl	= want_excl
-		}
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
-		       struct vfsub_aopen_args *aopen_args)
-{
-	struct simple_arg arg = {
-		.type = Creat,
-		.u.c = {
-			.mode		= aopen_args->create_mode,
-			.want_excl	= aopen_args->open_flag & O_EXCL,
-			.try_aopen	= true,
-			.aopen		= aopen_args
-		}
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct super_block *sb;
-	struct dentry *parent, *h_parent, *h_dentry;
-	struct inode *h_dir, *inode;
-	struct vfsmount *h_mnt;
-	struct au_wr_dir_args wr_dir_args = {
-		.force_btgt	= -1,
-		.flags		= AuWrDir_TMPFILE
-	};
-
-	/* copy-up may happen */
-	mutex_lock(&dir->i_mutex);
-
-	sb = dir->i_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out;
-
-	err = au_di_init(dentry);
-	if (unlikely(err))
-		goto out_si;
-
-	err = -EBUSY;
-	parent = d_find_any_alias(dir);
-	AuDebugOn(!parent);
-	di_write_lock_parent(parent);
-	if (unlikely(d_inode(parent) != dir))
-		goto out_parent;
-
-	err = au_digen_test(parent, au_sigen(sb));
-	if (unlikely(err))
-		goto out_parent;
-
-	bindex = au_dbstart(parent);
-	au_set_dbstart(dentry, bindex);
-	au_set_dbend(dentry, bindex);
-	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
-	bindex = err;
-	if (unlikely(err < 0))
-		goto out_parent;
-
-	err = -EOPNOTSUPP;
-	h_dir = au_h_iptr(dir, bindex);
-	if (unlikely(!h_dir->i_op->tmpfile))
-		goto out_parent;
-
-	h_mnt = au_sbr_mnt(sb, bindex);
-	err = vfsub_mnt_want_write(h_mnt);
-	if (unlikely(err))
-		goto out_parent;
-
-	h_parent = au_h_dptr(parent, bindex);
-	err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC);
-	if (unlikely(err))
-		goto out_mnt;
-
-	err = -ENOMEM;
-	h_dentry = d_alloc(h_parent, &dentry->d_name);
-	if (unlikely(!h_dentry))
-		goto out_mnt;
-
-	err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode);
-	if (unlikely(err))
-		goto out_dentry;
-
-	au_set_dbstart(dentry, bindex);
-	au_set_dbend(dentry, bindex);
-	au_set_h_dptr(dentry, bindex, dget(h_dentry));
-	inode = au_new_inode(dentry, /*must_new*/1);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		au_set_h_dptr(dentry, bindex, NULL);
-		au_set_dbstart(dentry, -1);
-		au_set_dbend(dentry, -1);
-	} else {
-		if (!inode->i_nlink)
-			set_nlink(inode, 1);
-		d_tmpfile(dentry, inode);
-		au_di(dentry)->di_tmpfile = 1;
-
-		/* update without i_mutex */
-		if (au_ibstart(dir) == au_dbstart(dentry))
-			au_cpup_attr_timesizes(dir);
-	}
-
-out_dentry:
-	dput(h_dentry);
-out_mnt:
-	vfsub_mnt_drop_write(h_mnt);
-out_parent:
-	di_write_unlock(parent);
-	dput(parent);
-	di_write_unlock(dentry);
-	if (!err)
-#if 0
-		/* verbose coding for lock class name */
-		au_rw_class(&au_di(dentry)->di_rwsem,
-			    au_lc_key + AuLcNonDir_DIINFO);
-#else
-		;
-#endif
-	else {
-		au_di_fin(dentry);
-		dentry->d_fsdata = NULL;
-	}
-out_si:
-	si_read_unlock(sb);
-out:
-	mutex_unlock(&dir->i_mutex);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_link_args {
-	aufs_bindex_t bdst, bsrc;
-	struct au_pin pin;
-	struct path h_path;
-	struct dentry *src_parent, *parent;
-};
-
-static int au_cpup_before_link(struct dentry *src_dentry,
-			       struct au_link_args *a)
-{
-	int err;
-	struct dentry *h_src_dentry;
-	struct au_cp_generic cpg = {
-		.dentry	= src_dentry,
-		.bdst	= a->bdst,
-		.bsrc	= a->bsrc,
-		.len	= -1,
-		.pin	= &a->pin,
-		.flags	= AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */
-	};
-
-	di_read_lock_parent(a->src_parent, AuLock_IR);
-	err = au_test_and_cpup_dirs(src_dentry, a->bdst);
-	if (unlikely(err))
-		goto out;
-
-	h_src_dentry = au_h_dptr(src_dentry, a->bsrc);
-	err = au_pin(&a->pin, src_dentry, a->bdst,
-		     au_opt_udba(src_dentry->d_sb),
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out;
-
-	err = au_sio_cpup_simple(&cpg);
-	au_unpin(&a->pin);
-
-out:
-	di_read_unlock(a->src_parent, AuLock_IR);
-	return err;
-}
-
-static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry,
-			   struct au_link_args *a)
-{
-	int err;
-	unsigned char plink;
-	aufs_bindex_t bend;
-	struct dentry *h_src_dentry;
-	struct inode *h_inode, *inode, *delegated;
-	struct super_block *sb;
-	struct file *h_file;
-
-	plink = 0;
-	h_inode = NULL;
-	sb = src_dentry->d_sb;
-	inode = d_inode(src_dentry);
-	if (au_ibstart(inode) <= a->bdst)
-		h_inode = au_h_iptr(inode, a->bdst);
-	if (!h_inode || !h_inode->i_nlink) {
-		/* copyup src_dentry as the name of dentry. */
-		bend = au_dbend(dentry);
-		if (bend < a->bsrc)
-			au_set_dbend(dentry, a->bsrc);
-		au_set_h_dptr(dentry, a->bsrc,
-			      dget(au_h_dptr(src_dentry, a->bsrc)));
-		dget(a->h_path.dentry);
-		au_set_h_dptr(dentry, a->bdst, NULL);
-		AuDbg("temporary d_inode...\n");
-		spin_lock(&dentry->d_lock);
-		dentry->d_inode = d_inode(src_dentry); /* tmp */
-		spin_unlock(&dentry->d_lock);
-		h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0);
-		if (IS_ERR(h_file))
-			err = PTR_ERR(h_file);
-		else {
-			struct au_cp_generic cpg = {
-				.dentry	= dentry,
-				.bdst	= a->bdst,
-				.bsrc	= -1,
-				.len	= -1,
-				.pin	= &a->pin,
-				.flags	= AuCpup_KEEPLINO
-			};
-			err = au_sio_cpup_simple(&cpg);
-			au_h_open_post(dentry, a->bsrc, h_file);
-			if (!err) {
-				dput(a->h_path.dentry);
-				a->h_path.dentry = au_h_dptr(dentry, a->bdst);
-			} else
-				au_set_h_dptr(dentry, a->bdst,
-					      a->h_path.dentry);
-		}
-		spin_lock(&dentry->d_lock);
-		dentry->d_inode = NULL; /* restore */
-		spin_unlock(&dentry->d_lock);
-		AuDbg("temporary d_inode...done\n");
-		au_set_h_dptr(dentry, a->bsrc, NULL);
-		au_set_dbend(dentry, bend);
-	} else {
-		/* the inode of src_dentry already exists on a.bdst branch */
-		h_src_dentry = d_find_alias(h_inode);
-		if (!h_src_dentry && au_plink_test(inode)) {
-			plink = 1;
-			h_src_dentry = au_plink_lkup(inode, a->bdst);
-			err = PTR_ERR(h_src_dentry);
-			if (IS_ERR(h_src_dentry))
-				goto out;
-
-			if (unlikely(d_is_negative(h_src_dentry))) {
-				dput(h_src_dentry);
-				h_src_dentry = NULL;
-			}
-
-		}
-		if (h_src_dentry) {
-			delegated = NULL;
-			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
-					 &a->h_path, &delegated);
-			if (unlikely(err == -EWOULDBLOCK)) {
-				pr_warn("cannot retry for NFSv4 delegation"
-					" for an internal link\n");
-				iput(delegated);
-			}
-			dput(h_src_dentry);
-		} else {
-			AuIOErr("no dentry found for hi%lu on b%d\n",
-				h_inode->i_ino, a->bdst);
-			err = -EIO;
-		}
-	}
-
-	if (!err && !plink)
-		au_plink_append(inode, a->bdst, a->h_path.dentry);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int aufs_link(struct dentry *src_dentry, struct inode *dir,
-	      struct dentry *dentry)
-{
-	int err, rerr;
-	struct au_dtime dt;
-	struct au_link_args *a;
-	struct dentry *wh_dentry, *h_src_dentry;
-	struct inode *inode, *delegated;
-	struct super_block *sb;
-	struct au_wr_dir_args wr_dir_args = {
-		/* .force_btgt	= -1, */
-		.flags		= AuWrDir_ADD_ENTRY
-	};
-
-	IMustLock(dir);
-	inode = d_inode(src_dentry);
-	IMustLock(inode);
-
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	a->parent = dentry->d_parent; /* dir inode is locked */
-	err = aufs_read_and_write_lock2(dentry, src_dentry,
-					AuLock_NOPLM | AuLock_GEN);
-	if (unlikely(err))
-		goto out_kfree;
-	err = au_d_linkable(src_dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	err = au_d_may_add(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-
-	a->src_parent = dget_parent(src_dentry);
-	wr_dir_args.force_btgt = au_ibstart(inode);
-
-	di_write_lock_parent(a->parent);
-	wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt);
-	wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin,
-				      &wr_dir_args);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	err = 0;
-	sb = dentry->d_sb;
-	a->bdst = au_dbstart(dentry);
-	a->h_path.dentry = au_h_dptr(dentry, a->bdst);
-	a->h_path.mnt = au_sbr_mnt(sb, a->bdst);
-	a->bsrc = au_ibstart(inode);
-	h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
-	if (!h_src_dentry && au_di(src_dentry)->di_tmpfile)
-		h_src_dentry = dget(au_hi_wh(inode, a->bsrc));
-	if (!h_src_dentry) {
-		a->bsrc = au_dbstart(src_dentry);
-		h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
-		AuDebugOn(!h_src_dentry);
-	} else if (IS_ERR(h_src_dentry)) {
-		err = PTR_ERR(h_src_dentry);
-		goto out_parent;
-	}
-
-	if (au_opt_test(au_mntflags(sb), PLINK)) {
-		if (a->bdst < a->bsrc
-		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */)
-			err = au_cpup_or_link(src_dentry, dentry, a);
-		else {
-			delegated = NULL;
-			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
-					 &a->h_path, &delegated);
-			if (unlikely(err == -EWOULDBLOCK)) {
-				pr_warn("cannot retry for NFSv4 delegation"
-					" for an internal link\n");
-				iput(delegated);
-			}
-		}
-		dput(h_src_dentry);
-	} else {
-		/*
-		 * copyup src_dentry to the branch we process,
-		 * and then link(2) to it.
-		 */
-		dput(h_src_dentry);
-		if (a->bdst < a->bsrc
-		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) {
-			au_unpin(&a->pin);
-			di_write_unlock(a->parent);
-			err = au_cpup_before_link(src_dentry, a);
-			di_write_lock_parent(a->parent);
-			if (!err)
-				err = au_pin(&a->pin, dentry, a->bdst,
-					     au_opt_udba(sb),
-					     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-			if (unlikely(err))
-				goto out_wh;
-		}
-		if (!err) {
-			h_src_dentry = au_h_dptr(src_dentry, a->bdst);
-			err = -ENOENT;
-			if (h_src_dentry && d_is_positive(h_src_dentry)) {
-				delegated = NULL;
-				err = vfsub_link(h_src_dentry,
-						 au_pinned_h_dir(&a->pin),
-						 &a->h_path, &delegated);
-				if (unlikely(err == -EWOULDBLOCK)) {
-					pr_warn("cannot retry"
-						" for NFSv4 delegation"
-						" for an internal link\n");
-					iput(delegated);
-				}
-			}
-		}
-	}
-	if (unlikely(err))
-		goto out_unpin;
-
-	if (wh_dentry) {
-		a->h_path.dentry = wh_dentry;
-		err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path,
-					  dentry);
-		if (unlikely(err))
-			goto out_revert;
-	}
-
-	au_dir_ts(dir, a->bdst);
-	dir->i_version++;
-	inc_nlink(inode);
-	inode->i_ctime = dir->i_ctime;
-	d_instantiate(dentry, au_igrab(inode));
-	if (d_unhashed(a->h_path.dentry))
-		/* some filesystem calls d_drop() */
-		d_drop(dentry);
-	/* some filesystems consume an inode even hardlink */
-	au_fhsm_wrote(sb, a->bdst, /*force*/0);
-	goto out_unpin; /* success */
-
-out_revert:
-	/* no delegation since it is just created */
-	rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path,
-			    /*delegated*/NULL, /*force*/0);
-	if (unlikely(rerr)) {
-		AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr);
-		err = -EIO;
-	}
-	au_dtime_revert(&dt);
-out_unpin:
-	au_unpin(&a->pin);
-out_wh:
-	dput(wh_dentry);
-out_parent:
-	di_write_unlock(a->parent);
-	dput(a->src_parent);
-out_unlock:
-	if (unlikely(err)) {
-		au_update_dbstart(dentry);
-		d_drop(dentry);
-	}
-	aufs_read_and_write_unlock2(dentry, src_dentry);
-out_kfree:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int err, rerr;
-	aufs_bindex_t bindex;
-	unsigned char diropq;
-	struct path h_path;
-	struct dentry *wh_dentry, *parent, *opq_dentry;
-	struct mutex *h_mtx;
-	struct super_block *sb;
-	struct {
-		struct au_pin pin;
-		struct au_dtime dt;
-	} *a; /* reduce the stack usage */
-	struct au_wr_dir_args wr_dir_args = {
-		.force_btgt	= -1,
-		.flags		= AuWrDir_ADD_ENTRY | AuWrDir_ISDIR
-	};
-
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
-	if (unlikely(err))
-		goto out_free;
-	err = au_d_may_add(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_write_lock_parent(parent);
-	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
-				      &a->pin, &wr_dir_args);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	sb = dentry->d_sb;
-	bindex = au_dbstart(dentry);
-	h_path.dentry = au_h_dptr(dentry, bindex);
-	h_path.mnt = au_sbr_mnt(sb, bindex);
-	err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode);
-	if (unlikely(err))
-		goto out_unpin;
-
-	/* make the dir opaque */
-	diropq = 0;
-	h_mtx = &d_inode(h_path.dentry)->i_mutex;
-	if (wh_dentry
-	    || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) {
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		opq_dentry = au_diropq_create(dentry, bindex);
-		mutex_unlock(h_mtx);
-		err = PTR_ERR(opq_dentry);
-		if (IS_ERR(opq_dentry))
-			goto out_dir;
-		dput(opq_dentry);
-		diropq = 1;
-	}
-
-	err = epilog(dir, bindex, wh_dentry, dentry);
-	if (!err) {
-		inc_nlink(dir);
-		goto out_unpin; /* success */
-	}
-
-	/* revert */
-	if (diropq) {
-		AuLabel(revert opq);
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		rerr = au_diropq_remove(dentry, bindex);
-		mutex_unlock(h_mtx);
-		if (rerr) {
-			AuIOErr("%pd reverting diropq failed(%d, %d)\n",
-				dentry, err, rerr);
-			err = -EIO;
-		}
-	}
-
-out_dir:
-	AuLabel(revert dir);
-	rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path);
-	if (rerr) {
-		AuIOErr("%pd reverting dir failed(%d, %d)\n",
-			dentry, err, rerr);
-		err = -EIO;
-	}
-	au_dtime_revert(&a->dt);
-out_unpin:
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-out_parent:
-	di_write_unlock(parent);
-out_unlock:
-	if (unlikely(err)) {
-		au_update_dbstart(dentry);
-		d_drop(dentry);
-	}
-	aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	return err;
-}
diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c
deleted file mode 100644
index 78cf67a52..000000000
--- a/fs/aufs/i_op_del.c
+++ /dev/null
@@ -1,510 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode operations (del entry)
- */
-
-#include "aufs.h"
-
-/*
- * decide if a new whiteout for @dentry is necessary or not.
- * when it is necessary, prepare the parent dir for the upper branch whose
- * branch index is @bcpup for creation. the actual creation of the whiteout will
- * be done by caller.
- * return value:
- * 0: wh is unnecessary
- * plus: wh is necessary
- * minus: error
- */
-int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup)
-{
-	int need_wh, err;
-	aufs_bindex_t bstart;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	bstart = au_dbstart(dentry);
-	if (*bcpup < 0) {
-		*bcpup = bstart;
-		if (au_test_ro(sb, bstart, d_inode(dentry))) {
-			err = AuWbrCopyup(au_sbi(sb), dentry);
-			*bcpup = err;
-			if (unlikely(err < 0))
-				goto out;
-		}
-	} else
-		AuDebugOn(bstart < *bcpup
-			  || au_test_ro(sb, *bcpup, d_inode(dentry)));
-	AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart);
-
-	if (*bcpup != bstart) {
-		err = au_cpup_dirs(dentry, *bcpup);
-		if (unlikely(err))
-			goto out;
-		need_wh = 1;
-	} else {
-		struct au_dinfo *dinfo, *tmp;
-
-		need_wh = -ENOMEM;
-		dinfo = au_di(dentry);
-		tmp = au_di_alloc(sb, AuLsc_DI_TMP);
-		if (tmp) {
-			au_di_cp(tmp, dinfo);
-			au_di_swap(tmp, dinfo);
-			/* returns the number of positive dentries */
-			need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0);
-			au_di_swap(tmp, dinfo);
-			au_rw_write_unlock(&tmp->di_rwsem);
-			au_di_free(tmp);
-		}
-	}
-	AuDbg("need_wh %d\n", need_wh);
-	err = need_wh;
-
-out:
-	return err;
-}
-
-/*
- * simple tests for the del-entry operations.
- * following the checks in vfs, plus the parent-child relationship.
- */
-int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir)
-{
-	int err;
-	umode_t h_mode;
-	struct dentry *h_dentry, *h_latest;
-	struct inode *h_inode;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (d_really_is_positive(dentry)) {
-		err = -ENOENT;
-		if (unlikely(d_is_negative(h_dentry)))
-			goto out;
-		h_inode = d_inode(h_dentry);
-		if (unlikely(!h_inode->i_nlink))
-			goto out;
-
-		h_mode = h_inode->i_mode;
-		if (!isdir) {
-			err = -EISDIR;
-			if (unlikely(S_ISDIR(h_mode)))
-				goto out;
-		} else if (unlikely(!S_ISDIR(h_mode))) {
-			err = -ENOTDIR;
-			goto out;
-		}
-	} else {
-		/* rename(2) case */
-		err = -EIO;
-		if (unlikely(d_is_positive(h_dentry)))
-			goto out;
-	}
-
-	err = -ENOENT;
-	/* expected parent dir is locked */
-	if (unlikely(h_parent != h_dentry->d_parent))
-		goto out;
-	err = 0;
-
-	/*
-	 * rmdir a dir may break the consistency on some filesystem.
-	 * let's try heavy test.
-	 */
-	err = -EACCES;
-	if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)
-		     && au_test_h_perm(d_inode(h_parent),
-				       MAY_EXEC | MAY_WRITE)))
-		goto out;
-
-	h_latest = au_sio_lkup_one(&dentry->d_name, h_parent);
-	err = -EIO;
-	if (IS_ERR(h_latest))
-		goto out;
-	if (h_latest == h_dentry)
-		err = 0;
-	dput(h_latest);
-
-out:
-	return err;
-}
-
-/*
- * decide the branch where we operate for @dentry. the branch index will be set
- * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent
- * dir for reverting.
- * when a new whiteout is necessary, create it.
- */
-static struct dentry*
-lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup,
-		    struct au_dtime *dt, struct au_pin *pin)
-{
-	struct dentry *wh_dentry;
-	struct super_block *sb;
-	struct path h_path;
-	int err, need_wh;
-	unsigned int udba;
-	aufs_bindex_t bcpup;
-
-	need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup);
-	wh_dentry = ERR_PTR(need_wh);
-	if (unlikely(need_wh < 0))
-		goto out;
-
-	sb = dentry->d_sb;
-	udba = au_opt_udba(sb);
-	bcpup = *rbcpup;
-	err = au_pin(pin, dentry, bcpup, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	h_path.dentry = au_pinned_h_parent(pin);
-	if (udba != AuOpt_UDBA_NONE
-	    && au_dbstart(dentry) == bcpup) {
-		err = au_may_del(dentry, bcpup, h_path.dentry, isdir);
-		wh_dentry = ERR_PTR(err);
-		if (unlikely(err))
-			goto out_unpin;
-	}
-
-	h_path.mnt = au_sbr_mnt(sb, bcpup);
-	au_dtime_store(dt, au_pinned_parent(pin), &h_path);
-	wh_dentry = NULL;
-	if (!need_wh)
-		goto out; /* success, no need to create whiteout */
-
-	wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_unpin;
-
-	/* returns with the parent is locked and wh_dentry is dget-ed */
-	goto out; /* success */
-
-out_unpin:
-	au_unpin(pin);
-out:
-	return wh_dentry;
-}
-
-/*
- * when removing a dir, rename it to a unique temporary whiteout-ed name first
- * in order to be revertible and save time for removing many child whiteouts
- * under the dir.
- * returns 1 when there are too many child whiteout and caller should remove
- * them asynchronously. returns 0 when the number of children is enough small to
- * remove now or the branch fs is a remote fs.
- * otherwise return an error.
- */
-static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex,
-			   struct au_nhash *whlist, struct inode *dir)
-{
-	int rmdir_later, err, dirwh;
-	struct dentry *h_dentry;
-	struct super_block *sb;
-	struct inode *inode;
-
-	sb = dentry->d_sb;
-	SiMustAnyLock(sb);
-	h_dentry = au_h_dptr(dentry, bindex);
-	err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex));
-	if (unlikely(err))
-		goto out;
-
-	/* stop monitoring */
-	inode = d_inode(dentry);
-	au_hn_free(au_hi(inode, bindex));
-
-	if (!au_test_fs_remote(h_dentry->d_sb)) {
-		dirwh = au_sbi(sb)->si_dirwh;
-		rmdir_later = (dirwh <= 1);
-		if (!rmdir_later)
-			rmdir_later = au_nhash_test_longer_wh(whlist, bindex,
-							      dirwh);
-		if (rmdir_later)
-			return rmdir_later;
-	}
-
-	err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist);
-	if (unlikely(err)) {
-		AuIOErr("rmdir %pd, b%d failed, %d. ignored\n",
-			h_dentry, bindex, err);
-		err = 0;
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * final procedure for deleting a entry.
- * maintain dentry and iattr.
- */
-static void epilog(struct inode *dir, struct dentry *dentry,
-		   aufs_bindex_t bindex)
-{
-	struct inode *inode;
-
-	inode = d_inode(dentry);
-	d_drop(dentry);
-	inode->i_ctime = dir->i_ctime;
-
-	au_dir_ts(dir, bindex);
-	dir->i_version++;
-}
-
-/*
- * when an error happened, remove the created whiteout and revert everything.
- */
-static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex,
-		     aufs_bindex_t bwh, struct dentry *wh_dentry,
-		     struct dentry *dentry, struct au_dtime *dt)
-{
-	int rerr;
-	struct path h_path = {
-		.dentry	= wh_dentry,
-		.mnt	= au_sbr_mnt(dir->i_sb, bindex)
-	};
-
-	rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry);
-	if (!rerr) {
-		au_set_dbwh(dentry, bwh);
-		au_dtime_revert(dt);
-		return 0;
-	}
-
-	AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr);
-	return -EIO;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int aufs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bwh, bindex, bstart;
-	struct inode *inode, *h_dir, *delegated;
-	struct dentry *parent, *wh_dentry;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct au_pin pin;
-		struct path h_path;
-	} *a;
-
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
-	if (unlikely(err))
-		goto out_free;
-	err = au_d_hashed_positive(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-	err = -EISDIR;
-	if (unlikely(d_is_dir(dentry)))
-		goto out_unlock; /* possible? */
-
-	bstart = au_dbstart(dentry);
-	bwh = au_dbwh(dentry);
-	bindex = -1;
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_write_lock_parent(parent);
-	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt,
-					&a->pin);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	a->h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart);
-	a->h_path.dentry = au_h_dptr(dentry, bstart);
-	dget(a->h_path.dentry);
-	if (bindex == bstart) {
-		h_dir = au_pinned_h_dir(&a->pin);
-		delegated = NULL;
-		err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-	} else {
-		/* dir inode is locked */
-		h_dir = d_inode(wh_dentry->d_parent);
-		IMustLock(h_dir);
-		err = 0;
-	}
-
-	if (!err) {
-		vfsub_drop_nlink(inode);
-		epilog(dir, dentry, bindex);
-
-		/* update target timestamps */
-		if (bindex == bstart) {
-			vfsub_update_h_iattr(&a->h_path, /*did*/NULL);
-			/*ignore*/
-			inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
-		} else
-			/* todo: this timestamp may be reverted later */
-			inode->i_ctime = h_dir->i_ctime;
-		goto out_unpin; /* success */
-	}
-
-	/* revert */
-	if (wh_dentry) {
-		int rerr;
-
-		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
-				 &a->dt);
-		if (rerr)
-			err = rerr;
-	}
-
-out_unpin:
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-	dput(a->h_path.dentry);
-out_parent:
-	di_write_unlock(parent);
-out_unlock:
-	aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	return err;
-}
-
-int aufs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int err, rmdir_later;
-	aufs_bindex_t bwh, bindex, bstart;
-	struct inode *inode;
-	struct dentry *parent, *wh_dentry, *h_dentry;
-	struct au_whtmp_rmdir *args;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct au_pin pin;
-	} *a;
-
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
-	if (unlikely(err))
-		goto out_free;
-	err = au_alive_dir(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-	err = -ENOTDIR;
-	if (unlikely(!d_is_dir(dentry)))
-		goto out_unlock; /* possible? */
-
-	err = -ENOMEM;
-	args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS);
-	if (unlikely(!args))
-		goto out_unlock;
-
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_write_lock_parent(parent);
-	err = au_test_empty(dentry, &args->whlist);
-	if (unlikely(err))
-		goto out_parent;
-
-	bstart = au_dbstart(dentry);
-	bwh = au_dbwh(dentry);
-	bindex = -1;
-	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt,
-					&a->pin);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	h_dentry = au_h_dptr(dentry, bstart);
-	dget(h_dentry);
-	rmdir_later = 0;
-	if (bindex == bstart) {
-		err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir);
-		if (err > 0) {
-			rmdir_later = err;
-			err = 0;
-		}
-	} else {
-		/* stop monitoring */
-		au_hn_free(au_hi(inode, bstart));
-
-		/* dir inode is locked */
-		IMustLock(d_inode(wh_dentry->d_parent));
-		err = 0;
-	}
-
-	if (!err) {
-		vfsub_dead_dir(inode);
-		au_set_dbdiropq(dentry, -1);
-		epilog(dir, dentry, bindex);
-
-		if (rmdir_later) {
-			au_whtmp_kick_rmdir(dir, bstart, h_dentry, args);
-			args = NULL;
-		}
-
-		goto out_unpin; /* success */
-	}
-
-	/* revert */
-	AuLabel(revert);
-	if (wh_dentry) {
-		int rerr;
-
-		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
-				 &a->dt);
-		if (rerr)
-			err = rerr;
-	}
-
-out_unpin:
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-	dput(h_dentry);
-out_parent:
-	di_write_unlock(parent);
-	if (args)
-		au_whtmp_rmdir_free(args);
-out_unlock:
-	aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c
deleted file mode 100644
index 4db778057..000000000
--- a/fs/aufs/i_op_ren.c
+++ /dev/null
@@ -1,1017 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode operation (rename entry)
- * todo: this is crazy monster
- */
-
-#include "aufs.h"
-
-enum { AuSRC, AuDST, AuSrcDst };
-enum { AuPARENT, AuCHILD, AuParentChild };
-
-#define AuRen_ISDIR	1
-#define AuRen_ISSAMEDIR	(1 << 1)
-#define AuRen_WHSRC	(1 << 2)
-#define AuRen_WHDST	(1 << 3)
-#define AuRen_MNT_WRITE	(1 << 4)
-#define AuRen_DT_DSTDIR	(1 << 5)
-#define AuRen_DIROPQ	(1 << 6)
-#define au_ftest_ren(flags, name)	((flags) & AuRen_##name)
-#define au_fset_ren(flags, name) \
-	do { (flags) |= AuRen_##name; } while (0)
-#define au_fclr_ren(flags, name) \
-	do { (flags) &= ~AuRen_##name; } while (0)
-
-struct au_ren_args {
-	struct {
-		struct dentry *dentry, *h_dentry, *parent, *h_parent,
-			*wh_dentry;
-		struct inode *dir, *inode;
-		struct au_hinode *hdir;
-		struct au_dtime dt[AuParentChild];
-		aufs_bindex_t bstart;
-	} sd[AuSrcDst];
-
-#define src_dentry	sd[AuSRC].dentry
-#define src_dir		sd[AuSRC].dir
-#define src_inode	sd[AuSRC].inode
-#define src_h_dentry	sd[AuSRC].h_dentry
-#define src_parent	sd[AuSRC].parent
-#define src_h_parent	sd[AuSRC].h_parent
-#define src_wh_dentry	sd[AuSRC].wh_dentry
-#define src_hdir	sd[AuSRC].hdir
-#define src_h_dir	sd[AuSRC].hdir->hi_inode
-#define src_dt		sd[AuSRC].dt
-#define src_bstart	sd[AuSRC].bstart
-
-#define dst_dentry	sd[AuDST].dentry
-#define dst_dir		sd[AuDST].dir
-#define dst_inode	sd[AuDST].inode
-#define dst_h_dentry	sd[AuDST].h_dentry
-#define dst_parent	sd[AuDST].parent
-#define dst_h_parent	sd[AuDST].h_parent
-#define dst_wh_dentry	sd[AuDST].wh_dentry
-#define dst_hdir	sd[AuDST].hdir
-#define dst_h_dir	sd[AuDST].hdir->hi_inode
-#define dst_dt		sd[AuDST].dt
-#define dst_bstart	sd[AuDST].bstart
-
-	struct dentry *h_trap;
-	struct au_branch *br;
-	struct au_hinode *src_hinode;
-	struct path h_path;
-	struct au_nhash whlist;
-	aufs_bindex_t btgt, src_bwh, src_bdiropq;
-
-	unsigned int flags;
-
-	struct au_whtmp_rmdir *thargs;
-	struct dentry *h_dst;
-};
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * functions for reverting.
- * when an error happened in a single rename systemcall, we should revert
- * everything as if nothing happend.
- * we don't need to revert the copied-up/down the parent dir since they are
- * harmless.
- */
-
-#define RevertFailure(fmt, ...) do { \
-	AuIOErr("revert failure: " fmt " (%d, %d)\n", \
-		##__VA_ARGS__, err, rerr); \
-	err = -EIO; \
-} while (0)
-
-static void au_ren_rev_diropq(int err, struct au_ren_args *a)
-{
-	int rerr;
-
-	au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD);
-	rerr = au_diropq_remove(a->src_dentry, a->btgt);
-	au_hn_imtx_unlock(a->src_hinode);
-	au_set_dbdiropq(a->src_dentry, a->src_bdiropq);
-	if (rerr)
-		RevertFailure("remove diropq %pd", a->src_dentry);
-}
-
-static void au_ren_rev_rename(int err, struct au_ren_args *a)
-{
-	int rerr;
-	struct inode *delegated;
-
-	a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name,
-					  a->src_h_parent);
-	rerr = PTR_ERR(a->h_path.dentry);
-	if (IS_ERR(a->h_path.dentry)) {
-		RevertFailure("lkup one %pd", a->src_dentry);
-		return;
-	}
-
-	delegated = NULL;
-	rerr = vfsub_rename(a->dst_h_dir,
-			    au_h_dptr(a->src_dentry, a->btgt),
-			    a->src_h_dir, &a->h_path, &delegated);
-	if (unlikely(rerr == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal rename\n");
-		iput(delegated);
-	}
-	d_drop(a->h_path.dentry);
-	dput(a->h_path.dentry);
-	/* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */
-	if (rerr)
-		RevertFailure("rename %pd", a->src_dentry);
-}
-
-static void au_ren_rev_whtmp(int err, struct au_ren_args *a)
-{
-	int rerr;
-	struct inode *delegated;
-
-	a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name,
-					  a->dst_h_parent);
-	rerr = PTR_ERR(a->h_path.dentry);
-	if (IS_ERR(a->h_path.dentry)) {
-		RevertFailure("lkup one %pd", a->dst_dentry);
-		return;
-	}
-	if (d_is_positive(a->h_path.dentry)) {
-		d_drop(a->h_path.dentry);
-		dput(a->h_path.dentry);
-		return;
-	}
-
-	delegated = NULL;
-	rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path,
-			    &delegated);
-	if (unlikely(rerr == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal rename\n");
-		iput(delegated);
-	}
-	d_drop(a->h_path.dentry);
-	dput(a->h_path.dentry);
-	if (!rerr)
-		au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst));
-	else
-		RevertFailure("rename %pd", a->h_dst);
-}
-
-static void au_ren_rev_whsrc(int err, struct au_ren_args *a)
-{
-	int rerr;
-
-	a->h_path.dentry = a->src_wh_dentry;
-	rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry);
-	au_set_dbwh(a->src_dentry, a->src_bwh);
-	if (rerr)
-		RevertFailure("unlink %pd", a->src_wh_dentry);
-}
-#undef RevertFailure
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * when we have to copyup the renaming entry, do it with the rename-target name
- * in order to minimize the cost (the later actual rename is unnecessary).
- * otherwise rename it on the target branch.
- */
-static int au_ren_or_cpup(struct au_ren_args *a)
-{
-	int err;
-	struct dentry *d;
-	struct inode *delegated;
-
-	d = a->src_dentry;
-	if (au_dbstart(d) == a->btgt) {
-		a->h_path.dentry = a->dst_h_dentry;
-		if (au_ftest_ren(a->flags, DIROPQ)
-		    && au_dbdiropq(d) == a->btgt)
-			au_fclr_ren(a->flags, DIROPQ);
-		AuDebugOn(au_dbstart(d) != a->btgt);
-		delegated = NULL;
-		err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt),
-				   a->dst_h_dir, &a->h_path, &delegated);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal rename\n");
-			iput(delegated);
-		}
-	} else
-		BUG();
-
-	if (!err && a->h_dst)
-		/* it will be set to dinfo later */
-		dget(a->h_dst);
-
-	return err;
-}
-
-/* cf. aufs_rmdir() */
-static int au_ren_del_whtmp(struct au_ren_args *a)
-{
-	int err;
-	struct inode *dir;
-
-	dir = a->dst_dir;
-	SiMustAnyLock(dir->i_sb);
-	if (!au_nhash_test_longer_wh(&a->whlist, a->btgt,
-				     au_sbi(dir->i_sb)->si_dirwh)
-	    || au_test_fs_remote(a->h_dst->d_sb)) {
-		err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist);
-		if (unlikely(err))
-			pr_warn("failed removing whtmp dir %pd (%d), "
-				"ignored.\n", a->h_dst, err);
-	} else {
-		au_nhash_wh_free(&a->thargs->whlist);
-		a->thargs->whlist = a->whlist;
-		a->whlist.nh_num = 0;
-		au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs);
-		dput(a->h_dst);
-		a->thargs = NULL;
-	}
-
-	return 0;
-}
-
-/* make it 'opaque' dir. */
-static int au_ren_diropq(struct au_ren_args *a)
-{
-	int err;
-	struct dentry *diropq;
-
-	err = 0;
-	a->src_bdiropq = au_dbdiropq(a->src_dentry);
-	a->src_hinode = au_hi(a->src_inode, a->btgt);
-	au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD);
-	diropq = au_diropq_create(a->src_dentry, a->btgt);
-	au_hn_imtx_unlock(a->src_hinode);
-	if (IS_ERR(diropq))
-		err = PTR_ERR(diropq);
-	else
-		dput(diropq);
-
-	return err;
-}
-
-static int do_rename(struct au_ren_args *a)
-{
-	int err;
-	struct dentry *d, *h_d;
-
-	/* prepare workqueue args for asynchronous rmdir */
-	h_d = a->dst_h_dentry;
-	if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) {
-		err = -ENOMEM;
-		a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS);
-		if (unlikely(!a->thargs))
-			goto out;
-		a->h_dst = dget(h_d);
-	}
-
-	/* create whiteout for src_dentry */
-	if (au_ftest_ren(a->flags, WHSRC)) {
-		a->src_bwh = au_dbwh(a->src_dentry);
-		AuDebugOn(a->src_bwh >= 0);
-		a->src_wh_dentry
-			= au_wh_create(a->src_dentry, a->btgt, a->src_h_parent);
-		err = PTR_ERR(a->src_wh_dentry);
-		if (IS_ERR(a->src_wh_dentry))
-			goto out_thargs;
-	}
-
-	/* lookup whiteout for dentry */
-	if (au_ftest_ren(a->flags, WHDST)) {
-		h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name,
-				 a->br);
-		err = PTR_ERR(h_d);
-		if (IS_ERR(h_d))
-			goto out_whsrc;
-		if (d_is_negative(h_d))
-			dput(h_d);
-		else
-			a->dst_wh_dentry = h_d;
-	}
-
-	/* rename dentry to tmpwh */
-	if (a->thargs) {
-		err = au_whtmp_ren(a->dst_h_dentry, a->br);
-		if (unlikely(err))
-			goto out_whdst;
-
-		d = a->dst_dentry;
-		au_set_h_dptr(d, a->btgt, NULL);
-		err = au_lkup_neg(d, a->btgt, /*wh*/0);
-		if (unlikely(err))
-			goto out_whtmp;
-		a->dst_h_dentry = au_h_dptr(d, a->btgt);
-	}
-
-	BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_bstart != a->btgt);
-
-	/* rename by vfs_rename or cpup */
-	d = a->dst_dentry;
-	if (au_ftest_ren(a->flags, ISDIR)
-	    && (a->dst_wh_dentry
-		|| au_dbdiropq(d) == a->btgt
-		/* hide the lower to keep xino */
-		|| a->btgt < au_dbend(d)
-		|| au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ)))
-		au_fset_ren(a->flags, DIROPQ);
-	err = au_ren_or_cpup(a);
-	if (unlikely(err))
-		/* leave the copied-up one */
-		goto out_whtmp;
-
-	/* make dir opaque */
-	if (au_ftest_ren(a->flags, DIROPQ)) {
-		err = au_ren_diropq(a);
-		if (unlikely(err))
-			goto out_rename;
-	}
-
-	/* update target timestamps */
-	AuDebugOn(au_dbstart(a->src_dentry) != a->btgt);
-	a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt);
-	vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/
-	a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
-
-	/* remove whiteout for dentry */
-	if (a->dst_wh_dentry) {
-		a->h_path.dentry = a->dst_wh_dentry;
-		err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path,
-					  a->dst_dentry);
-		if (unlikely(err))
-			goto out_diropq;
-	}
-
-	/* remove whtmp */
-	if (a->thargs)
-		au_ren_del_whtmp(a); /* ignore this error */
-
-	au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0);
-	err = 0;
-	goto out_success;
-
-out_diropq:
-	if (au_ftest_ren(a->flags, DIROPQ))
-		au_ren_rev_diropq(err, a);
-out_rename:
-	au_ren_rev_rename(err, a);
-	dput(a->h_dst);
-out_whtmp:
-	if (a->thargs)
-		au_ren_rev_whtmp(err, a);
-out_whdst:
-	dput(a->dst_wh_dentry);
-	a->dst_wh_dentry = NULL;
-out_whsrc:
-	if (a->src_wh_dentry)
-		au_ren_rev_whsrc(err, a);
-out_success:
-	dput(a->src_wh_dentry);
-	dput(a->dst_wh_dentry);
-out_thargs:
-	if (a->thargs) {
-		dput(a->h_dst);
-		au_whtmp_rmdir_free(a->thargs);
-		a->thargs = NULL;
-	}
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * test if @dentry dir can be rename destination or not.
- * success means, it is a logically empty dir.
- */
-static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist)
-{
-	return au_test_empty(dentry, whlist);
-}
-
-/*
- * test if @dentry dir can be rename source or not.
- * if it can, return 0 and @children is filled.
- * success means,
- * - it is a logically empty dir.
- * - or, it exists on writable branch and has no children including whiteouts
- *       on the lower branch.
- */
-static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt)
-{
-	int err;
-	unsigned int rdhash;
-	aufs_bindex_t bstart;
-
-	bstart = au_dbstart(dentry);
-	if (bstart != btgt) {
-		struct au_nhash whlist;
-
-		SiMustAnyLock(dentry->d_sb);
-		rdhash = au_sbi(dentry->d_sb)->si_rdhash;
-		if (!rdhash)
-			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL,
-							   dentry));
-		err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
-		if (unlikely(err))
-			goto out;
-		err = au_test_empty(dentry, &whlist);
-		au_nhash_wh_free(&whlist);
-		goto out;
-	}
-
-	if (bstart == au_dbtaildir(dentry))
-		return 0; /* success */
-
-	err = au_test_empty_lower(dentry);
-
-out:
-	if (err == -ENOTEMPTY) {
-		AuWarn1("renaming dir who has child(ren) on multiple branches,"
-			" is not supported\n");
-		err = -EXDEV;
-	}
-	return err;
-}
-
-/* side effect: sets whlist and h_dentry */
-static int au_ren_may_dir(struct au_ren_args *a)
-{
-	int err;
-	unsigned int rdhash;
-	struct dentry *d;
-
-	d = a->dst_dentry;
-	SiMustAnyLock(d->d_sb);
-
-	err = 0;
-	if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) {
-		rdhash = au_sbi(d->d_sb)->si_rdhash;
-		if (!rdhash)
-			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d));
-		err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS);
-		if (unlikely(err))
-			goto out;
-
-		au_set_dbstart(d, a->dst_bstart);
-		err = may_rename_dstdir(d, &a->whlist);
-		au_set_dbstart(d, a->btgt);
-	}
-	a->dst_h_dentry = au_h_dptr(d, au_dbstart(d));
-	if (unlikely(err))
-		goto out;
-
-	d = a->src_dentry;
-	a->src_h_dentry = au_h_dptr(d, au_dbstart(d));
-	if (au_ftest_ren(a->flags, ISDIR)) {
-		err = may_rename_srcdir(d, a->btgt);
-		if (unlikely(err)) {
-			au_nhash_wh_free(&a->whlist);
-			a->whlist.nh_num = 0;
-		}
-	}
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * simple tests for rename.
- * following the checks in vfs, plus the parent-child relationship.
- */
-static int au_may_ren(struct au_ren_args *a)
-{
-	int err, isdir;
-	struct inode *h_inode;
-
-	if (a->src_bstart == a->btgt) {
-		err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent,
-				 au_ftest_ren(a->flags, ISDIR));
-		if (unlikely(err))
-			goto out;
-		err = -EINVAL;
-		if (unlikely(a->src_h_dentry == a->h_trap))
-			goto out;
-	}
-
-	err = 0;
-	if (a->dst_bstart != a->btgt)
-		goto out;
-
-	err = -ENOTEMPTY;
-	if (unlikely(a->dst_h_dentry == a->h_trap))
-		goto out;
-
-	err = -EIO;
-	isdir = !!au_ftest_ren(a->flags, ISDIR);
-	if (d_really_is_negative(a->dst_dentry)) {
-		if (d_is_negative(a->dst_h_dentry))
-			err = au_may_add(a->dst_dentry, a->btgt,
-					 a->dst_h_parent, isdir);
-	} else {
-		if (unlikely(d_is_negative(a->dst_h_dentry)))
-			goto out;
-		h_inode = d_inode(a->dst_h_dentry);
-		if (h_inode->i_nlink)
-			err = au_may_del(a->dst_dentry, a->btgt,
-					 a->dst_h_parent, isdir);
-	}
-
-out:
-	if (unlikely(err == -ENOENT || err == -EEXIST))
-		err = -EIO;
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * locking order
- * (VFS)
- * - src_dir and dir by lock_rename()
- * - inode if exitsts
- * (aufs)
- * - lock all
- *   + src_dentry and dentry by aufs_read_and_write_lock2() which calls,
- *     + si_read_lock
- *     + di_write_lock2_child()
- *       + di_write_lock_child()
- *	   + ii_write_lock_child()
- *       + di_write_lock_child2()
- *	   + ii_write_lock_child2()
- *     + src_parent and parent
- *       + di_write_lock_parent()
- *	   + ii_write_lock_parent()
- *       + di_write_lock_parent2()
- *	   + ii_write_lock_parent2()
- *   + lower src_dir and dir by vfsub_lock_rename()
- *   + verify the every relationships between child and parent. if any
- *     of them failed, unlock all and return -EBUSY.
- */
-static void au_ren_unlock(struct au_ren_args *a)
-{
-	vfsub_unlock_rename(a->src_h_parent, a->src_hdir,
-			    a->dst_h_parent, a->dst_hdir);
-	if (au_ftest_ren(a->flags, MNT_WRITE))
-		vfsub_mnt_drop_write(au_br_mnt(a->br));
-}
-
-static int au_ren_lock(struct au_ren_args *a)
-{
-	int err;
-	unsigned int udba;
-
-	err = 0;
-	a->src_h_parent = au_h_dptr(a->src_parent, a->btgt);
-	a->src_hdir = au_hi(a->src_dir, a->btgt);
-	a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt);
-	a->dst_hdir = au_hi(a->dst_dir, a->btgt);
-
-	err = vfsub_mnt_want_write(au_br_mnt(a->br));
-	if (unlikely(err))
-		goto out;
-	au_fset_ren(a->flags, MNT_WRITE);
-	a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir,
-				      a->dst_h_parent, a->dst_hdir);
-	udba = au_opt_udba(a->src_dentry->d_sb);
-	if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent)
-		     || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent)))
-		err = au_busy_or_stale();
-	if (!err && au_dbstart(a->src_dentry) == a->btgt)
-		err = au_h_verify(a->src_h_dentry, udba,
-				  d_inode(a->src_h_parent), a->src_h_parent,
-				  a->br);
-	if (!err && au_dbstart(a->dst_dentry) == a->btgt)
-		err = au_h_verify(a->dst_h_dentry, udba,
-				  d_inode(a->dst_h_parent), a->dst_h_parent,
-				  a->br);
-	if (!err)
-		goto out; /* success */
-
-	err = au_busy_or_stale();
-	au_ren_unlock(a);
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_ren_refresh_dir(struct au_ren_args *a)
-{
-	struct inode *dir;
-
-	dir = a->dst_dir;
-	dir->i_version++;
-	if (au_ftest_ren(a->flags, ISDIR)) {
-		/* is this updating defined in POSIX? */
-		au_cpup_attr_timesizes(a->src_inode);
-		au_cpup_attr_nlink(dir, /*force*/1);
-	}
-
-	au_dir_ts(dir, a->btgt);
-
-	if (au_ftest_ren(a->flags, ISSAMEDIR))
-		return;
-
-	dir = a->src_dir;
-	dir->i_version++;
-	if (au_ftest_ren(a->flags, ISDIR))
-		au_cpup_attr_nlink(dir, /*force*/1);
-	au_dir_ts(dir, a->btgt);
-}
-
-static void au_ren_refresh(struct au_ren_args *a)
-{
-	aufs_bindex_t bend, bindex;
-	struct dentry *d, *h_d;
-	struct inode *i, *h_i;
-	struct super_block *sb;
-
-	d = a->dst_dentry;
-	d_drop(d);
-	if (a->h_dst)
-		/* already dget-ed by au_ren_or_cpup() */
-		au_set_h_dptr(d, a->btgt, a->h_dst);
-
-	i = a->dst_inode;
-	if (i) {
-		if (!au_ftest_ren(a->flags, ISDIR))
-			vfsub_drop_nlink(i);
-		else {
-			vfsub_dead_dir(i);
-			au_cpup_attr_timesizes(i);
-		}
-		au_update_dbrange(d, /*do_put_zero*/1);
-	} else {
-		bend = a->btgt;
-		for (bindex = au_dbstart(d); bindex < bend; bindex++)
-			au_set_h_dptr(d, bindex, NULL);
-		bend = au_dbend(d);
-		for (bindex = a->btgt + 1; bindex <= bend; bindex++)
-			au_set_h_dptr(d, bindex, NULL);
-		au_update_dbrange(d, /*do_put_zero*/0);
-	}
-
-	d = a->src_dentry;
-	au_set_dbwh(d, -1);
-	bend = au_dbend(d);
-	for (bindex = a->btgt + 1; bindex <= bend; bindex++) {
-		h_d = au_h_dptr(d, bindex);
-		if (h_d)
-			au_set_h_dptr(d, bindex, NULL);
-	}
-	au_set_dbend(d, a->btgt);
-
-	sb = d->d_sb;
-	i = a->src_inode;
-	if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i))
-		return; /* success */
-
-	bend = au_ibend(i);
-	for (bindex = a->btgt + 1; bindex <= bend; bindex++) {
-		h_i = au_h_iptr(i, bindex);
-		if (h_i) {
-			au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0);
-			/* ignore this error */
-			au_set_h_iptr(i, bindex, NULL, 0);
-		}
-	}
-	au_set_ibend(i, a->btgt);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* mainly for link(2) and rename(2) */
-int au_wbr(struct dentry *dentry, aufs_bindex_t btgt)
-{
-	aufs_bindex_t bdiropq, bwh;
-	struct dentry *parent;
-	struct au_branch *br;
-
-	parent = dentry->d_parent;
-	IMustLock(d_inode(parent)); /* dir is locked */
-
-	bdiropq = au_dbdiropq(parent);
-	bwh = au_dbwh(dentry);
-	br = au_sbr(dentry->d_sb, btgt);
-	if (au_br_rdonly(br)
-	    || (0 <= bdiropq && bdiropq < btgt)
-	    || (0 <= bwh && bwh < btgt))
-		btgt = -1;
-
-	AuDbg("btgt %d\n", btgt);
-	return btgt;
-}
-
-/* sets src_bstart, dst_bstart and btgt */
-static int au_ren_wbr(struct au_ren_args *a)
-{
-	int err;
-	struct au_wr_dir_args wr_dir_args = {
-		/* .force_btgt	= -1, */
-		.flags		= AuWrDir_ADD_ENTRY
-	};
-
-	a->src_bstart = au_dbstart(a->src_dentry);
-	a->dst_bstart = au_dbstart(a->dst_dentry);
-	if (au_ftest_ren(a->flags, ISDIR))
-		au_fset_wrdir(wr_dir_args.flags, ISDIR);
-	wr_dir_args.force_btgt = a->src_bstart;
-	if (a->dst_inode && a->dst_bstart < a->src_bstart)
-		wr_dir_args.force_btgt = a->dst_bstart;
-	wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt);
-	err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args);
-	a->btgt = err;
-
-	return err;
-}
-
-static void au_ren_dt(struct au_ren_args *a)
-{
-	a->h_path.dentry = a->src_h_parent;
-	au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path);
-	if (!au_ftest_ren(a->flags, ISSAMEDIR)) {
-		a->h_path.dentry = a->dst_h_parent;
-		au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path);
-	}
-
-	au_fclr_ren(a->flags, DT_DSTDIR);
-	if (!au_ftest_ren(a->flags, ISDIR))
-		return;
-
-	a->h_path.dentry = a->src_h_dentry;
-	au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path);
-	if (d_is_positive(a->dst_h_dentry)) {
-		au_fset_ren(a->flags, DT_DSTDIR);
-		a->h_path.dentry = a->dst_h_dentry;
-		au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path);
-	}
-}
-
-static void au_ren_rev_dt(int err, struct au_ren_args *a)
-{
-	struct dentry *h_d;
-	struct mutex *h_mtx;
-
-	au_dtime_revert(a->src_dt + AuPARENT);
-	if (!au_ftest_ren(a->flags, ISSAMEDIR))
-		au_dtime_revert(a->dst_dt + AuPARENT);
-
-	if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) {
-		h_d = a->src_dt[AuCHILD].dt_h_path.dentry;
-		h_mtx = &d_inode(h_d)->i_mutex;
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		au_dtime_revert(a->src_dt + AuCHILD);
-		mutex_unlock(h_mtx);
-
-		if (au_ftest_ren(a->flags, DT_DSTDIR)) {
-			h_d = a->dst_dt[AuCHILD].dt_h_path.dentry;
-			h_mtx = &d_inode(h_d)->i_mutex;
-			mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-			au_dtime_revert(a->dst_dt + AuCHILD);
-			mutex_unlock(h_mtx);
-		}
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry,
-		struct inode *_dst_dir, struct dentry *_dst_dentry)
-{
-	int err, flags;
-	/* reduce stack space */
-	struct au_ren_args *a;
-
-	AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry);
-	IMustLock(_src_dir);
-	IMustLock(_dst_dir);
-
-	err = -ENOMEM;
-	BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE);
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	a->src_dir = _src_dir;
-	a->src_dentry = _src_dentry;
-	a->src_inode = NULL;
-	if (d_really_is_positive(a->src_dentry))
-		a->src_inode = d_inode(a->src_dentry);
-	a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */
-	a->dst_dir = _dst_dir;
-	a->dst_dentry = _dst_dentry;
-	a->dst_inode = NULL;
-	if (d_really_is_positive(a->dst_dentry))
-		a->dst_inode = d_inode(a->dst_dentry);
-	a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */
-	if (a->dst_inode) {
-		IMustLock(a->dst_inode);
-		au_igrab(a->dst_inode);
-	}
-
-	err = -ENOTDIR;
-	flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN;
-	if (d_is_dir(a->src_dentry)) {
-		au_fset_ren(a->flags, ISDIR);
-		if (unlikely(d_really_is_positive(a->dst_dentry)
-			     && !d_is_dir(a->dst_dentry)))
-			goto out_free;
-		err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry,
-						AuLock_DIR | flags);
-	} else
-		err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry,
-						flags);
-	if (unlikely(err))
-		goto out_free;
-
-	err = au_d_hashed_positive(a->src_dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	err = -ENOENT;
-	if (a->dst_inode) {
-		/*
-		 * If it is a dir, VFS unhash dst_dentry before this
-		 * function. It means we cannot rely upon d_unhashed().
-		 */
-		if (unlikely(!a->dst_inode->i_nlink))
-			goto out_unlock;
-		if (!S_ISDIR(a->dst_inode->i_mode)) {
-			err = au_d_hashed_positive(a->dst_dentry);
-			if (unlikely(err))
-				goto out_unlock;
-		} else if (unlikely(IS_DEADDIR(a->dst_inode)))
-			goto out_unlock;
-	} else if (unlikely(d_unhashed(a->dst_dentry)))
-		goto out_unlock;
-
-	/*
-	 * is it possible?
-	 * yes, it happend (in linux-3.3-rcN) but I don't know why.
-	 * there may exist a problem somewhere else.
-	 */
-	err = -EINVAL;
-	if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry)))
-		goto out_unlock;
-
-	au_fset_ren(a->flags, ISSAMEDIR); /* temporary */
-	di_write_lock_parent(a->dst_parent);
-
-	/* which branch we process */
-	err = au_ren_wbr(a);
-	if (unlikely(err < 0))
-		goto out_parent;
-	a->br = au_sbr(a->dst_dentry->d_sb, a->btgt);
-	a->h_path.mnt = au_br_mnt(a->br);
-
-	/* are they available to be renamed */
-	err = au_ren_may_dir(a);
-	if (unlikely(err))
-		goto out_children;
-
-	/* prepare the writable parent dir on the same branch */
-	if (a->dst_bstart == a->btgt) {
-		au_fset_ren(a->flags, WHDST);
-	} else {
-		err = au_cpup_dirs(a->dst_dentry, a->btgt);
-		if (unlikely(err))
-			goto out_children;
-	}
-
-	if (a->src_dir != a->dst_dir) {
-		/*
-		 * this temporary unlock is safe,
-		 * because both dir->i_mutex are locked.
-		 */
-		di_write_unlock(a->dst_parent);
-		di_write_lock_parent(a->src_parent);
-		err = au_wr_dir_need_wh(a->src_dentry,
-					au_ftest_ren(a->flags, ISDIR),
-					&a->btgt);
-		di_write_unlock(a->src_parent);
-		di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1);
-		au_fclr_ren(a->flags, ISSAMEDIR);
-	} else
-		err = au_wr_dir_need_wh(a->src_dentry,
-					au_ftest_ren(a->flags, ISDIR),
-					&a->btgt);
-	if (unlikely(err < 0))
-		goto out_children;
-	if (err)
-		au_fset_ren(a->flags, WHSRC);
-
-	/* cpup src */
-	if (a->src_bstart != a->btgt) {
-		struct au_pin pin;
-
-		err = au_pin(&pin, a->src_dentry, a->btgt,
-			     au_opt_udba(a->src_dentry->d_sb),
-			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-		if (!err) {
-			struct au_cp_generic cpg = {
-				.dentry	= a->src_dentry,
-				.bdst	= a->btgt,
-				.bsrc	= a->src_bstart,
-				.len	= -1,
-				.pin	= &pin,
-				.flags	= AuCpup_DTIME | AuCpup_HOPEN
-			};
-			AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart);
-			err = au_sio_cpup_simple(&cpg);
-			au_unpin(&pin);
-		}
-		if (unlikely(err))
-			goto out_children;
-		a->src_bstart = a->btgt;
-		a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt);
-		au_fset_ren(a->flags, WHSRC);
-	}
-
-	/* lock them all */
-	err = au_ren_lock(a);
-	if (unlikely(err))
-		/* leave the copied-up one */
-		goto out_children;
-
-	if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE))
-		err = au_may_ren(a);
-	else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN))
-		err = -ENAMETOOLONG;
-	if (unlikely(err))
-		goto out_hdir;
-
-	/* store timestamps to be revertible */
-	au_ren_dt(a);
-
-	/* here we go */
-	err = do_rename(a);
-	if (unlikely(err))
-		goto out_dt;
-
-	/* update dir attributes */
-	au_ren_refresh_dir(a);
-
-	/* dput/iput all lower dentries */
-	au_ren_refresh(a);
-
-	goto out_hdir; /* success */
-
-out_dt:
-	au_ren_rev_dt(err, a);
-out_hdir:
-	au_ren_unlock(a);
-out_children:
-	au_nhash_wh_free(&a->whlist);
-	if (err && a->dst_inode && a->dst_bstart != a->btgt) {
-		AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt);
-		au_set_h_dptr(a->dst_dentry, a->btgt, NULL);
-		au_set_dbstart(a->dst_dentry, a->dst_bstart);
-	}
-out_parent:
-	if (!err)
-		d_move(a->src_dentry, a->dst_dentry);
-	else {
-		au_update_dbstart(a->dst_dentry);
-		if (!a->dst_inode)
-			d_drop(a->dst_dentry);
-	}
-	if (au_ftest_ren(a->flags, ISSAMEDIR))
-		di_write_unlock(a->dst_parent);
-	else
-		di_write_unlock2(a->src_parent, a->dst_parent);
-out_unlock:
-	aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry);
-out_free:
-	iput(a->dst_inode);
-	if (a->thargs)
-		au_whtmp_rmdir_free(a->thargs);
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c
deleted file mode 100644
index 0eaa7de7e..000000000
--- a/fs/aufs/iinfo.c
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode private data
- */
-
-#include "aufs.h"
-
-struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex)
-{
-	struct inode *h_inode;
-
-	IiMustAnyLock(inode);
-
-	h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode;
-	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
-	return h_inode;
-}
-
-/* todo: hard/soft set? */
-void au_hiput(struct au_hinode *hinode)
-{
-	au_hn_free(hinode);
-	dput(hinode->hi_whdentry);
-	iput(hinode->hi_inode);
-}
-
-unsigned int au_hi_flags(struct inode *inode, int isdir)
-{
-	unsigned int flags;
-	const unsigned int mnt_flags = au_mntflags(inode->i_sb);
-
-	flags = 0;
-	if (au_opt_test(mnt_flags, XINO))
-		au_fset_hi(flags, XINO);
-	if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY))
-		au_fset_hi(flags, HNOTIFY);
-	return flags;
-}
-
-void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
-		   struct inode *h_inode, unsigned int flags)
-{
-	struct au_hinode *hinode;
-	struct inode *hi;
-	struct au_iinfo *iinfo = au_ii(inode);
-
-	IiMustWriteLock(inode);
-
-	hinode = iinfo->ii_hinode + bindex;
-	hi = hinode->hi_inode;
-	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
-
-	if (hi)
-		au_hiput(hinode);
-	hinode->hi_inode = h_inode;
-	if (h_inode) {
-		int err;
-		struct super_block *sb = inode->i_sb;
-		struct au_branch *br;
-
-		AuDebugOn(inode->i_mode
-			  && (h_inode->i_mode & S_IFMT)
-			  != (inode->i_mode & S_IFMT));
-		if (bindex == iinfo->ii_bstart)
-			au_cpup_igen(inode, h_inode);
-		br = au_sbr(sb, bindex);
-		hinode->hi_id = br->br_id;
-		if (au_ftest_hi(flags, XINO)) {
-			err = au_xino_write(sb, bindex, h_inode->i_ino,
-					    inode->i_ino);
-			if (unlikely(err))
-				AuIOErr1("failed au_xino_write() %d\n", err);
-		}
-
-		if (au_ftest_hi(flags, HNOTIFY)
-		    && au_br_hnotifyable(br->br_perm)) {
-			err = au_hn_alloc(hinode, inode);
-			if (unlikely(err))
-				AuIOErr1("au_hn_alloc() %d\n", err);
-		}
-	}
-}
-
-void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
-		  struct dentry *h_wh)
-{
-	struct au_hinode *hinode;
-
-	IiMustWriteLock(inode);
-
-	hinode = au_ii(inode)->ii_hinode + bindex;
-	AuDebugOn(hinode->hi_whdentry);
-	hinode->hi_whdentry = h_wh;
-}
-
-void au_update_iigen(struct inode *inode, int half)
-{
-	struct au_iinfo *iinfo;
-	struct au_iigen *iigen;
-	unsigned int sigen;
-
-	sigen = au_sigen(inode->i_sb);
-	iinfo = au_ii(inode);
-	iigen = &iinfo->ii_generation;
-	spin_lock(&iinfo->ii_genspin);
-	iigen->ig_generation = sigen;
-	if (half)
-		au_ig_fset(iigen->ig_flags, HALF_REFRESHED);
-	else
-		au_ig_fclr(iigen->ig_flags, HALF_REFRESHED);
-	spin_unlock(&iinfo->ii_genspin);
-}
-
-/* it may be called at remount time, too */
-void au_update_ibrange(struct inode *inode, int do_put_zero)
-{
-	struct au_iinfo *iinfo;
-	aufs_bindex_t bindex, bend;
-
-	iinfo = au_ii(inode);
-	if (!iinfo)
-		return;
-
-	IiMustWriteLock(inode);
-
-	if (do_put_zero && iinfo->ii_bstart >= 0) {
-		for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend;
-		     bindex++) {
-			struct inode *h_i;
-
-			h_i = iinfo->ii_hinode[0 + bindex].hi_inode;
-			if (h_i
-			    && !h_i->i_nlink
-			    && !(h_i->i_state & I_LINKABLE))
-				au_set_h_iptr(inode, bindex, NULL, 0);
-		}
-	}
-
-	iinfo->ii_bstart = -1;
-	iinfo->ii_bend = -1;
-	bend = au_sbend(inode->i_sb);
-	for (bindex = 0; bindex <= bend; bindex++)
-		if (iinfo->ii_hinode[0 + bindex].hi_inode) {
-			iinfo->ii_bstart = bindex;
-			break;
-		}
-	if (iinfo->ii_bstart >= 0)
-		for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--)
-			if (iinfo->ii_hinode[0 + bindex].hi_inode) {
-				iinfo->ii_bend = bindex;
-				break;
-			}
-	AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_icntnr_init_once(void *_c)
-{
-	struct au_icntnr *c = _c;
-	struct au_iinfo *iinfo = &c->iinfo;
-	static struct lock_class_key aufs_ii;
-
-	spin_lock_init(&iinfo->ii_genspin);
-	au_rw_init(&iinfo->ii_rwsem);
-	au_rw_class(&iinfo->ii_rwsem, &aufs_ii);
-	inode_init_once(&c->vfs_inode);
-}
-
-int au_iinfo_init(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	struct super_block *sb;
-	int nbr, i;
-
-	sb = inode->i_sb;
-	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
-	nbr = au_sbend(sb) + 1;
-	if (unlikely(nbr <= 0))
-		nbr = 1;
-	iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS);
-	if (iinfo->ii_hinode) {
-		au_ninodes_inc(sb);
-		for (i = 0; i < nbr; i++)
-			iinfo->ii_hinode[i].hi_id = -1;
-
-		iinfo->ii_generation.ig_generation = au_sigen(sb);
-		iinfo->ii_bstart = -1;
-		iinfo->ii_bend = -1;
-		iinfo->ii_vdir = NULL;
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-int au_ii_realloc(struct au_iinfo *iinfo, int nbr)
-{
-	int err, sz;
-	struct au_hinode *hip;
-
-	AuRwMustWriteLock(&iinfo->ii_rwsem);
-
-	err = -ENOMEM;
-	sz = sizeof(*hip) * (iinfo->ii_bend + 1);
-	if (!sz)
-		sz = sizeof(*hip);
-	hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS);
-	if (hip) {
-		iinfo->ii_hinode = hip;
-		err = 0;
-	}
-
-	return err;
-}
-
-void au_iinfo_fin(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	struct au_hinode *hi;
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend;
-	const unsigned char unlinked = !inode->i_nlink;
-
-	iinfo = au_ii(inode);
-	/* bad_inode case */
-	if (!iinfo)
-		return;
-
-	sb = inode->i_sb;
-	au_ninodes_dec(sb);
-	if (si_pid_test(sb))
-		au_xino_delete_inode(inode, unlinked);
-	else {
-		/*
-		 * it is safe to hide the dependency between sbinfo and
-		 * sb->s_umount.
-		 */
-		lockdep_off();
-		si_noflush_read_lock(sb);
-		au_xino_delete_inode(inode, unlinked);
-		si_read_unlock(sb);
-		lockdep_on();
-	}
-
-	if (iinfo->ii_vdir)
-		au_vdir_free(iinfo->ii_vdir);
-
-	bindex = iinfo->ii_bstart;
-	if (bindex >= 0) {
-		hi = iinfo->ii_hinode + bindex;
-		bend = iinfo->ii_bend;
-		while (bindex++ <= bend) {
-			if (hi->hi_inode)
-				au_hiput(hi);
-			hi++;
-		}
-	}
-	kfree(iinfo->ii_hinode);
-	iinfo->ii_hinode = NULL;
-	AuRwDestroy(&iinfo->ii_rwsem);
-}
diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c
deleted file mode 100644
index 201e112a0..000000000
--- a/fs/aufs/inode.c
+++ /dev/null
@@ -1,500 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode functions
- */
-
-#include "aufs.h"
-
-struct inode *au_igrab(struct inode *inode)
-{
-	if (inode) {
-		AuDebugOn(!atomic_read(&inode->i_count));
-		ihold(inode);
-	}
-	return inode;
-}
-
-static void au_refresh_hinode_attr(struct inode *inode, int do_version)
-{
-	au_cpup_attr_all(inode, /*force*/0);
-	au_update_iigen(inode, /*half*/1);
-	if (do_version)
-		inode->i_version++;
-}
-
-static int au_ii_refresh(struct inode *inode, int *update)
-{
-	int err, e;
-	umode_t type;
-	aufs_bindex_t bindex, new_bindex;
-	struct super_block *sb;
-	struct au_iinfo *iinfo;
-	struct au_hinode *p, *q, tmp;
-
-	IiMustWriteLock(inode);
-
-	*update = 0;
-	sb = inode->i_sb;
-	type = inode->i_mode & S_IFMT;
-	iinfo = au_ii(inode);
-	err = au_ii_realloc(iinfo, au_sbend(sb) + 1);
-	if (unlikely(err))
-		goto out;
-
-	AuDebugOn(iinfo->ii_bstart < 0);
-	p = iinfo->ii_hinode + iinfo->ii_bstart;
-	for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend;
-	     bindex++, p++) {
-		if (!p->hi_inode)
-			continue;
-
-		AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT));
-		new_bindex = au_br_index(sb, p->hi_id);
-		if (new_bindex == bindex)
-			continue;
-
-		if (new_bindex < 0) {
-			*update = 1;
-			au_hiput(p);
-			p->hi_inode = NULL;
-			continue;
-		}
-
-		if (new_bindex < iinfo->ii_bstart)
-			iinfo->ii_bstart = new_bindex;
-		if (iinfo->ii_bend < new_bindex)
-			iinfo->ii_bend = new_bindex;
-		/* swap two lower inode, and loop again */
-		q = iinfo->ii_hinode + new_bindex;
-		tmp = *q;
-		*q = *p;
-		*p = tmp;
-		if (tmp.hi_inode) {
-			bindex--;
-			p--;
-		}
-	}
-	au_update_ibrange(inode, /*do_put_zero*/0);
-	e = au_dy_irefresh(inode);
-	if (unlikely(e && !err))
-		err = e;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_refresh_hinode_self(struct inode *inode)
-{
-	int err, update;
-
-	err = au_ii_refresh(inode, &update);
-	if (!err)
-		au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode));
-
-	AuTraceErr(err);
-	return err;
-}
-
-int au_refresh_hinode(struct inode *inode, struct dentry *dentry)
-{
-	int err, e, update;
-	unsigned int flags;
-	umode_t mode;
-	aufs_bindex_t bindex, bend;
-	unsigned char isdir;
-	struct au_hinode *p;
-	struct au_iinfo *iinfo;
-
-	err = au_ii_refresh(inode, &update);
-	if (unlikely(err))
-		goto out;
-
-	update = 0;
-	iinfo = au_ii(inode);
-	p = iinfo->ii_hinode + iinfo->ii_bstart;
-	mode = (inode->i_mode & S_IFMT);
-	isdir = S_ISDIR(mode);
-	flags = au_hi_flags(inode, isdir);
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
-		struct inode *h_i, *h_inode;
-		struct dentry *h_d;
-
-		h_d = au_h_dptr(dentry, bindex);
-		if (!h_d || d_is_negative(h_d))
-			continue;
-
-		h_inode = d_inode(h_d);
-		AuDebugOn(mode != (h_inode->i_mode & S_IFMT));
-		if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) {
-			h_i = au_h_iptr(inode, bindex);
-			if (h_i) {
-				if (h_i == h_inode)
-					continue;
-				err = -EIO;
-				break;
-			}
-		}
-		if (bindex < iinfo->ii_bstart)
-			iinfo->ii_bstart = bindex;
-		if (iinfo->ii_bend < bindex)
-			iinfo->ii_bend = bindex;
-		au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags);
-		update = 1;
-	}
-	au_update_ibrange(inode, /*do_put_zero*/0);
-	e = au_dy_irefresh(inode);
-	if (unlikely(e && !err))
-		err = e;
-	if (!err)
-		au_refresh_hinode_attr(inode, update && isdir);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int set_inode(struct inode *inode, struct dentry *dentry)
-{
-	int err;
-	unsigned int flags;
-	umode_t mode;
-	aufs_bindex_t bindex, bstart, btail;
-	unsigned char isdir;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-	struct au_iinfo *iinfo;
-
-	IiMustWriteLock(inode);
-
-	err = 0;
-	isdir = 0;
-	bstart = au_dbstart(dentry);
-	h_dentry = au_h_dptr(dentry, bstart);
-	h_inode = d_inode(h_dentry);
-	mode = h_inode->i_mode;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		btail = au_dbtail(dentry);
-		inode->i_op = &aufs_iop;
-		inode->i_fop = &aufs_file_fop;
-		err = au_dy_iaop(inode, bstart, h_inode);
-		if (unlikely(err))
-			goto out;
-		break;
-	case S_IFDIR:
-		isdir = 1;
-		btail = au_dbtaildir(dentry);
-		inode->i_op = &aufs_dir_iop;
-		inode->i_fop = &aufs_dir_fop;
-		break;
-	case S_IFLNK:
-		btail = au_dbtail(dentry);
-		inode->i_op = &aufs_symlink_iop;
-		break;
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFIFO:
-	case S_IFSOCK:
-		btail = au_dbtail(dentry);
-		inode->i_op = &aufs_iop;
-		init_special_inode(inode, mode, h_inode->i_rdev);
-		break;
-	default:
-		AuIOErr("Unknown file type 0%o\n", mode);
-		err = -EIO;
-		goto out;
-	}
-
-	/* do not set hnotify for whiteouted dirs (SHWH mode) */
-	flags = au_hi_flags(inode, isdir);
-	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)
-	    && au_ftest_hi(flags, HNOTIFY)
-	    && dentry->d_name.len > AUFS_WH_PFX_LEN
-	    && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))
-		au_fclr_hi(flags, HNOTIFY);
-	iinfo = au_ii(inode);
-	iinfo->ii_bstart = bstart;
-	iinfo->ii_bend = btail;
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry)
-			au_set_h_iptr(inode, bindex,
-				      au_igrab(d_inode(h_dentry)), flags);
-	}
-	au_cpup_attr_all(inode, /*force*/1);
-	/*
-	 * to force calling aufs_get_acl() every time,
-	 * do not call cache_no_acl() for aufs inode.
-	 */
-
-out:
-	return err;
-}
-
-/*
- * successful returns with iinfo write_locked
- * minus: errno
- * zero: success, matched
- * plus: no error, but unmatched
- */
-static int reval_inode(struct inode *inode, struct dentry *dentry)
-{
-	int err;
-	unsigned int gen;
-	struct au_iigen iigen;
-	aufs_bindex_t bindex, bend;
-	struct inode *h_inode, *h_dinode;
-	struct dentry *h_dentry;
-
-	/*
-	 * before this function, if aufs got any iinfo lock, it must be only
-	 * one, the parent dir.
-	 * it can happen by UDBA and the obsoleted inode number.
-	 */
-	err = -EIO;
-	if (unlikely(inode->i_ino == parent_ino(dentry)))
-		goto out;
-
-	err = 1;
-	ii_write_lock_new_child(inode);
-	h_dentry = au_h_dptr(dentry, au_dbstart(dentry));
-	h_dinode = d_inode(h_dentry);
-	bend = au_ibend(inode);
-	for (bindex = au_ibstart(inode); bindex <= bend; bindex++) {
-		h_inode = au_h_iptr(inode, bindex);
-		if (!h_inode || h_inode != h_dinode)
-			continue;
-
-		err = 0;
-		gen = au_iigen(inode, &iigen);
-		if (gen == au_digen(dentry)
-		    && !au_ig_ftest(iigen.ig_flags, HALF_REFRESHED))
-			break;
-
-		/* fully refresh inode using dentry */
-		err = au_refresh_hinode(inode, dentry);
-		if (!err)
-			au_update_iigen(inode, /*half*/0);
-		break;
-	}
-
-	if (unlikely(err))
-		ii_write_unlock(inode);
-out:
-	return err;
-}
-
-int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-	   unsigned int d_type, ino_t *ino)
-{
-	int err;
-	struct mutex *mtx;
-
-	/* prevent hardlinked inode number from race condition */
-	mtx = NULL;
-	if (d_type != DT_DIR) {
-		mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx;
-		mutex_lock(mtx);
-	}
-	err = au_xino_read(sb, bindex, h_ino, ino);
-	if (unlikely(err))
-		goto out;
-
-	if (!*ino) {
-		err = -EIO;
-		*ino = au_xino_new_ino(sb);
-		if (unlikely(!*ino))
-			goto out;
-		err = au_xino_write(sb, bindex, h_ino, *ino);
-		if (unlikely(err))
-			goto out;
-	}
-
-out:
-	if (mtx)
-		mutex_unlock(mtx);
-	return err;
-}
-
-/* successful returns with iinfo write_locked */
-/* todo: return with unlocked? */
-struct inode *au_new_inode(struct dentry *dentry, int must_new)
-{
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry;
-	struct super_block *sb;
-	struct mutex *mtx;
-	ino_t h_ino, ino;
-	int err;
-	aufs_bindex_t bstart;
-
-	sb = dentry->d_sb;
-	bstart = au_dbstart(dentry);
-	h_dentry = au_h_dptr(dentry, bstart);
-	h_inode = d_inode(h_dentry);
-	h_ino = h_inode->i_ino;
-
-	/*
-	 * stop 'race'-ing between hardlinks under different
-	 * parents.
-	 */
-	mtx = NULL;
-	if (!d_is_dir(h_dentry))
-		mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx;
-
-new_ino:
-	if (mtx)
-		mutex_lock(mtx);
-	err = au_xino_read(sb, bstart, h_ino, &ino);
-	inode = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	if (!ino) {
-		ino = au_xino_new_ino(sb);
-		if (unlikely(!ino)) {
-			inode = ERR_PTR(-EIO);
-			goto out;
-		}
-	}
-
-	AuDbg("i%lu\n", (unsigned long)ino);
-	inode = au_iget_locked(sb, ino);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out;
-
-	AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW));
-	if (inode->i_state & I_NEW) {
-		/* verbose coding for lock class name */
-		if (unlikely(d_is_symlink(h_dentry)))
-			au_rw_class(&au_ii(inode)->ii_rwsem,
-				    au_lc_key + AuLcSymlink_IIINFO);
-		else if (unlikely(d_is_dir(h_dentry)))
-			au_rw_class(&au_ii(inode)->ii_rwsem,
-				    au_lc_key + AuLcDir_IIINFO);
-		else /* likely */
-			au_rw_class(&au_ii(inode)->ii_rwsem,
-				    au_lc_key + AuLcNonDir_IIINFO);
-
-		ii_write_lock_new_child(inode);
-		err = set_inode(inode, dentry);
-		if (!err) {
-			unlock_new_inode(inode);
-			goto out; /* success */
-		}
-
-		/*
-		 * iget_failed() calls iput(), but we need to call
-		 * ii_write_unlock() after iget_failed(). so dirty hack for
-		 * i_count.
-		 */
-		atomic_inc(&inode->i_count);
-		iget_failed(inode);
-		ii_write_unlock(inode);
-		au_xino_write(sb, bstart, h_ino, /*ino*/0);
-		/* ignore this error */
-		goto out_iput;
-	} else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) {
-		/*
-		 * horrible race condition between lookup, readdir and copyup
-		 * (or something).
-		 */
-		if (mtx)
-			mutex_unlock(mtx);
-		err = reval_inode(inode, dentry);
-		if (unlikely(err < 0)) {
-			mtx = NULL;
-			goto out_iput;
-		}
-
-		if (!err) {
-			mtx = NULL;
-			goto out; /* success */
-		} else if (mtx)
-			mutex_lock(mtx);
-	}
-
-	if (unlikely(au_test_fs_unique_ino(h_inode)))
-		AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir,"
-			" b%d, %s, %pd, hi%lu, i%lu.\n",
-			bstart, au_sbtype(h_dentry->d_sb), dentry,
-			(unsigned long)h_ino, (unsigned long)ino);
-	ino = 0;
-	err = au_xino_write(sb, bstart, h_ino, /*ino*/0);
-	if (!err) {
-		iput(inode);
-		if (mtx)
-			mutex_unlock(mtx);
-		goto new_ino;
-	}
-
-out_iput:
-	iput(inode);
-	inode = ERR_PTR(err);
-out:
-	if (mtx)
-		mutex_unlock(mtx);
-	return inode;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
-	       struct inode *inode)
-{
-	int err;
-	struct inode *hi;
-
-	err = au_br_rdonly(au_sbr(sb, bindex));
-
-	/* pseudo-link after flushed may happen out of bounds */
-	if (!err
-	    && inode
-	    && au_ibstart(inode) <= bindex
-	    && bindex <= au_ibend(inode)) {
-		/*
-		 * permission check is unnecessary since vfsub routine
-		 * will be called later
-		 */
-		hi = au_h_iptr(inode, bindex);
-		if (hi)
-			err = IS_IMMUTABLE(hi) ? -EROFS : 0;
-	}
-
-	return err;
-}
-
-int au_test_h_perm(struct inode *h_inode, int mask)
-{
-	if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
-		return 0;
-	return inode_permission(h_inode, mask);
-}
-
-int au_test_h_perm_sio(struct inode *h_inode, int mask)
-{
-	if (au_test_nfs(h_inode->i_sb)
-	    && (mask & MAY_WRITE)
-	    && S_ISDIR(h_inode->i_mode))
-		mask |= MAY_READ; /* force permission check */
-	return au_test_h_perm(h_inode, mask);
-}
diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h
deleted file mode 100644
index 1b9be356d..000000000
--- a/fs/aufs/inode.h
+++ /dev/null
@@ -1,673 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * inode operations
- */
-
-#ifndef __AUFS_INODE_H__
-#define __AUFS_INODE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fsnotify.h>
-#include "rwsem.h"
-
-struct vfsmount;
-
-struct au_hnotify {
-#ifdef CONFIG_AUFS_HNOTIFY
-#ifdef CONFIG_AUFS_HFSNOTIFY
-	/* never use fsnotify_add_vfsmount_mark() */
-	struct fsnotify_mark		hn_mark;
-#endif
-	struct inode			*hn_aufs_inode;	/* no get/put */
-#endif
-} ____cacheline_aligned_in_smp;
-
-struct au_hinode {
-	struct inode		*hi_inode;
-	aufs_bindex_t		hi_id;
-#ifdef CONFIG_AUFS_HNOTIFY
-	struct au_hnotify	*hi_notify;
-#endif
-
-	/* reference to the copied-up whiteout with get/put */
-	struct dentry		*hi_whdentry;
-};
-
-/* ig_flags */
-#define AuIG_HALF_REFRESHED		1
-#define au_ig_ftest(flags, name)	((flags) & AuIG_##name)
-#define au_ig_fset(flags, name) \
-	do { (flags) |= AuIG_##name; } while (0)
-#define au_ig_fclr(flags, name) \
-	do { (flags) &= ~AuIG_##name; } while (0)
-
-struct au_iigen {
-	__u32		ig_generation, ig_flags;
-};
-
-struct au_vdir;
-struct au_iinfo {
-	spinlock_t		ii_genspin;
-	struct au_iigen		ii_generation;
-	struct super_block	*ii_hsb1;	/* no get/put */
-
-	struct au_rwsem		ii_rwsem;
-	aufs_bindex_t		ii_bstart, ii_bend;
-	__u32			ii_higen;
-	struct au_hinode	*ii_hinode;
-	struct au_vdir		*ii_vdir;
-};
-
-struct au_icntnr {
-	struct au_iinfo iinfo;
-	struct inode vfs_inode;
-} ____cacheline_aligned_in_smp;
-
-/* au_pin flags */
-#define AuPin_DI_LOCKED		1
-#define AuPin_MNT_WRITE		(1 << 1)
-#define au_ftest_pin(flags, name)	((flags) & AuPin_##name)
-#define au_fset_pin(flags, name) \
-	do { (flags) |= AuPin_##name; } while (0)
-#define au_fclr_pin(flags, name) \
-	do { (flags) &= ~AuPin_##name; } while (0)
-
-struct au_pin {
-	/* input */
-	struct dentry *dentry;
-	unsigned int udba;
-	unsigned char lsc_di, lsc_hi, flags;
-	aufs_bindex_t bindex;
-
-	/* output */
-	struct dentry *parent;
-	struct au_hinode *hdir;
-	struct vfsmount *h_mnt;
-
-	/* temporary unlock/relock for copyup */
-	struct dentry *h_dentry, *h_parent;
-	struct au_branch *br;
-	struct task_struct *task;
-};
-
-void au_pin_hdir_unlock(struct au_pin *p);
-int au_pin_hdir_lock(struct au_pin *p);
-int au_pin_hdir_relock(struct au_pin *p);
-void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task);
-void au_pin_hdir_acquire_nest(struct au_pin *p);
-void au_pin_hdir_release(struct au_pin *p);
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_iinfo *au_ii(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-
-	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
-	if (iinfo->ii_hinode)
-		return iinfo;
-	return NULL; /* debugging bad_inode case */
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* inode.c */
-struct inode *au_igrab(struct inode *inode);
-int au_refresh_hinode_self(struct inode *inode);
-int au_refresh_hinode(struct inode *inode, struct dentry *dentry);
-int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-	   unsigned int d_type, ino_t *ino);
-struct inode *au_new_inode(struct dentry *dentry, int must_new);
-int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
-	       struct inode *inode);
-int au_test_h_perm(struct inode *h_inode, int mask);
-int au_test_h_perm_sio(struct inode *h_inode, int mask);
-
-static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex,
-			    ino_t h_ino, unsigned int d_type, ino_t *ino)
-{
-#ifdef CONFIG_AUFS_SHWH
-	return au_ino(sb, bindex, h_ino, d_type, ino);
-#else
-	return 0;
-#endif
-}
-
-/* i_op.c */
-extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop;
-
-/* au_wr_dir flags */
-#define AuWrDir_ADD_ENTRY	1
-#define AuWrDir_ISDIR		(1 << 1)
-#define AuWrDir_TMPFILE		(1 << 2)
-#define au_ftest_wrdir(flags, name)	((flags) & AuWrDir_##name)
-#define au_fset_wrdir(flags, name) \
-	do { (flags) |= AuWrDir_##name; } while (0)
-#define au_fclr_wrdir(flags, name) \
-	do { (flags) &= ~AuWrDir_##name; } while (0)
-
-struct au_wr_dir_args {
-	aufs_bindex_t force_btgt;
-	unsigned char flags;
-};
-int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
-	      struct au_wr_dir_args *args);
-
-struct dentry *au_pinned_h_parent(struct au_pin *pin);
-void au_pin_init(struct au_pin *pin, struct dentry *dentry,
-		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
-		 unsigned int udba, unsigned char flags);
-int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
-	   unsigned int udba, unsigned char flags) __must_check;
-int au_do_pin(struct au_pin *pin) __must_check;
-void au_unpin(struct au_pin *pin);
-int au_reval_for_attr(struct dentry *dentry, unsigned int sigen);
-
-#define AuIcpup_DID_CPUP	1
-#define au_ftest_icpup(flags, name)	((flags) & AuIcpup_##name)
-#define au_fset_icpup(flags, name) \
-	do { (flags) |= AuIcpup_##name; } while (0)
-#define au_fclr_icpup(flags, name) \
-	do { (flags) &= ~AuIcpup_##name; } while (0)
-
-struct au_icpup_args {
-	unsigned char flags;
-	unsigned char pin_flags;
-	aufs_bindex_t btgt;
-	unsigned int udba;
-	struct au_pin pin;
-	struct path h_path;
-	struct inode *h_inode;
-};
-
-int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
-		     struct au_icpup_args *a);
-
-int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path);
-
-/* i_op_add.c */
-int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir);
-int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-	       dev_t dev);
-int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname);
-int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool want_excl);
-struct vfsub_aopen_args;
-int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
-		       struct vfsub_aopen_args *args);
-int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode);
-int aufs_link(struct dentry *src_dentry, struct inode *dir,
-	      struct dentry *dentry);
-int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-
-/* i_op_del.c */
-int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup);
-int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir);
-int aufs_unlink(struct inode *dir, struct dentry *dentry);
-int aufs_rmdir(struct inode *dir, struct dentry *dentry);
-
-/* i_op_ren.c */
-int au_wbr(struct dentry *dentry, aufs_bindex_t btgt);
-int aufs_rename(struct inode *src_dir, struct dentry *src_dentry,
-		struct inode *dir, struct dentry *dentry);
-
-/* iinfo.c */
-struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex);
-void au_hiput(struct au_hinode *hinode);
-void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
-		  struct dentry *h_wh);
-unsigned int au_hi_flags(struct inode *inode, int isdir);
-
-/* hinode flags */
-#define AuHi_XINO	1
-#define AuHi_HNOTIFY	(1 << 1)
-#define au_ftest_hi(flags, name)	((flags) & AuHi_##name)
-#define au_fset_hi(flags, name) \
-	do { (flags) |= AuHi_##name; } while (0)
-#define au_fclr_hi(flags, name) \
-	do { (flags) &= ~AuHi_##name; } while (0)
-
-#ifndef CONFIG_AUFS_HNOTIFY
-#undef AuHi_HNOTIFY
-#define AuHi_HNOTIFY	0
-#endif
-
-void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
-		   struct inode *h_inode, unsigned int flags);
-
-void au_update_iigen(struct inode *inode, int half);
-void au_update_ibrange(struct inode *inode, int do_put_zero);
-
-void au_icntnr_init_once(void *_c);
-int au_iinfo_init(struct inode *inode);
-void au_iinfo_fin(struct inode *inode);
-int au_ii_realloc(struct au_iinfo *iinfo, int nbr);
-
-#ifdef CONFIG_PROC_FS
-/* plink.c */
-int au_plink_maint(struct super_block *sb, int flags);
-struct au_sbinfo;
-void au_plink_maint_leave(struct au_sbinfo *sbinfo);
-int au_plink_maint_enter(struct super_block *sb);
-#ifdef CONFIG_AUFS_DEBUG
-void au_plink_list(struct super_block *sb);
-#else
-AuStubVoid(au_plink_list, struct super_block *sb)
-#endif
-int au_plink_test(struct inode *inode);
-struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex);
-void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
-		     struct dentry *h_dentry);
-void au_plink_put(struct super_block *sb, int verbose);
-void au_plink_clean(struct super_block *sb, int verbose);
-void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id);
-#else
-AuStubInt0(au_plink_maint, struct super_block *sb, int flags);
-AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo);
-AuStubInt0(au_plink_maint_enter, struct super_block *sb);
-AuStubVoid(au_plink_list, struct super_block *sb);
-AuStubInt0(au_plink_test, struct inode *inode);
-AuStub(struct dentry *, au_plink_lkup, return NULL,
-       struct inode *inode, aufs_bindex_t bindex);
-AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex,
-	   struct dentry *h_dentry);
-AuStubVoid(au_plink_put, struct super_block *sb, int verbose);
-AuStubVoid(au_plink_clean, struct super_block *sb, int verbose);
-AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id);
-#endif /* CONFIG_PROC_FS */
-
-#ifdef CONFIG_AUFS_XATTR
-/* xattr.c */
-int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
-		  unsigned int verbose);
-ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size);
-ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value,
-		      size_t size);
-int aufs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		  size_t size, int flags);
-int aufs_removexattr(struct dentry *dentry, const char *name);
-
-/* void au_xattr_init(struct super_block *sb); */
-#else
-AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src,
-	   int ignore_flags, unsigned int verbose);
-/* AuStubVoid(au_xattr_init, struct super_block *sb); */
-#endif
-
-#ifdef CONFIG_FS_POSIX_ACL
-struct posix_acl *aufs_get_acl(struct inode *inode, int type);
-int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-#endif
-
-#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
-enum {
-	AU_XATTR_SET,
-	AU_XATTR_REMOVE,
-	AU_ACL_SET
-};
-
-struct au_srxattr {
-	int type;
-	union {
-		struct {
-			const char	*name;
-			const void	*value;
-			size_t		size;
-			int		flags;
-		} set;
-		struct {
-			const char	*name;
-		} remove;
-		struct {
-			struct posix_acl *acl;
-			int		type;
-		} acl_set;
-	} u;
-};
-ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-/* lock subclass for iinfo */
-enum {
-	AuLsc_II_CHILD,		/* child first */
-	AuLsc_II_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
-	AuLsc_II_CHILD3,	/* copyup dirs */
-	AuLsc_II_PARENT,	/* see AuLsc_I_PARENT in vfsub.h */
-	AuLsc_II_PARENT2,
-	AuLsc_II_PARENT3,	/* copyup dirs */
-	AuLsc_II_NEW_CHILD
-};
-
-/*
- * ii_read_lock_child, ii_write_lock_child,
- * ii_read_lock_child2, ii_write_lock_child2,
- * ii_read_lock_child3, ii_write_lock_child3,
- * ii_read_lock_parent, ii_write_lock_parent,
- * ii_read_lock_parent2, ii_write_lock_parent2,
- * ii_read_lock_parent3, ii_write_lock_parent3,
- * ii_read_lock_new_child, ii_write_lock_new_child,
- */
-#define AuReadLockFunc(name, lsc) \
-static inline void ii_read_lock_##name(struct inode *i) \
-{ \
-	au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
-}
-
-#define AuWriteLockFunc(name, lsc) \
-static inline void ii_write_lock_##name(struct inode *i) \
-{ \
-	au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
-}
-
-#define AuRWLockFuncs(name, lsc) \
-	AuReadLockFunc(name, lsc) \
-	AuWriteLockFunc(name, lsc)
-
-AuRWLockFuncs(child, CHILD);
-AuRWLockFuncs(child2, CHILD2);
-AuRWLockFuncs(child3, CHILD3);
-AuRWLockFuncs(parent, PARENT);
-AuRWLockFuncs(parent2, PARENT2);
-AuRWLockFuncs(parent3, PARENT3);
-AuRWLockFuncs(new_child, NEW_CHILD);
-
-#undef AuReadLockFunc
-#undef AuWriteLockFunc
-#undef AuRWLockFuncs
-
-/*
- * ii_read_unlock, ii_write_unlock, ii_downgrade_lock
- */
-AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem);
-
-#define IiMustNoWaiters(i)	AuRwMustNoWaiters(&au_ii(i)->ii_rwsem)
-#define IiMustAnyLock(i)	AuRwMustAnyLock(&au_ii(i)->ii_rwsem)
-#define IiMustWriteLock(i)	AuRwMustWriteLock(&au_ii(i)->ii_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-static inline void au_icntnr_init(struct au_icntnr *c)
-{
-#ifdef CONFIG_AUFS_DEBUG
-	c->vfs_inode.i_mode = 0;
-#endif
-}
-
-static inline unsigned int au_iigen(struct inode *inode, struct au_iigen *iigen)
-{
-	unsigned int gen;
-	struct au_iinfo *iinfo;
-
-	iinfo = au_ii(inode);
-	spin_lock(&iinfo->ii_genspin);
-	if (iigen)
-		*iigen = iinfo->ii_generation;
-	gen = iinfo->ii_generation.ig_generation;
-	spin_unlock(&iinfo->ii_genspin);
-
-	return gen;
-}
-
-/* tiny test for inode number */
-/* tmpfs generation is too rough */
-static inline int au_test_higen(struct inode *inode, struct inode *h_inode)
-{
-	struct au_iinfo *iinfo;
-
-	iinfo = au_ii(inode);
-	AuRwMustAnyLock(&iinfo->ii_rwsem);
-	return !(iinfo->ii_hsb1 == h_inode->i_sb
-		 && iinfo->ii_higen == h_inode->i_generation);
-}
-
-static inline void au_iigen_dec(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-
-	iinfo = au_ii(inode);
-	spin_lock(&iinfo->ii_genspin);
-	iinfo->ii_generation.ig_generation--;
-	spin_unlock(&iinfo->ii_genspin);
-}
-
-static inline int au_iigen_test(struct inode *inode, unsigned int sigen)
-{
-	int err;
-
-	err = 0;
-	if (unlikely(inode && au_iigen(inode, NULL) != sigen))
-		err = -EIO;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static inline aufs_bindex_t au_ii_br_id(struct inode *inode,
-					aufs_bindex_t bindex)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_hinode[0 + bindex].hi_id;
-}
-
-static inline aufs_bindex_t au_ibstart(struct inode *inode)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_bstart;
-}
-
-static inline aufs_bindex_t au_ibend(struct inode *inode)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_bend;
-}
-
-static inline struct au_vdir *au_ivdir(struct inode *inode)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_vdir;
-}
-
-static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry;
-}
-
-static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustWriteLock(inode);
-	au_ii(inode)->ii_bstart = bindex;
-}
-
-static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustWriteLock(inode);
-	au_ii(inode)->ii_bend = bindex;
-}
-
-static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir)
-{
-	IiMustWriteLock(inode);
-	au_ii(inode)->ii_vdir = vdir;
-}
-
-static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_hinode + bindex;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct dentry *au_pinned_parent(struct au_pin *pin)
-{
-	if (pin)
-		return pin->parent;
-	return NULL;
-}
-
-static inline struct inode *au_pinned_h_dir(struct au_pin *pin)
-{
-	if (pin && pin->hdir)
-		return pin->hdir->hi_inode;
-	return NULL;
-}
-
-static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin)
-{
-	if (pin)
-		return pin->hdir;
-	return NULL;
-}
-
-static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry)
-{
-	if (pin)
-		pin->dentry = dentry;
-}
-
-static inline void au_pin_set_parent_lflag(struct au_pin *pin,
-					   unsigned char lflag)
-{
-	if (pin) {
-		if (lflag)
-			au_fset_pin(pin->flags, DI_LOCKED);
-		else
-			au_fclr_pin(pin->flags, DI_LOCKED);
-	}
-}
-
-#if 0 /* reserved */
-static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent)
-{
-	if (pin) {
-		dput(pin->parent);
-		pin->parent = dget(parent);
-	}
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-struct au_branch;
-#ifdef CONFIG_AUFS_HNOTIFY
-struct au_hnotify_op {
-	void (*ctl)(struct au_hinode *hinode, int do_set);
-	int (*alloc)(struct au_hinode *hinode);
-
-	/*
-	 * if it returns true, the the caller should free hinode->hi_notify,
-	 * otherwise ->free() frees it.
-	 */
-	int (*free)(struct au_hinode *hinode,
-		    struct au_hnotify *hn) __must_check;
-
-	void (*fin)(void);
-	int (*init)(void);
-
-	int (*reset_br)(unsigned int udba, struct au_branch *br, int perm);
-	void (*fin_br)(struct au_branch *br);
-	int (*init_br)(struct au_branch *br, int perm);
-};
-
-/* hnotify.c */
-int au_hn_alloc(struct au_hinode *hinode, struct inode *inode);
-void au_hn_free(struct au_hinode *hinode);
-void au_hn_ctl(struct au_hinode *hinode, int do_set);
-void au_hn_reset(struct inode *inode, unsigned int flags);
-int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
-	       struct qstr *h_child_qstr, struct inode *h_child_inode);
-int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm);
-int au_hnotify_init_br(struct au_branch *br, int perm);
-void au_hnotify_fin_br(struct au_branch *br);
-int __init au_hnotify_init(void);
-void au_hnotify_fin(void);
-
-/* hfsnotify.c */
-extern const struct au_hnotify_op au_hnotify_op;
-
-static inline
-void au_hn_init(struct au_hinode *hinode)
-{
-	hinode->hi_notify = NULL;
-}
-
-static inline struct au_hnotify *au_hn(struct au_hinode *hinode)
-{
-	return hinode->hi_notify;
-}
-
-#else
-AuStub(int, au_hn_alloc, return -EOPNOTSUPP,
-       struct au_hinode *hinode __maybe_unused,
-       struct inode *inode __maybe_unused)
-AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode)
-AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused)
-AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused,
-	   int do_set __maybe_unused)
-AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused,
-	   unsigned int flags __maybe_unused)
-AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused,
-	   struct au_branch *br __maybe_unused,
-	   int perm __maybe_unused)
-AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused,
-	   int perm __maybe_unused)
-AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused)
-AuStubInt0(__init au_hnotify_init, void)
-AuStubVoid(au_hnotify_fin, void)
-AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused)
-#endif /* CONFIG_AUFS_HNOTIFY */
-
-static inline void au_hn_suspend(struct au_hinode *hdir)
-{
-	au_hn_ctl(hdir, /*do_set*/0);
-}
-
-static inline void au_hn_resume(struct au_hinode *hdir)
-{
-	au_hn_ctl(hdir, /*do_set*/1);
-}
-
-static inline void au_hn_imtx_lock(struct au_hinode *hdir)
-{
-	mutex_lock(&hdir->hi_inode->i_mutex);
-	au_hn_suspend(hdir);
-}
-
-static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir,
-					  unsigned int sc __maybe_unused)
-{
-	mutex_lock_nested(&hdir->hi_inode->i_mutex, sc);
-	au_hn_suspend(hdir);
-}
-
-static inline void au_hn_imtx_unlock(struct au_hinode *hdir)
-{
-	au_hn_resume(hdir);
-	mutex_unlock(&hdir->hi_inode->i_mutex);
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_INODE_H__ */
diff --git a/fs/aufs/ioctl.c b/fs/aufs/ioctl.c
deleted file mode 100644
index 18d7c6aea..000000000
--- a/fs/aufs/ioctl.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * ioctl
- * plink-management and readdir in userspace.
- * assist the pathconf(3) wrapper library.
- * move-down
- * File-based Hierarchical Storage Management.
- */
-
-#include <linux/compat.h>
-#include <linux/file.h>
-#include "aufs.h"
-
-static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg)
-{
-	int err, fd;
-	aufs_bindex_t wbi, bindex, bend;
-	struct file *h_file;
-	struct super_block *sb;
-	struct dentry *root;
-	struct au_branch *br;
-	struct aufs_wbr_fd wbrfd = {
-		.oflags	= au_dir_roflags,
-		.brid	= -1
-	};
-	const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY
-		| O_NOATIME | O_CLOEXEC;
-
-	AuDebugOn(wbrfd.oflags & ~valid);
-
-	if (arg) {
-		err = copy_from_user(&wbrfd, arg, sizeof(wbrfd));
-		if (unlikely(err)) {
-			err = -EFAULT;
-			goto out;
-		}
-
-		err = -EINVAL;
-		AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid);
-		wbrfd.oflags |= au_dir_roflags;
-		AuDbg("0%o\n", wbrfd.oflags);
-		if (unlikely(wbrfd.oflags & ~valid))
-			goto out;
-	}
-
-	fd = get_unused_fd_flags(0);
-	err = fd;
-	if (unlikely(fd < 0))
-		goto out;
-
-	h_file = ERR_PTR(-EINVAL);
-	wbi = 0;
-	br = NULL;
-	sb = path->dentry->d_sb;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_IR);
-	bend = au_sbend(sb);
-	if (wbrfd.brid >= 0) {
-		wbi = au_br_index(sb, wbrfd.brid);
-		if (unlikely(wbi < 0 || wbi > bend))
-			goto out_unlock;
-	}
-
-	h_file = ERR_PTR(-ENOENT);
-	br = au_sbr(sb, wbi);
-	if (!au_br_writable(br->br_perm)) {
-		if (arg)
-			goto out_unlock;
-
-		bindex = wbi + 1;
-		wbi = -1;
-		for (; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (au_br_writable(br->br_perm)) {
-				wbi = bindex;
-				br = au_sbr(sb, wbi);
-				break;
-			}
-		}
-	}
-	AuDbg("wbi %d\n", wbi);
-	if (wbi >= 0)
-		h_file = au_h_open(root, wbi, wbrfd.oflags, NULL,
-				   /*force_wr*/0);
-
-out_unlock:
-	aufs_read_unlock(root, AuLock_IR);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out_fd;
-
-	atomic_dec(&br->br_count); /* cf. au_h_open() */
-	fd_install(fd, h_file);
-	err = fd;
-	goto out; /* success */
-
-out_fd:
-	put_unused_fd(fd);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err;
-	struct dentry *dentry;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_ioctl(file, cmd, arg);
-		break;
-
-	case AUFS_CTL_WBR_FD:
-		err = au_wbr_fd(&file->f_path, (void __user *)arg);
-		break;
-
-	case AUFS_CTL_IBUSY:
-		err = au_ibusy_ioctl(file, arg);
-		break;
-
-	case AUFS_CTL_BRINFO:
-		err = au_brinfo_ioctl(file, arg);
-		break;
-
-	case AUFS_CTL_FHSM_FD:
-		dentry = file->f_path.dentry;
-		if (IS_ROOT(dentry))
-			err = au_fhsm_fd(dentry->d_sb, arg);
-		else
-			err = -ENOTTY;
-		break;
-
-	default:
-		/* do not call the lower */
-		AuDbg("0x%x\n", cmd);
-		err = -ENOTTY;
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err;
-
-	switch (cmd) {
-	case AUFS_CTL_MVDOWN:
-		err = au_mvdown(file->f_path.dentry, (void __user *)arg);
-		break;
-
-	case AUFS_CTL_WBR_FD:
-		err = au_wbr_fd(&file->f_path, (void __user *)arg);
-		break;
-
-	default:
-		/* do not call the lower */
-		AuDbg("0x%x\n", cmd);
-		err = -ENOTTY;
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
-			   unsigned long arg)
-{
-	long err;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_compat_ioctl(file, cmd, arg);
-		break;
-
-	case AUFS_CTL_IBUSY:
-		err = au_ibusy_compat_ioctl(file, arg);
-		break;
-
-	case AUFS_CTL_BRINFO:
-		err = au_brinfo_compat_ioctl(file, arg);
-		break;
-
-	default:
-		err = aufs_ioctl_dir(file, cmd, arg);
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
-			      unsigned long arg)
-{
-	return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c
deleted file mode 100644
index 677568445..000000000
--- a/fs/aufs/loop.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * support for loopback block device as a branch
- */
-
-#include "aufs.h"
-
-/* added into drivers/block/loop.c */
-static struct file *(*backing_file_func)(struct super_block *sb);
-
-/*
- * test if two lower dentries have overlapping branches.
- */
-int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding)
-{
-	struct super_block *h_sb;
-	struct file *backing_file;
-
-	if (unlikely(!backing_file_func)) {
-		/* don't load "loop" module here */
-		backing_file_func = symbol_get(loop_backing_file);
-		if (unlikely(!backing_file_func))
-			/* "loop" module is not loaded */
-			return 0;
-	}
-
-	h_sb = h_adding->d_sb;
-	backing_file = backing_file_func(h_sb);
-	if (!backing_file)
-		return 0;
-
-	h_adding = backing_file->f_path.dentry;
-	/*
-	 * h_adding can be local NFS.
-	 * in this case aufs cannot detect the loop.
-	 */
-	if (unlikely(h_adding->d_sb == sb))
-		return 1;
-	return !!au_test_subdir(h_adding, sb->s_root);
-}
-
-/* true if a kernel thread named 'loop[0-9].*' accesses a file */
-int au_test_loopback_kthread(void)
-{
-	int ret;
-	struct task_struct *tsk = current;
-	char c, comm[sizeof(tsk->comm)];
-
-	ret = 0;
-	if (tsk->flags & PF_KTHREAD) {
-		get_task_comm(comm, tsk);
-		c = comm[4];
-		ret = ('0' <= c && c <= '9'
-		       && !strncmp(comm, "loop", 4));
-	}
-
-	return ret;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define au_warn_loopback_step	16
-static int au_warn_loopback_nelem = au_warn_loopback_step;
-static unsigned long *au_warn_loopback_array;
-
-void au_warn_loopback(struct super_block *h_sb)
-{
-	int i, new_nelem;
-	unsigned long *a, magic;
-	static DEFINE_SPINLOCK(spin);
-
-	magic = h_sb->s_magic;
-	spin_lock(&spin);
-	a = au_warn_loopback_array;
-	for (i = 0; i < au_warn_loopback_nelem && *a; i++)
-		if (a[i] == magic) {
-			spin_unlock(&spin);
-			return;
-		}
-
-	/* h_sb is new to us, print it */
-	if (i < au_warn_loopback_nelem) {
-		a[i] = magic;
-		goto pr;
-	}
-
-	/* expand the array */
-	new_nelem = au_warn_loopback_nelem + au_warn_loopback_step;
-	a = au_kzrealloc(au_warn_loopback_array,
-			 au_warn_loopback_nelem * sizeof(unsigned long),
-			 new_nelem * sizeof(unsigned long), GFP_ATOMIC);
-	if (a) {
-		au_warn_loopback_nelem = new_nelem;
-		au_warn_loopback_array = a;
-		a[i] = magic;
-		goto pr;
-	}
-
-	spin_unlock(&spin);
-	AuWarn1("realloc failed, ignored\n");
-	return;
-
-pr:
-	spin_unlock(&spin);
-	pr_warn("you may want to try another patch for loopback file "
-		"on %s(0x%lx) branch\n", au_sbtype(h_sb), magic);
-}
-
-int au_loopback_init(void)
-{
-	int err;
-	struct super_block *sb __maybe_unused;
-
-	AuDebugOn(sizeof(sb->s_magic) != sizeof(unsigned long));
-
-	err = 0;
-	au_warn_loopback_array = kcalloc(au_warn_loopback_step,
-					 sizeof(unsigned long), GFP_NOFS);
-	if (unlikely(!au_warn_loopback_array))
-		err = -ENOMEM;
-
-	return err;
-}
-
-void au_loopback_fin(void)
-{
-	symbol_put(loop_backing_file);
-	kfree(au_warn_loopback_array);
-}
diff --git a/fs/aufs/loop.h b/fs/aufs/loop.h
deleted file mode 100644
index 0c28c2366..000000000
--- a/fs/aufs/loop.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * support for loopback mount as a branch
- */
-
-#ifndef __AUFS_LOOP_H__
-#define __AUFS_LOOP_H__
-
-#ifdef __KERNEL__
-
-struct dentry;
-struct super_block;
-
-#ifdef CONFIG_AUFS_BDEV_LOOP
-/* drivers/block/loop.c */
-struct file *loop_backing_file(struct super_block *sb);
-
-/* loop.c */
-int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding);
-int au_test_loopback_kthread(void);
-void au_warn_loopback(struct super_block *h_sb);
-
-int au_loopback_init(void);
-void au_loopback_fin(void);
-#else
-AuStubInt0(au_test_loopback_overlap, struct super_block *sb,
-	   struct dentry *h_adding)
-AuStubInt0(au_test_loopback_kthread, void)
-AuStubVoid(au_warn_loopback, struct super_block *h_sb)
-
-AuStubInt0(au_loopback_init, void)
-AuStubVoid(au_loopback_fin, void)
-#endif /* BLK_DEV_LOOP */
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_LOOP_H__ */
diff --git a/fs/aufs/magic.mk b/fs/aufs/magic.mk
deleted file mode 100644
index 4f83bdf1d..000000000
--- a/fs/aufs/magic.mk
+++ /dev/null
@@ -1,30 +0,0 @@
-
-# defined in ${srctree}/fs/fuse/inode.c
-# tristate
-ifdef CONFIG_FUSE_FS
-ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546
-endif
-
-# defined in ${srctree}/fs/xfs/xfs_sb.h
-# tristate
-ifdef CONFIG_XFS_FS
-ccflags-y += -DXFS_SB_MAGIC=0x58465342
-endif
-
-# defined in ${srctree}/fs/configfs/mount.c
-# tristate
-ifdef CONFIG_CONFIGFS_FS
-ccflags-y += -DCONFIGFS_MAGIC=0x62656570
-endif
-
-# defined in ${srctree}/fs/ubifs/ubifs.h
-# tristate
-ifdef CONFIG_UBIFS_FS
-ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905
-endif
-
-# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h
-# tristate
-ifdef CONFIG_HFSPLUS_FS
-ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b
-endif
diff --git a/fs/aufs/module.c b/fs/aufs/module.c
deleted file mode 100644
index 2c70dfffc..000000000
--- a/fs/aufs/module.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * module global variables and operations
- */
-
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include "aufs.h"
-
-void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp)
-{
-	if (new_sz <= nused)
-		return p;
-
-	p = krealloc(p, new_sz, gfp);
-	if (p)
-		memset(p + nused, 0, new_sz - nused);
-	return p;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * aufs caches
- */
-struct kmem_cache *au_cachep[AuCache_Last];
-static int __init au_cache_init(void)
-{
-	au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once);
-	if (au_cachep[AuCache_DINFO])
-		/* SLAB_DESTROY_BY_RCU */
-		au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr,
-							au_icntnr_init_once);
-	if (au_cachep[AuCache_ICNTNR])
-		au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo,
-						       au_fi_init_once);
-	if (au_cachep[AuCache_FINFO])
-		au_cachep[AuCache_VDIR] = AuCache(au_vdir);
-	if (au_cachep[AuCache_VDIR])
-		au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr);
-	if (au_cachep[AuCache_DEHSTR])
-		return 0;
-
-	return -ENOMEM;
-}
-
-static void au_cache_fin(void)
-{
-	int i;
-
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-
-	/* excluding AuCache_HNOTIFY */
-	BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last);
-	for (i = 0; i < AuCache_HNOTIFY; i++)
-		if (au_cachep[i]) {
-			kmem_cache_destroy(au_cachep[i]);
-			au_cachep[i] = NULL;
-		}
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_dir_roflags;
-
-#ifdef CONFIG_AUFS_SBILIST
-/*
- * iterate_supers_type() doesn't protect us from
- * remounting (branch management)
- */
-struct au_splhead au_sbilist;
-#endif
-
-struct lock_class_key au_lc_key[AuLcKey_Last];
-
-/*
- * functions for module interface.
- */
-MODULE_LICENSE("GPL");
-/* MODULE_LICENSE("GPL v2"); */
-MODULE_AUTHOR("Junjiro R. Okajima <aufs-users@lists.sourceforge.net>");
-MODULE_DESCRIPTION(AUFS_NAME
-	" -- Advanced multi layered unification filesystem");
-MODULE_VERSION(AUFS_VERSION);
-MODULE_ALIAS_FS(AUFS_NAME);
-
-/* this module parameter has no meaning when SYSFS is disabled */
-int sysaufs_brs = 1;
-MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN");
-module_param_named(brs, sysaufs_brs, int, S_IRUGO);
-
-/* this module parameter has no meaning when USER_NS is disabled */
-static bool au_userns;
-MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns");
-module_param_named(allow_userns, au_userns, bool, S_IRUGO);
-
-/* ---------------------------------------------------------------------- */
-
-static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */
-
-int au_seq_path(struct seq_file *seq, struct path *path)
-{
-	return seq_path(seq, path, au_esc_chars);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int __init aufs_init(void)
-{
-	int err, i;
-	char *p;
-
-	p = au_esc_chars;
-	for (i = 1; i <= ' '; i++)
-		*p++ = i;
-	*p++ = '\\';
-	*p++ = '\x7f';
-	*p = 0;
-
-	au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE);
-
-	au_sbilist_init();
-	sysaufs_brs_init();
-	au_debug_init();
-	au_dy_init();
-	err = sysaufs_init();
-	if (unlikely(err))
-		goto out;
-	err = au_procfs_init();
-	if (unlikely(err))
-		goto out_sysaufs;
-	err = au_wkq_init();
-	if (unlikely(err))
-		goto out_procfs;
-	err = au_loopback_init();
-	if (unlikely(err))
-		goto out_wkq;
-	err = au_hnotify_init();
-	if (unlikely(err))
-		goto out_loopback;
-	err = au_sysrq_init();
-	if (unlikely(err))
-		goto out_hin;
-	err = au_cache_init();
-	if (unlikely(err))
-		goto out_sysrq;
-
-	aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0;
-	err = register_filesystem(&aufs_fs_type);
-	if (unlikely(err))
-		goto out_cache;
-
-	/* since we define pr_fmt, call printk directly */
-	printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n");
-	goto out; /* success */
-
-out_cache:
-	au_cache_fin();
-out_sysrq:
-	au_sysrq_fin();
-out_hin:
-	au_hnotify_fin();
-out_loopback:
-	au_loopback_fin();
-out_wkq:
-	au_wkq_fin();
-out_procfs:
-	au_procfs_fin();
-out_sysaufs:
-	sysaufs_fin();
-	au_dy_fin();
-out:
-	return err;
-}
-
-static void __exit aufs_exit(void)
-{
-	unregister_filesystem(&aufs_fs_type);
-	au_cache_fin();
-	au_sysrq_fin();
-	au_hnotify_fin();
-	au_loopback_fin();
-	au_wkq_fin();
-	au_procfs_fin();
-	sysaufs_fin();
-	au_dy_fin();
-}
-
-module_init(aufs_init);
-module_exit(aufs_exit);
diff --git a/fs/aufs/module.h b/fs/aufs/module.h
deleted file mode 100644
index c0a29d49d..000000000
--- a/fs/aufs/module.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * module initialization and module-global
- */
-
-#ifndef __AUFS_MODULE_H__
-#define __AUFS_MODULE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/slab.h>
-
-struct path;
-struct seq_file;
-
-/* module parameters */
-extern int sysaufs_brs;
-
-/* ---------------------------------------------------------------------- */
-
-extern int au_dir_roflags;
-
-enum {
-	AuLcNonDir_FIINFO,
-	AuLcNonDir_DIINFO,
-	AuLcNonDir_IIINFO,
-
-	AuLcDir_FIINFO,
-	AuLcDir_DIINFO,
-	AuLcDir_IIINFO,
-
-	AuLcSymlink_DIINFO,
-	AuLcSymlink_IIINFO,
-
-	AuLcKey_Last
-};
-extern struct lock_class_key au_lc_key[AuLcKey_Last];
-
-void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp);
-int au_seq_path(struct seq_file *seq, struct path *path);
-
-#ifdef CONFIG_PROC_FS
-/* procfs.c */
-int __init au_procfs_init(void);
-void au_procfs_fin(void);
-#else
-AuStubInt0(au_procfs_init, void);
-AuStubVoid(au_procfs_fin, void);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-/* kmem cache */
-enum {
-	AuCache_DINFO,
-	AuCache_ICNTNR,
-	AuCache_FINFO,
-	AuCache_VDIR,
-	AuCache_DEHSTR,
-	AuCache_HNOTIFY, /* must be last */
-	AuCache_Last
-};
-
-#define AuCacheFlags		(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD)
-#define AuCache(type)		KMEM_CACHE(type, AuCacheFlags)
-#define AuCacheCtor(type, ctor)	\
-	kmem_cache_create(#type, sizeof(struct type), \
-			  __alignof__(struct type), AuCacheFlags, ctor)
-
-extern struct kmem_cache *au_cachep[];
-
-#define AuCacheFuncs(name, index) \
-static inline struct au_##name *au_cache_alloc_##name(void) \
-{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \
-static inline void au_cache_free_##name(struct au_##name *p) \
-{ kmem_cache_free(au_cachep[AuCache_##index], p); }
-
-AuCacheFuncs(dinfo, DINFO);
-AuCacheFuncs(icntnr, ICNTNR);
-AuCacheFuncs(finfo, FINFO);
-AuCacheFuncs(vdir, VDIR);
-AuCacheFuncs(vdir_dehstr, DEHSTR);
-#ifdef CONFIG_AUFS_HNOTIFY
-AuCacheFuncs(hnotify, HNOTIFY);
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_MODULE_H__ */
diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c
deleted file mode 100644
index 25e3a80b9..000000000
--- a/fs/aufs/mvdown.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * Copyright (C) 2011-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * move-down, opposite of copy-up
- */
-
-#include "aufs.h"
-
-struct au_mvd_args {
-	struct {
-		struct super_block *h_sb;
-		struct dentry *h_parent;
-		struct au_hinode *hdir;
-		struct inode *h_dir, *h_inode;
-		struct au_pin pin;
-	} info[AUFS_MVDOWN_NARRAY];
-
-	struct aufs_mvdown mvdown;
-	struct dentry *dentry, *parent;
-	struct inode *inode, *dir;
-	struct super_block *sb;
-	aufs_bindex_t bopq, bwh, bfound;
-	unsigned char rename_lock;
-};
-
-#define mvd_errno		mvdown.au_errno
-#define mvd_bsrc		mvdown.stbr[AUFS_MVDOWN_UPPER].bindex
-#define mvd_src_brid		mvdown.stbr[AUFS_MVDOWN_UPPER].brid
-#define mvd_bdst		mvdown.stbr[AUFS_MVDOWN_LOWER].bindex
-#define mvd_dst_brid		mvdown.stbr[AUFS_MVDOWN_LOWER].brid
-
-#define mvd_h_src_sb		info[AUFS_MVDOWN_UPPER].h_sb
-#define mvd_h_src_parent	info[AUFS_MVDOWN_UPPER].h_parent
-#define mvd_hdir_src		info[AUFS_MVDOWN_UPPER].hdir
-#define mvd_h_src_dir		info[AUFS_MVDOWN_UPPER].h_dir
-#define mvd_h_src_inode		info[AUFS_MVDOWN_UPPER].h_inode
-#define mvd_pin_src		info[AUFS_MVDOWN_UPPER].pin
-
-#define mvd_h_dst_sb		info[AUFS_MVDOWN_LOWER].h_sb
-#define mvd_h_dst_parent	info[AUFS_MVDOWN_LOWER].h_parent
-#define mvd_hdir_dst		info[AUFS_MVDOWN_LOWER].hdir
-#define mvd_h_dst_dir		info[AUFS_MVDOWN_LOWER].h_dir
-#define mvd_h_dst_inode		info[AUFS_MVDOWN_LOWER].h_inode
-#define mvd_pin_dst		info[AUFS_MVDOWN_LOWER].pin
-
-#define AU_MVD_PR(flag, ...) do {			\
-		if (flag)				\
-			pr_err(__VA_ARGS__);		\
-	} while (0)
-
-static int find_lower_writable(struct au_mvd_args *a)
-{
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-
-	sb = a->sb;
-	bindex = a->mvd_bsrc;
-	bend = au_sbend(sb);
-	if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER)
-		for (bindex++; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (au_br_fhsm(br->br_perm)
-			    && (!(au_br_sb(br)->s_flags & MS_RDONLY)))
-				return bindex;
-		}
-	else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER))
-		for (bindex++; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (!au_br_rdonly(br))
-				return bindex;
-		}
-	else
-		for (bindex++; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (!(au_br_sb(br)->s_flags & MS_RDONLY)) {
-				if (au_br_rdonly(br))
-					a->mvdown.flags
-						|= AUFS_MVDOWN_ROLOWER_R;
-				return bindex;
-			}
-		}
-
-	return -1;
-}
-
-/* make the parent dir on bdst */
-static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-
-	err = 0;
-	a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc);
-	a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst);
-	a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc);
-	a->mvd_h_dst_parent = NULL;
-	if (au_dbend(a->parent) >= a->mvd_bdst)
-		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
-	if (!a->mvd_h_dst_parent) {
-		err = au_cpdown_dirs(a->dentry, a->mvd_bdst);
-		if (unlikely(err)) {
-			AU_MVD_PR(dmsg, "cpdown_dirs failed\n");
-			goto out;
-		}
-		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* lock them all */
-static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct dentry *h_trap;
-
-	a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc);
-	a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst);
-	err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst,
-		     au_opt_udba(a->sb),
-		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
-	AuTraceErr(err);
-	if (unlikely(err)) {
-		AU_MVD_PR(dmsg, "pin_dst failed\n");
-		goto out;
-	}
-
-	if (a->mvd_h_src_sb != a->mvd_h_dst_sb) {
-		a->rename_lock = 0;
-		au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
-			    AuLsc_DI_PARENT, AuLsc_I_PARENT3,
-			    au_opt_udba(a->sb),
-			    AuPin_MNT_WRITE | AuPin_DI_LOCKED);
-		err = au_do_pin(&a->mvd_pin_src);
-		AuTraceErr(err);
-		a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
-		if (unlikely(err)) {
-			AU_MVD_PR(dmsg, "pin_src failed\n");
-			goto out_dst;
-		}
-		goto out; /* success */
-	}
-
-	a->rename_lock = 1;
-	au_pin_hdir_unlock(&a->mvd_pin_dst);
-	err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
-		     au_opt_udba(a->sb),
-		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
-	AuTraceErr(err);
-	a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
-	if (unlikely(err)) {
-		AU_MVD_PR(dmsg, "pin_src failed\n");
-		au_pin_hdir_lock(&a->mvd_pin_dst);
-		goto out_dst;
-	}
-	au_pin_hdir_unlock(&a->mvd_pin_src);
-	h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
-				   a->mvd_h_dst_parent, a->mvd_hdir_dst);
-	if (h_trap) {
-		err = (h_trap != a->mvd_h_src_parent);
-		if (err)
-			err = (h_trap != a->mvd_h_dst_parent);
-	}
-	BUG_ON(err); /* it should never happen */
-	if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) {
-		err = -EBUSY;
-		AuTraceErr(err);
-		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
-				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
-		au_pin_hdir_lock(&a->mvd_pin_src);
-		au_unpin(&a->mvd_pin_src);
-		au_pin_hdir_lock(&a->mvd_pin_dst);
-		goto out_dst;
-	}
-	goto out; /* success */
-
-out_dst:
-	au_unpin(&a->mvd_pin_dst);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	if (!a->rename_lock)
-		au_unpin(&a->mvd_pin_src);
-	else {
-		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
-				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
-		au_pin_hdir_lock(&a->mvd_pin_src);
-		au_unpin(&a->mvd_pin_src);
-		au_pin_hdir_lock(&a->mvd_pin_dst);
-	}
-	au_unpin(&a->mvd_pin_dst);
-}
-
-/* copy-down the file */
-static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct au_cp_generic cpg = {
-		.dentry	= a->dentry,
-		.bdst	= a->mvd_bdst,
-		.bsrc	= a->mvd_bsrc,
-		.len	= -1,
-		.pin	= &a->mvd_pin_dst,
-		.flags	= AuCpup_DTIME | AuCpup_HOPEN
-	};
-
-	AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst);
-	if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
-		au_fset_cpup(cpg.flags, OVERWRITE);
-	if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER)
-		au_fset_cpup(cpg.flags, RWDST);
-	err = au_sio_cpdown_simple(&cpg);
-	if (unlikely(err))
-		AU_MVD_PR(dmsg, "cpdown failed\n");
-
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * unlink the whiteout on bdst if exist which may be created by UDBA while we
- * were sleeping
- */
-static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct path h_path;
-	struct au_branch *br;
-	struct inode *delegated;
-
-	br = au_sbr(a->sb, a->mvd_bdst);
-	h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry)) {
-		AU_MVD_PR(dmsg, "wh_lkup failed\n");
-		goto out;
-	}
-
-	err = 0;
-	if (d_is_positive(h_path.dentry)) {
-		h_path.mnt = au_br_mnt(br);
-		delegated = NULL;
-		err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path,
-				   &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-		if (unlikely(err))
-			AU_MVD_PR(dmsg, "wh_unlink failed\n");
-	}
-	dput(h_path.dentry);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * unlink the topmost h_dentry
- */
-static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct path h_path;
-	struct inode *delegated;
-
-	h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc);
-	h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc);
-	delegated = NULL;
-	err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	if (unlikely(err))
-		AU_MVD_PR(dmsg, "unlink failed\n");
-
-	AuTraceErr(err);
-	return err;
-}
-
-/* Since mvdown succeeded, we ignore an error of this function */
-static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct au_branch *br;
-
-	a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED;
-	br = au_sbr(a->sb, a->mvd_bsrc);
-	err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs);
-	if (!err) {
-		br = au_sbr(a->sb, a->mvd_bdst);
-		a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id;
-		err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs);
-	}
-	if (!err)
-		a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED;
-	else
-		AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err);
-}
-
-/*
- * copy-down the file and unlink the bsrc file.
- * - unlink the bdst whout if exist
- * - copy-down the file (with whtmp name and rename)
- * - unlink the bsrc file
- */
-static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-
-	err = au_do_mkdir(dmsg, a);
-	if (!err)
-		err = au_do_lock(dmsg, a);
-	if (unlikely(err))
-		goto out;
-
-	/*
-	 * do not revert the activities we made on bdst since they should be
-	 * harmless in aufs.
-	 */
-
-	err = au_do_cpdown(dmsg, a);
-	if (!err)
-		err = au_do_unlink_wh(dmsg, a);
-	if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER))
-		err = au_do_unlink(dmsg, a);
-	if (unlikely(err))
-		goto out_unlock;
-
-	AuDbg("%pd2, 0x%x, %d --> %d\n",
-	      a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst);
-	if (find_lower_writable(a) < 0)
-		a->mvdown.flags |= AUFS_MVDOWN_BOTTOM;
-
-	if (a->mvdown.flags & AUFS_MVDOWN_STFS)
-		au_do_stfs(dmsg, a);
-
-	/* maintain internal array */
-	if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) {
-		au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL);
-		au_set_dbstart(a->dentry, a->mvd_bdst);
-		au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0);
-		au_set_ibstart(a->inode, a->mvd_bdst);
-	}
-	if (au_dbend(a->dentry) < a->mvd_bdst)
-		au_set_dbend(a->dentry, a->mvd_bdst);
-	if (au_ibend(a->inode) < a->mvd_bdst)
-		au_set_ibend(a->inode, a->mvd_bdst);
-
-out_unlock:
-	au_do_unlock(dmsg, a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* make sure the file is idle */
-static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err, plinked;
-
-	err = 0;
-	plinked = !!au_opt_test(au_mntflags(a->sb), PLINK);
-	if (au_dbstart(a->dentry) == a->mvd_bsrc
-	    && au_dcount(a->dentry) == 1
-	    && atomic_read(&a->inode->i_count) == 1
-	    /* && a->mvd_h_src_inode->i_nlink == 1 */
-	    && (!plinked || !au_plink_test(a->inode))
-	    && a->inode->i_nlink == 1)
-		goto out;
-
-	err = -EBUSY;
-	AU_MVD_PR(dmsg,
-		  "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n",
-		  a->mvd_bsrc, au_dbstart(a->dentry), au_dcount(a->dentry),
-		  atomic_read(&a->inode->i_count), a->inode->i_nlink,
-		  a->mvd_h_src_inode->i_nlink,
-		  plinked, plinked ? au_plink_test(a->inode) : 0);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* make sure the parent dir is fine */
-static int au_mvd_args_parent(const unsigned char dmsg,
-			      struct au_mvd_args *a)
-{
-	int err;
-	aufs_bindex_t bindex;
-
-	err = 0;
-	if (unlikely(au_alive_dir(a->parent))) {
-		err = -ENOENT;
-		AU_MVD_PR(dmsg, "parent dir is dead\n");
-		goto out;
-	}
-
-	a->bopq = au_dbdiropq(a->parent);
-	bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst);
-	AuDbg("b%d\n", bindex);
-	if (unlikely((bindex >= 0 && bindex < a->mvd_bdst)
-		     || (a->bopq != -1 && a->bopq < a->mvd_bdst))) {
-		err = -EINVAL;
-		a->mvd_errno = EAU_MVDOWN_OPAQUE;
-		AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n",
-			  a->bopq, a->mvd_bdst);
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_mvd_args_intermediate(const unsigned char dmsg,
-				    struct au_mvd_args *a)
-{
-	int err;
-	struct au_dinfo *dinfo, *tmp;
-
-	/* lookup the next lower positive entry */
-	err = -ENOMEM;
-	tmp = au_di_alloc(a->sb, AuLsc_DI_TMP);
-	if (unlikely(!tmp))
-		goto out;
-
-	a->bfound = -1;
-	a->bwh = -1;
-	dinfo = au_di(a->dentry);
-	au_di_cp(tmp, dinfo);
-	au_di_swap(tmp, dinfo);
-
-	/* returns the number of positive dentries */
-	err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0);
-	if (!err)
-		a->bwh = au_dbwh(a->dentry);
-	else if (err > 0)
-		a->bfound = au_dbstart(a->dentry);
-
-	au_di_swap(tmp, dinfo);
-	au_rw_write_unlock(&tmp->di_rwsem);
-	au_di_free(tmp);
-	if (unlikely(err < 0))
-		AU_MVD_PR(dmsg, "failed look-up lower\n");
-
-	/*
-	 * here, we have these cases.
-	 * bfound == -1
-	 *	no positive dentry under bsrc. there are more sub-cases.
-	 *	bwh < 0
-	 *		there no whiteout, we can safely move-down.
-	 *	bwh <= bsrc
-	 *		impossible
-	 *	bsrc < bwh && bwh < bdst
-	 *		there is a whiteout on RO branch. cannot proceed.
-	 *	bwh == bdst
-	 *		there is a whiteout on the RW target branch. it should
-	 *		be removed.
-	 *	bdst < bwh
-	 *		there is a whiteout somewhere unrelated branch.
-	 * -1 < bfound && bfound <= bsrc
-	 *	impossible.
-	 * bfound < bdst
-	 *	found, but it is on RO branch between bsrc and bdst. cannot
-	 *	proceed.
-	 * bfound == bdst
-	 *	found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return
-	 *	error.
-	 * bdst < bfound
-	 *	found, after we create the file on bdst, it will be hidden.
-	 */
-
-	AuDebugOn(a->bfound == -1
-		  && a->bwh != -1
-		  && a->bwh <= a->mvd_bsrc);
-	AuDebugOn(-1 < a->bfound
-		  && a->bfound <= a->mvd_bsrc);
-
-	err = -EINVAL;
-	if (a->bfound == -1
-	    && a->mvd_bsrc < a->bwh
-	    && a->bwh != -1
-	    && a->bwh < a->mvd_bdst) {
-		a->mvd_errno = EAU_MVDOWN_WHITEOUT;
-		AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n",
-			  a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh);
-		goto out;
-	} else if (a->bfound != -1 && a->bfound < a->mvd_bdst) {
-		a->mvd_errno = EAU_MVDOWN_UPPER;
-		AU_MVD_PR(dmsg, "bdst %d, bfound %d\n",
-			  a->mvd_bdst, a->bfound);
-		goto out;
-	}
-
-	err = 0; /* success */
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-
-	err = 0;
-	if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
-	    && a->bfound == a->mvd_bdst)
-		err = -EEXIST;
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct au_branch *br;
-
-	err = -EISDIR;
-	if (unlikely(S_ISDIR(a->inode->i_mode)))
-		goto out;
-
-	err = -EINVAL;
-	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER))
-		a->mvd_bsrc = au_ibstart(a->inode);
-	else {
-		a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid);
-		if (unlikely(a->mvd_bsrc < 0
-			     || (a->mvd_bsrc < au_dbstart(a->dentry)
-				 || au_dbend(a->dentry) < a->mvd_bsrc
-				 || !au_h_dptr(a->dentry, a->mvd_bsrc))
-			     || (a->mvd_bsrc < au_ibstart(a->inode)
-				 || au_ibend(a->inode) < a->mvd_bsrc
-				 || !au_h_iptr(a->inode, a->mvd_bsrc)))) {
-			a->mvd_errno = EAU_MVDOWN_NOUPPER;
-			AU_MVD_PR(dmsg, "no upper\n");
-			goto out;
-		}
-	}
-	if (unlikely(a->mvd_bsrc == au_sbend(a->sb))) {
-		a->mvd_errno = EAU_MVDOWN_BOTTOM;
-		AU_MVD_PR(dmsg, "on the bottom\n");
-		goto out;
-	}
-	a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc);
-	br = au_sbr(a->sb, a->mvd_bsrc);
-	err = au_br_rdonly(br);
-	if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) {
-		if (unlikely(err))
-			goto out;
-	} else if (!(vfsub_native_ro(a->mvd_h_src_inode)
-		     || IS_APPEND(a->mvd_h_src_inode))) {
-		if (err)
-			a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R;
-		/* go on */
-	} else
-		goto out;
-
-	err = -EINVAL;
-	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) {
-		a->mvd_bdst = find_lower_writable(a);
-		if (unlikely(a->mvd_bdst < 0)) {
-			a->mvd_errno = EAU_MVDOWN_BOTTOM;
-			AU_MVD_PR(dmsg, "no writable lower branch\n");
-			goto out;
-		}
-	} else {
-		a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid);
-		if (unlikely(a->mvd_bdst < 0
-			     || au_sbend(a->sb) < a->mvd_bdst)) {
-			a->mvd_errno = EAU_MVDOWN_NOLOWERBR;
-			AU_MVD_PR(dmsg, "no lower brid\n");
-			goto out;
-		}
-	}
-
-	err = au_mvd_args_busy(dmsg, a);
-	if (!err)
-		err = au_mvd_args_parent(dmsg, a);
-	if (!err)
-		err = au_mvd_args_intermediate(dmsg, a);
-	if (!err)
-		err = au_mvd_args_exist(dmsg, a);
-	if (!err)
-		AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg)
-{
-	int err, e;
-	unsigned char dmsg;
-	struct au_mvd_args *args;
-
-	err = -EPERM;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = -ENOMEM;
-	args = kmalloc(sizeof(*args), GFP_NOFS);
-	if (unlikely(!args))
-		goto out;
-
-	err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown));
-	if (!err)
-		err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out_free;
-	}
-	AuDbg("flags 0x%x\n", args->mvdown.flags);
-	args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R);
-	args->mvdown.au_errno = 0;
-	args->dentry = dentry;
-	args->inode = d_inode(dentry);
-	args->sb = dentry->d_sb;
-
-	err = -ENOENT;
-	dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG);
-	args->parent = dget_parent(dentry);
-	args->dir = d_inode(args->parent);
-	mutex_lock_nested(&args->dir->i_mutex, I_MUTEX_PARENT);
-	dput(args->parent);
-	if (unlikely(args->parent != dentry->d_parent)) {
-		AU_MVD_PR(dmsg, "parent dir is moved\n");
-		goto out_dir;
-	}
-
-	mutex_lock_nested(&args->inode->i_mutex, I_MUTEX_CHILD);
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH);
-	if (unlikely(err))
-		goto out_inode;
-
-	di_write_lock_parent(args->parent);
-	err = au_mvd_args(dmsg, args);
-	if (unlikely(err))
-		goto out_parent;
-
-	err = au_do_mvdown(dmsg, args);
-	if (unlikely(err))
-		goto out_parent;
-
-	au_cpup_attr_timesizes(args->dir);
-	au_cpup_attr_timesizes(args->inode);
-	au_cpup_igen(args->inode, au_h_iptr(args->inode, args->mvd_bdst));
-	/* au_digen_dec(dentry); */
-
-out_parent:
-	di_write_unlock(args->parent);
-	aufs_read_unlock(dentry, AuLock_DW);
-out_inode:
-	mutex_unlock(&args->inode->i_mutex);
-out_dir:
-	mutex_unlock(&args->dir->i_mutex);
-out_free:
-	e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown));
-	if (unlikely(e))
-		err = -EFAULT;
-	kfree(args);
-out:
-	AuTraceErr(err);
-	return err;
-}
diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c
deleted file mode 100644
index 4e8e2e1ff..000000000
--- a/fs/aufs/opts.c
+++ /dev/null
@@ -1,1835 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * mount options/flags
- */
-
-#include <linux/namei.h>
-#include <linux/types.h> /* a distribution requires */
-#include <linux/parser.h>
-#include "aufs.h"
-
-/* ---------------------------------------------------------------------- */
-
-enum {
-	Opt_br,
-	Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend,
-	Opt_idel, Opt_imod,
-	Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash,
-	Opt_rdblk_def, Opt_rdhash_def,
-	Opt_xino, Opt_noxino,
-	Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino,
-	Opt_trunc_xino_path, Opt_itrunc_xino,
-	Opt_trunc_xib, Opt_notrunc_xib,
-	Opt_shwh, Opt_noshwh,
-	Opt_plink, Opt_noplink, Opt_list_plink,
-	Opt_udba,
-	Opt_dio, Opt_nodio,
-	Opt_diropq_a, Opt_diropq_w,
-	Opt_warn_perm, Opt_nowarn_perm,
-	Opt_wbr_copyup, Opt_wbr_create,
-	Opt_fhsm_sec,
-	Opt_verbose, Opt_noverbose,
-	Opt_sum, Opt_nosum, Opt_wsum,
-	Opt_dirperm1, Opt_nodirperm1,
-	Opt_acl, Opt_noacl,
-	Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err
-};
-
-static match_table_t options = {
-	{Opt_br, "br=%s"},
-	{Opt_br, "br:%s"},
-
-	{Opt_add, "add=%d:%s"},
-	{Opt_add, "add:%d:%s"},
-	{Opt_add, "ins=%d:%s"},
-	{Opt_add, "ins:%d:%s"},
-	{Opt_append, "append=%s"},
-	{Opt_append, "append:%s"},
-	{Opt_prepend, "prepend=%s"},
-	{Opt_prepend, "prepend:%s"},
-
-	{Opt_del, "del=%s"},
-	{Opt_del, "del:%s"},
-	/* {Opt_idel, "idel:%d"}, */
-	{Opt_mod, "mod=%s"},
-	{Opt_mod, "mod:%s"},
-	/* {Opt_imod, "imod:%d:%s"}, */
-
-	{Opt_dirwh, "dirwh=%d"},
-
-	{Opt_xino, "xino=%s"},
-	{Opt_noxino, "noxino"},
-	{Opt_trunc_xino, "trunc_xino"},
-	{Opt_trunc_xino_v, "trunc_xino_v=%d:%d"},
-	{Opt_notrunc_xino, "notrunc_xino"},
-	{Opt_trunc_xino_path, "trunc_xino=%s"},
-	{Opt_itrunc_xino, "itrunc_xino=%d"},
-	/* {Opt_zxino, "zxino=%s"}, */
-	{Opt_trunc_xib, "trunc_xib"},
-	{Opt_notrunc_xib, "notrunc_xib"},
-
-#ifdef CONFIG_PROC_FS
-	{Opt_plink, "plink"},
-#else
-	{Opt_ignore_silent, "plink"},
-#endif
-
-	{Opt_noplink, "noplink"},
-
-#ifdef CONFIG_AUFS_DEBUG
-	{Opt_list_plink, "list_plink"},
-#endif
-
-	{Opt_udba, "udba=%s"},
-
-	{Opt_dio, "dio"},
-	{Opt_nodio, "nodio"},
-
-#ifdef CONFIG_AUFS_FHSM
-	{Opt_fhsm_sec, "fhsm_sec=%d"},
-#else
-	{Opt_ignore_silent, "fhsm_sec=%d"},
-#endif
-
-	{Opt_diropq_a, "diropq=always"},
-	{Opt_diropq_a, "diropq=a"},
-	{Opt_diropq_w, "diropq=whiteouted"},
-	{Opt_diropq_w, "diropq=w"},
-
-	{Opt_warn_perm, "warn_perm"},
-	{Opt_nowarn_perm, "nowarn_perm"},
-
-	/* keep them temporary */
-	{Opt_ignore_silent, "nodlgt"},
-	{Opt_ignore_silent, "clean_plink"},
-
-#ifdef CONFIG_AUFS_SHWH
-	{Opt_shwh, "shwh"},
-#endif
-	{Opt_noshwh, "noshwh"},
-
-	{Opt_dirperm1, "dirperm1"},
-	{Opt_nodirperm1, "nodirperm1"},
-
-	{Opt_verbose, "verbose"},
-	{Opt_verbose, "v"},
-	{Opt_noverbose, "noverbose"},
-	{Opt_noverbose, "quiet"},
-	{Opt_noverbose, "q"},
-	{Opt_noverbose, "silent"},
-
-	{Opt_sum, "sum"},
-	{Opt_nosum, "nosum"},
-	{Opt_wsum, "wsum"},
-
-	{Opt_rdcache, "rdcache=%d"},
-	{Opt_rdblk, "rdblk=%d"},
-	{Opt_rdblk_def, "rdblk=def"},
-	{Opt_rdhash, "rdhash=%d"},
-	{Opt_rdhash_def, "rdhash=def"},
-
-	{Opt_wbr_create, "create=%s"},
-	{Opt_wbr_create, "create_policy=%s"},
-	{Opt_wbr_copyup, "cpup=%s"},
-	{Opt_wbr_copyup, "copyup=%s"},
-	{Opt_wbr_copyup, "copyup_policy=%s"},
-
-	/* generic VFS flag */
-#ifdef CONFIG_FS_POSIX_ACL
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-#else
-	{Opt_ignore_silent, "acl"},
-	{Opt_ignore_silent, "noacl"},
-#endif
-
-	/* internal use for the scripts */
-	{Opt_ignore_silent, "si=%s"},
-
-	{Opt_br, "dirs=%s"},
-	{Opt_ignore, "debug=%d"},
-	{Opt_ignore, "delete=whiteout"},
-	{Opt_ignore, "delete=all"},
-	{Opt_ignore, "imap=%s"},
-
-	/* temporary workaround, due to old mount(8)? */
-	{Opt_ignore_silent, "relatime"},
-
-	{Opt_err, NULL}
-};
-
-/* ---------------------------------------------------------------------- */
-
-static const char *au_parser_pattern(int val, match_table_t tbl)
-{
-	struct match_token *p;
-
-	p = tbl;
-	while (p->pattern) {
-		if (p->token == val)
-			return p->pattern;
-		p++;
-	}
-	BUG();
-	return "??";
-}
-
-static const char *au_optstr(int *val, match_table_t tbl)
-{
-	struct match_token *p;
-	int v;
-
-	v = *val;
-	if (!v)
-		goto out;
-	p = tbl;
-	while (p->pattern) {
-		if (p->token
-		    && (v & p->token) == p->token) {
-			*val &= ~p->token;
-			return p->pattern;
-		}
-		p++;
-	}
-
-out:
-	return NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static match_table_t brperm = {
-	{AuBrPerm_RO, AUFS_BRPERM_RO},
-	{AuBrPerm_RR, AUFS_BRPERM_RR},
-	{AuBrPerm_RW, AUFS_BRPERM_RW},
-	{0, NULL}
-};
-
-static match_table_t brattr = {
-	/* general */
-	{AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG},
-	{AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL},
-	/* 'unpin' attrib is meaningless since linux-3.18-rc1 */
-	{AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN},
-#ifdef CONFIG_AUFS_FHSM
-	{AuBrAttr_FHSM, AUFS_BRATTR_FHSM},
-#endif
-#ifdef CONFIG_AUFS_XATTR
-	{AuBrAttr_ICEX, AUFS_BRATTR_ICEX},
-	{AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC},
-	{AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS},
-	{AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR},
-	{AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR},
-	{AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH},
-#endif
-
-	/* ro/rr branch */
-	{AuBrRAttr_WH, AUFS_BRRATTR_WH},
-
-	/* rw branch */
-	{AuBrWAttr_MOO, AUFS_BRWATTR_MOO},
-	{AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH},
-
-	{0, NULL}
-};
-
-static int br_attr_val(char *str, match_table_t table, substring_t args[])
-{
-	int attr, v;
-	char *p;
-
-	attr = 0;
-	do {
-		p = strchr(str, '+');
-		if (p)
-			*p = 0;
-		v = match_token(str, table, args);
-		if (v) {
-			if (v & AuBrAttr_CMOO_Mask)
-				attr &= ~AuBrAttr_CMOO_Mask;
-			attr |= v;
-		} else {
-			if (p)
-				*p = '+';
-			pr_warn("ignored branch attribute %s\n", str);
-			break;
-		}
-		if (p)
-			str = p + 1;
-	} while (p);
-
-	return attr;
-}
-
-static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm)
-{
-	int sz;
-	const char *p;
-	char *q;
-
-	q = str->a;
-	*q = 0;
-	p = au_optstr(&perm, brattr);
-	if (p) {
-		sz = strlen(p);
-		memcpy(q, p, sz + 1);
-		q += sz;
-	} else
-		goto out;
-
-	do {
-		p = au_optstr(&perm, brattr);
-		if (p) {
-			*q++ = '+';
-			sz = strlen(p);
-			memcpy(q, p, sz + 1);
-			q += sz;
-		}
-	} while (p);
-
-out:
-	return q - str->a;
-}
-
-static int noinline_for_stack br_perm_val(char *perm)
-{
-	int val, bad, sz;
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	au_br_perm_str_t attr;
-
-	p = strchr(perm, '+');
-	if (p)
-		*p = 0;
-	val = match_token(perm, brperm, args);
-	if (!val) {
-		if (p)
-			*p = '+';
-		pr_warn("ignored branch permission %s\n", perm);
-		val = AuBrPerm_RO;
-		goto out;
-	}
-	if (!p)
-		goto out;
-
-	val |= br_attr_val(p + 1, brattr, args);
-
-	bad = 0;
-	switch (val & AuBrPerm_Mask) {
-	case AuBrPerm_RO:
-	case AuBrPerm_RR:
-		bad = val & AuBrWAttr_Mask;
-		val &= ~AuBrWAttr_Mask;
-		break;
-	case AuBrPerm_RW:
-		bad = val & AuBrRAttr_Mask;
-		val &= ~AuBrRAttr_Mask;
-		break;
-	}
-
-	/*
-	 * 'unpin' attrib becomes meaningless since linux-3.18-rc1, but aufs
-	 * does not treat it as an error, just warning.
-	 * this is a tiny guard for the user operation.
-	 */
-	if (val & AuBrAttr_UNPIN) {
-		bad |= AuBrAttr_UNPIN;
-		val &= ~AuBrAttr_UNPIN;
-	}
-
-	if (unlikely(bad)) {
-		sz = au_do_optstr_br_attr(&attr, bad);
-		AuDebugOn(!sz);
-		pr_warn("ignored branch attribute %s\n", attr.a);
-	}
-
-out:
-	return val;
-}
-
-void au_optstr_br_perm(au_br_perm_str_t *str, int perm)
-{
-	au_br_perm_str_t attr;
-	const char *p;
-	char *q;
-	int sz;
-
-	q = str->a;
-	p = au_optstr(&perm, brperm);
-	AuDebugOn(!p || !*p);
-	sz = strlen(p);
-	memcpy(q, p, sz + 1);
-	q += sz;
-
-	sz = au_do_optstr_br_attr(&attr, perm);
-	if (sz) {
-		*q++ = '+';
-		memcpy(q, attr.a, sz + 1);
-	}
-
-	AuDebugOn(strlen(str->a) >= sizeof(str->a));
-}
-
-/* ---------------------------------------------------------------------- */
-
-static match_table_t udbalevel = {
-	{AuOpt_UDBA_REVAL, "reval"},
-	{AuOpt_UDBA_NONE, "none"},
-#ifdef CONFIG_AUFS_HNOTIFY
-	{AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */
-#ifdef CONFIG_AUFS_HFSNOTIFY
-	{AuOpt_UDBA_HNOTIFY, "fsnotify"},
-#endif
-#endif
-	{-1, NULL}
-};
-
-static int noinline_for_stack udba_val(char *str)
-{
-	substring_t args[MAX_OPT_ARGS];
-
-	return match_token(str, udbalevel, args);
-}
-
-const char *au_optstr_udba(int udba)
-{
-	return au_parser_pattern(udba, udbalevel);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static match_table_t au_wbr_create_policy = {
-	{AuWbrCreate_TDP, "tdp"},
-	{AuWbrCreate_TDP, "top-down-parent"},
-	{AuWbrCreate_RR, "rr"},
-	{AuWbrCreate_RR, "round-robin"},
-	{AuWbrCreate_MFS, "mfs"},
-	{AuWbrCreate_MFS, "most-free-space"},
-	{AuWbrCreate_MFSV, "mfs:%d"},
-	{AuWbrCreate_MFSV, "most-free-space:%d"},
-
-	{AuWbrCreate_MFSRR, "mfsrr:%d"},
-	{AuWbrCreate_MFSRRV, "mfsrr:%d:%d"},
-	{AuWbrCreate_PMFS, "pmfs"},
-	{AuWbrCreate_PMFSV, "pmfs:%d"},
-	{AuWbrCreate_PMFSRR, "pmfsrr:%d"},
-	{AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"},
-
-	{-1, NULL}
-};
-
-/*
- * cf. linux/lib/parser.c and cmdline.c
- * gave up calling memparse() since it uses simple_strtoull() instead of
- * kstrto...().
- */
-static int noinline_for_stack
-au_match_ull(substring_t *s, unsigned long long *result)
-{
-	int err;
-	unsigned int len;
-	char a[32];
-
-	err = -ERANGE;
-	len = s->to - s->from;
-	if (len + 1 <= sizeof(a)) {
-		memcpy(a, s->from, len);
-		a[len] = '\0';
-		err = kstrtoull(a, 0, result);
-	}
-	return err;
-}
-
-static int au_wbr_mfs_wmark(substring_t *arg, char *str,
-			    struct au_opt_wbr_create *create)
-{
-	int err;
-	unsigned long long ull;
-
-	err = 0;
-	if (!au_match_ull(arg, &ull))
-		create->mfsrr_watermark = ull;
-	else {
-		pr_err("bad integer in %s\n", str);
-		err = -EINVAL;
-	}
-
-	return err;
-}
-
-static int au_wbr_mfs_sec(substring_t *arg, char *str,
-			  struct au_opt_wbr_create *create)
-{
-	int n, err;
-
-	err = 0;
-	if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC)
-		create->mfs_second = n;
-	else {
-		pr_err("bad integer in %s\n", str);
-		err = -EINVAL;
-	}
-
-	return err;
-}
-
-static int noinline_for_stack
-au_wbr_create_val(char *str, struct au_opt_wbr_create *create)
-{
-	int err, e;
-	substring_t args[MAX_OPT_ARGS];
-
-	err = match_token(str, au_wbr_create_policy, args);
-	create->wbr_create = err;
-	switch (err) {
-	case AuWbrCreate_MFSRRV:
-	case AuWbrCreate_PMFSRRV:
-		e = au_wbr_mfs_wmark(&args[0], str, create);
-		if (!e)
-			e = au_wbr_mfs_sec(&args[1], str, create);
-		if (unlikely(e))
-			err = e;
-		break;
-	case AuWbrCreate_MFSRR:
-	case AuWbrCreate_PMFSRR:
-		e = au_wbr_mfs_wmark(&args[0], str, create);
-		if (unlikely(e)) {
-			err = e;
-			break;
-		}
-		/*FALLTHROUGH*/
-	case AuWbrCreate_MFS:
-	case AuWbrCreate_PMFS:
-		create->mfs_second = AUFS_MFS_DEF_SEC;
-		break;
-	case AuWbrCreate_MFSV:
-	case AuWbrCreate_PMFSV:
-		e = au_wbr_mfs_sec(&args[0], str, create);
-		if (unlikely(e))
-			err = e;
-		break;
-	}
-
-	return err;
-}
-
-const char *au_optstr_wbr_create(int wbr_create)
-{
-	return au_parser_pattern(wbr_create, au_wbr_create_policy);
-}
-
-static match_table_t au_wbr_copyup_policy = {
-	{AuWbrCopyup_TDP, "tdp"},
-	{AuWbrCopyup_TDP, "top-down-parent"},
-	{AuWbrCopyup_BUP, "bup"},
-	{AuWbrCopyup_BUP, "bottom-up-parent"},
-	{AuWbrCopyup_BU, "bu"},
-	{AuWbrCopyup_BU, "bottom-up"},
-	{-1, NULL}
-};
-
-static int noinline_for_stack au_wbr_copyup_val(char *str)
-{
-	substring_t args[MAX_OPT_ARGS];
-
-	return match_token(str, au_wbr_copyup_policy, args);
-}
-
-const char *au_optstr_wbr_copyup(int wbr_copyup)
-{
-	return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-
-static void dump_opts(struct au_opts *opts)
-{
-#ifdef CONFIG_AUFS_DEBUG
-	/* reduce stack space */
-	union {
-		struct au_opt_add *add;
-		struct au_opt_del *del;
-		struct au_opt_mod *mod;
-		struct au_opt_xino *xino;
-		struct au_opt_xino_itrunc *xino_itrunc;
-		struct au_opt_wbr_create *create;
-	} u;
-	struct au_opt *opt;
-
-	opt = opts->opt;
-	while (opt->type != Opt_tail) {
-		switch (opt->type) {
-		case Opt_add:
-			u.add = &opt->add;
-			AuDbg("add {b%d, %s, 0x%x, %p}\n",
-				  u.add->bindex, u.add->pathname, u.add->perm,
-				  u.add->path.dentry);
-			break;
-		case Opt_del:
-		case Opt_idel:
-			u.del = &opt->del;
-			AuDbg("del {%s, %p}\n",
-			      u.del->pathname, u.del->h_path.dentry);
-			break;
-		case Opt_mod:
-		case Opt_imod:
-			u.mod = &opt->mod;
-			AuDbg("mod {%s, 0x%x, %p}\n",
-				  u.mod->path, u.mod->perm, u.mod->h_root);
-			break;
-		case Opt_append:
-			u.add = &opt->add;
-			AuDbg("append {b%d, %s, 0x%x, %p}\n",
-				  u.add->bindex, u.add->pathname, u.add->perm,
-				  u.add->path.dentry);
-			break;
-		case Opt_prepend:
-			u.add = &opt->add;
-			AuDbg("prepend {b%d, %s, 0x%x, %p}\n",
-				  u.add->bindex, u.add->pathname, u.add->perm,
-				  u.add->path.dentry);
-			break;
-		case Opt_dirwh:
-			AuDbg("dirwh %d\n", opt->dirwh);
-			break;
-		case Opt_rdcache:
-			AuDbg("rdcache %d\n", opt->rdcache);
-			break;
-		case Opt_rdblk:
-			AuDbg("rdblk %u\n", opt->rdblk);
-			break;
-		case Opt_rdblk_def:
-			AuDbg("rdblk_def\n");
-			break;
-		case Opt_rdhash:
-			AuDbg("rdhash %u\n", opt->rdhash);
-			break;
-		case Opt_rdhash_def:
-			AuDbg("rdhash_def\n");
-			break;
-		case Opt_xino:
-			u.xino = &opt->xino;
-			AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file);
-			break;
-		case Opt_trunc_xino:
-			AuLabel(trunc_xino);
-			break;
-		case Opt_notrunc_xino:
-			AuLabel(notrunc_xino);
-			break;
-		case Opt_trunc_xino_path:
-		case Opt_itrunc_xino:
-			u.xino_itrunc = &opt->xino_itrunc;
-			AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex);
-			break;
-		case Opt_noxino:
-			AuLabel(noxino);
-			break;
-		case Opt_trunc_xib:
-			AuLabel(trunc_xib);
-			break;
-		case Opt_notrunc_xib:
-			AuLabel(notrunc_xib);
-			break;
-		case Opt_shwh:
-			AuLabel(shwh);
-			break;
-		case Opt_noshwh:
-			AuLabel(noshwh);
-			break;
-		case Opt_dirperm1:
-			AuLabel(dirperm1);
-			break;
-		case Opt_nodirperm1:
-			AuLabel(nodirperm1);
-			break;
-		case Opt_plink:
-			AuLabel(plink);
-			break;
-		case Opt_noplink:
-			AuLabel(noplink);
-			break;
-		case Opt_list_plink:
-			AuLabel(list_plink);
-			break;
-		case Opt_udba:
-			AuDbg("udba %d, %s\n",
-				  opt->udba, au_optstr_udba(opt->udba));
-			break;
-		case Opt_dio:
-			AuLabel(dio);
-			break;
-		case Opt_nodio:
-			AuLabel(nodio);
-			break;
-		case Opt_diropq_a:
-			AuLabel(diropq_a);
-			break;
-		case Opt_diropq_w:
-			AuLabel(diropq_w);
-			break;
-		case Opt_warn_perm:
-			AuLabel(warn_perm);
-			break;
-		case Opt_nowarn_perm:
-			AuLabel(nowarn_perm);
-			break;
-		case Opt_verbose:
-			AuLabel(verbose);
-			break;
-		case Opt_noverbose:
-			AuLabel(noverbose);
-			break;
-		case Opt_sum:
-			AuLabel(sum);
-			break;
-		case Opt_nosum:
-			AuLabel(nosum);
-			break;
-		case Opt_wsum:
-			AuLabel(wsum);
-			break;
-		case Opt_wbr_create:
-			u.create = &opt->wbr_create;
-			AuDbg("create %d, %s\n", u.create->wbr_create,
-				  au_optstr_wbr_create(u.create->wbr_create));
-			switch (u.create->wbr_create) {
-			case AuWbrCreate_MFSV:
-			case AuWbrCreate_PMFSV:
-				AuDbg("%d sec\n", u.create->mfs_second);
-				break;
-			case AuWbrCreate_MFSRR:
-				AuDbg("%llu watermark\n",
-					  u.create->mfsrr_watermark);
-				break;
-			case AuWbrCreate_MFSRRV:
-			case AuWbrCreate_PMFSRRV:
-				AuDbg("%llu watermark, %d sec\n",
-					  u.create->mfsrr_watermark,
-					  u.create->mfs_second);
-				break;
-			}
-			break;
-		case Opt_wbr_copyup:
-			AuDbg("copyup %d, %s\n", opt->wbr_copyup,
-				  au_optstr_wbr_copyup(opt->wbr_copyup));
-			break;
-		case Opt_fhsm_sec:
-			AuDbg("fhsm_sec %u\n", opt->fhsm_second);
-			break;
-		case Opt_acl:
-			AuLabel(acl);
-			break;
-		case Opt_noacl:
-			AuLabel(noacl);
-			break;
-		default:
-			BUG();
-		}
-		opt++;
-	}
-#endif
-}
-
-void au_opts_free(struct au_opts *opts)
-{
-	struct au_opt *opt;
-
-	opt = opts->opt;
-	while (opt->type != Opt_tail) {
-		switch (opt->type) {
-		case Opt_add:
-		case Opt_append:
-		case Opt_prepend:
-			path_put(&opt->add.path);
-			break;
-		case Opt_del:
-		case Opt_idel:
-			path_put(&opt->del.h_path);
-			break;
-		case Opt_mod:
-		case Opt_imod:
-			dput(opt->mod.h_root);
-			break;
-		case Opt_xino:
-			fput(opt->xino.file);
-			break;
-		}
-		opt++;
-	}
-}
-
-static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags,
-		   aufs_bindex_t bindex)
-{
-	int err;
-	struct au_opt_add *add = &opt->add;
-	char *p;
-
-	add->bindex = bindex;
-	add->perm = AuBrPerm_RO;
-	add->pathname = opt_str;
-	p = strchr(opt_str, '=');
-	if (p) {
-		*p++ = 0;
-		if (*p)
-			add->perm = br_perm_val(p);
-	}
-
-	err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path);
-	if (!err) {
-		if (!p) {
-			add->perm = AuBrPerm_RO;
-			if (au_test_fs_rr(add->path.dentry->d_sb))
-				add->perm = AuBrPerm_RR;
-			else if (!bindex && !(sb_flags & MS_RDONLY))
-				add->perm = AuBrPerm_RW;
-		}
-		opt->type = Opt_add;
-		goto out;
-	}
-	pr_err("lookup failed %s (%d)\n", add->pathname, err);
-	err = -EINVAL;
-
-out:
-	return err;
-}
-
-static int au_opts_parse_del(struct au_opt_del *del, substring_t args[])
-{
-	int err;
-
-	del->pathname = args[0].from;
-	AuDbg("del path %s\n", del->pathname);
-
-	err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path);
-	if (unlikely(err))
-		pr_err("lookup failed %s (%d)\n", del->pathname, err);
-
-	return err;
-}
-
-#if 0 /* reserved for future use */
-static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex,
-			      struct au_opt_del *del, substring_t args[])
-{
-	int err;
-	struct dentry *root;
-
-	err = -EINVAL;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_FLUSH);
-	if (bindex < 0 || au_sbend(sb) < bindex) {
-		pr_err("out of bounds, %d\n", bindex);
-		goto out;
-	}
-
-	err = 0;
-	del->h_path.dentry = dget(au_h_dptr(root, bindex));
-	del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex));
-
-out:
-	aufs_read_unlock(root, !AuLock_IR);
-	return err;
-}
-#endif
-
-static int noinline_for_stack
-au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[])
-{
-	int err;
-	struct path path;
-	char *p;
-
-	err = -EINVAL;
-	mod->path = args[0].from;
-	p = strchr(mod->path, '=');
-	if (unlikely(!p)) {
-		pr_err("no permssion %s\n", args[0].from);
-		goto out;
-	}
-
-	*p++ = 0;
-	err = vfsub_kern_path(mod->path, lkup_dirflags, &path);
-	if (unlikely(err)) {
-		pr_err("lookup failed %s (%d)\n", mod->path, err);
-		goto out;
-	}
-
-	mod->perm = br_perm_val(p);
-	AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p);
-	mod->h_root = dget(path.dentry);
-	path_put(&path);
-
-out:
-	return err;
-}
-
-#if 0 /* reserved for future use */
-static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex,
-			      struct au_opt_mod *mod, substring_t args[])
-{
-	int err;
-	struct dentry *root;
-
-	err = -EINVAL;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_FLUSH);
-	if (bindex < 0 || au_sbend(sb) < bindex) {
-		pr_err("out of bounds, %d\n", bindex);
-		goto out;
-	}
-
-	err = 0;
-	mod->perm = br_perm_val(args[1].from);
-	AuDbg("mod path %s, perm 0x%x, %s\n",
-	      mod->path, mod->perm, args[1].from);
-	mod->h_root = dget(au_h_dptr(root, bindex));
-
-out:
-	aufs_read_unlock(root, !AuLock_IR);
-	return err;
-}
-#endif
-
-static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino,
-			      substring_t args[])
-{
-	int err;
-	struct file *file;
-
-	file = au_xino_create(sb, args[0].from, /*silent*/0);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-
-	err = -EINVAL;
-	if (unlikely(file->f_path.dentry->d_sb == sb)) {
-		fput(file);
-		pr_err("%s must be outside\n", args[0].from);
-		goto out;
-	}
-
-	err = 0;
-	xino->file = file;
-	xino->path = args[0].from;
-
-out:
-	return err;
-}
-
-static int noinline_for_stack
-au_opts_parse_xino_itrunc_path(struct super_block *sb,
-			       struct au_opt_xino_itrunc *xino_itrunc,
-			       substring_t args[])
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct path path;
-	struct dentry *root;
-
-	err = vfsub_kern_path(args[0].from, lkup_dirflags, &path);
-	if (unlikely(err)) {
-		pr_err("lookup failed %s (%d)\n", args[0].from, err);
-		goto out;
-	}
-
-	xino_itrunc->bindex = -1;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_FLUSH);
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		if (au_h_dptr(root, bindex) == path.dentry) {
-			xino_itrunc->bindex = bindex;
-			break;
-		}
-	}
-	aufs_read_unlock(root, !AuLock_IR);
-	path_put(&path);
-
-	if (unlikely(xino_itrunc->bindex < 0)) {
-		pr_err("no such branch %s\n", args[0].from);
-		err = -EINVAL;
-	}
-
-out:
-	return err;
-}
-
-/* called without aufs lock */
-int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts)
-{
-	int err, n, token;
-	aufs_bindex_t bindex;
-	unsigned char skipped;
-	struct dentry *root;
-	struct au_opt *opt, *opt_tail;
-	char *opt_str;
-	/* reduce the stack space */
-	union {
-		struct au_opt_xino_itrunc *xino_itrunc;
-		struct au_opt_wbr_create *create;
-	} u;
-	struct {
-		substring_t args[MAX_OPT_ARGS];
-	} *a;
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	root = sb->s_root;
-	err = 0;
-	bindex = 0;
-	opt = opts->opt;
-	opt_tail = opt + opts->max_opt - 1;
-	opt->type = Opt_tail;
-	while (!err && (opt_str = strsep(&str, ",")) && *opt_str) {
-		err = -EINVAL;
-		skipped = 0;
-		token = match_token(opt_str, options, a->args);
-		switch (token) {
-		case Opt_br:
-			err = 0;
-			while (!err && (opt_str = strsep(&a->args[0].from, ":"))
-			       && *opt_str) {
-				err = opt_add(opt, opt_str, opts->sb_flags,
-					      bindex++);
-				if (unlikely(!err && ++opt > opt_tail)) {
-					err = -E2BIG;
-					break;
-				}
-				opt->type = Opt_tail;
-				skipped = 1;
-			}
-			break;
-		case Opt_add:
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			bindex = n;
-			err = opt_add(opt, a->args[1].from, opts->sb_flags,
-				      bindex);
-			if (!err)
-				opt->type = token;
-			break;
-		case Opt_append:
-			err = opt_add(opt, a->args[0].from, opts->sb_flags,
-				      /*dummy bindex*/1);
-			if (!err)
-				opt->type = token;
-			break;
-		case Opt_prepend:
-			err = opt_add(opt, a->args[0].from, opts->sb_flags,
-				      /*bindex*/0);
-			if (!err)
-				opt->type = token;
-			break;
-		case Opt_del:
-			err = au_opts_parse_del(&opt->del, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#if 0 /* reserved for future use */
-		case Opt_idel:
-			del->pathname = "(indexed)";
-			if (unlikely(match_int(&args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			err = au_opts_parse_idel(sb, n, &opt->del, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#endif
-		case Opt_mod:
-			err = au_opts_parse_mod(&opt->mod, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#ifdef IMOD /* reserved for future use */
-		case Opt_imod:
-			u.mod->path = "(indexed)";
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			err = au_opts_parse_imod(sb, n, &opt->mod, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#endif
-		case Opt_xino:
-			err = au_opts_parse_xino(sb, &opt->xino, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-
-		case Opt_trunc_xino_path:
-			err = au_opts_parse_xino_itrunc_path
-				(sb, &opt->xino_itrunc, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-
-		case Opt_itrunc_xino:
-			u.xino_itrunc = &opt->xino_itrunc;
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			u.xino_itrunc->bindex = n;
-			aufs_read_lock(root, AuLock_FLUSH);
-			if (n < 0 || au_sbend(sb) < n) {
-				pr_err("out of bounds, %d\n", n);
-				aufs_read_unlock(root, !AuLock_IR);
-				break;
-			}
-			aufs_read_unlock(root, !AuLock_IR);
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_dirwh:
-			if (unlikely(match_int(&a->args[0], &opt->dirwh)))
-				break;
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_rdcache:
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			if (unlikely(n > AUFS_RDCACHE_MAX)) {
-				pr_err("rdcache must be smaller than %d\n",
-				       AUFS_RDCACHE_MAX);
-				break;
-			}
-			opt->rdcache = n;
-			err = 0;
-			opt->type = token;
-			break;
-		case Opt_rdblk:
-			if (unlikely(match_int(&a->args[0], &n)
-				     || n < 0
-				     || n > KMALLOC_MAX_SIZE)) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			if (unlikely(n && n < NAME_MAX)) {
-				pr_err("rdblk must be larger than %d\n",
-				       NAME_MAX);
-				break;
-			}
-			opt->rdblk = n;
-			err = 0;
-			opt->type = token;
-			break;
-		case Opt_rdhash:
-			if (unlikely(match_int(&a->args[0], &n)
-				     || n < 0
-				     || n * sizeof(struct hlist_head)
-				     > KMALLOC_MAX_SIZE)) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			opt->rdhash = n;
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_trunc_xino:
-		case Opt_notrunc_xino:
-		case Opt_noxino:
-		case Opt_trunc_xib:
-		case Opt_notrunc_xib:
-		case Opt_shwh:
-		case Opt_noshwh:
-		case Opt_dirperm1:
-		case Opt_nodirperm1:
-		case Opt_plink:
-		case Opt_noplink:
-		case Opt_list_plink:
-		case Opt_dio:
-		case Opt_nodio:
-		case Opt_diropq_a:
-		case Opt_diropq_w:
-		case Opt_warn_perm:
-		case Opt_nowarn_perm:
-		case Opt_verbose:
-		case Opt_noverbose:
-		case Opt_sum:
-		case Opt_nosum:
-		case Opt_wsum:
-		case Opt_rdblk_def:
-		case Opt_rdhash_def:
-		case Opt_acl:
-		case Opt_noacl:
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_udba:
-			opt->udba = udba_val(a->args[0].from);
-			if (opt->udba >= 0) {
-				err = 0;
-				opt->type = token;
-			} else
-				pr_err("wrong value, %s\n", opt_str);
-			break;
-
-		case Opt_wbr_create:
-			u.create = &opt->wbr_create;
-			u.create->wbr_create
-				= au_wbr_create_val(a->args[0].from, u.create);
-			if (u.create->wbr_create >= 0) {
-				err = 0;
-				opt->type = token;
-			} else
-				pr_err("wrong value, %s\n", opt_str);
-			break;
-		case Opt_wbr_copyup:
-			opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from);
-			if (opt->wbr_copyup >= 0) {
-				err = 0;
-				opt->type = token;
-			} else
-				pr_err("wrong value, %s\n", opt_str);
-			break;
-
-		case Opt_fhsm_sec:
-			if (unlikely(match_int(&a->args[0], &n)
-				     || n < 0)) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			if (sysaufs_brs) {
-				opt->fhsm_second = n;
-				opt->type = token;
-			} else
-				pr_warn("ignored %s\n", opt_str);
-			err = 0;
-			break;
-
-		case Opt_ignore:
-			pr_warn("ignored %s\n", opt_str);
-			/*FALLTHROUGH*/
-		case Opt_ignore_silent:
-			skipped = 1;
-			err = 0;
-			break;
-		case Opt_err:
-			pr_err("unknown option %s\n", opt_str);
-			break;
-		}
-
-		if (!err && !skipped) {
-			if (unlikely(++opt > opt_tail)) {
-				err = -E2BIG;
-				opt--;
-				opt->type = Opt_tail;
-				break;
-			}
-			opt->type = Opt_tail;
-		}
-	}
-
-	kfree(a);
-	dump_opts(opts);
-	if (unlikely(err))
-		au_opts_free(opts);
-
-out:
-	return err;
-}
-
-static int au_opt_wbr_create(struct super_block *sb,
-			     struct au_opt_wbr_create *create)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 1; /* handled */
-	sbinfo = au_sbi(sb);
-	if (sbinfo->si_wbr_create_ops->fin) {
-		err = sbinfo->si_wbr_create_ops->fin(sb);
-		if (!err)
-			err = 1;
-	}
-
-	sbinfo->si_wbr_create = create->wbr_create;
-	sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create;
-	switch (create->wbr_create) {
-	case AuWbrCreate_MFSRRV:
-	case AuWbrCreate_MFSRR:
-	case AuWbrCreate_PMFSRR:
-	case AuWbrCreate_PMFSRRV:
-		sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark;
-		/*FALLTHROUGH*/
-	case AuWbrCreate_MFS:
-	case AuWbrCreate_MFSV:
-	case AuWbrCreate_PMFS:
-	case AuWbrCreate_PMFSV:
-		sbinfo->si_wbr_mfs.mfs_expire
-			= msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC);
-		break;
-	}
-
-	if (sbinfo->si_wbr_create_ops->init)
-		sbinfo->si_wbr_create_ops->init(sb); /* ignore */
-
-	return err;
-}
-
-/*
- * returns,
- * plus: processed without an error
- * zero: unprocessed
- */
-static int au_opt_simple(struct super_block *sb, struct au_opt *opt,
-			 struct au_opts *opts)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 1; /* handled */
-	sbinfo = au_sbi(sb);
-	switch (opt->type) {
-	case Opt_udba:
-		sbinfo->si_mntflags &= ~AuOptMask_UDBA;
-		sbinfo->si_mntflags |= opt->udba;
-		opts->given_udba |= opt->udba;
-		break;
-
-	case Opt_plink:
-		au_opt_set(sbinfo->si_mntflags, PLINK);
-		break;
-	case Opt_noplink:
-		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-			au_plink_put(sb, /*verbose*/1);
-		au_opt_clr(sbinfo->si_mntflags, PLINK);
-		break;
-	case Opt_list_plink:
-		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-			au_plink_list(sb);
-		break;
-
-	case Opt_dio:
-		au_opt_set(sbinfo->si_mntflags, DIO);
-		au_fset_opts(opts->flags, REFRESH_DYAOP);
-		break;
-	case Opt_nodio:
-		au_opt_clr(sbinfo->si_mntflags, DIO);
-		au_fset_opts(opts->flags, REFRESH_DYAOP);
-		break;
-
-	case Opt_fhsm_sec:
-		au_fhsm_set(sbinfo, opt->fhsm_second);
-		break;
-
-	case Opt_diropq_a:
-		au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ);
-		break;
-	case Opt_diropq_w:
-		au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ);
-		break;
-
-	case Opt_warn_perm:
-		au_opt_set(sbinfo->si_mntflags, WARN_PERM);
-		break;
-	case Opt_nowarn_perm:
-		au_opt_clr(sbinfo->si_mntflags, WARN_PERM);
-		break;
-
-	case Opt_verbose:
-		au_opt_set(sbinfo->si_mntflags, VERBOSE);
-		break;
-	case Opt_noverbose:
-		au_opt_clr(sbinfo->si_mntflags, VERBOSE);
-		break;
-
-	case Opt_sum:
-		au_opt_set(sbinfo->si_mntflags, SUM);
-		break;
-	case Opt_wsum:
-		au_opt_clr(sbinfo->si_mntflags, SUM);
-		au_opt_set(sbinfo->si_mntflags, SUM_W);
-	case Opt_nosum:
-		au_opt_clr(sbinfo->si_mntflags, SUM);
-		au_opt_clr(sbinfo->si_mntflags, SUM_W);
-		break;
-
-	case Opt_wbr_create:
-		err = au_opt_wbr_create(sb, &opt->wbr_create);
-		break;
-	case Opt_wbr_copyup:
-		sbinfo->si_wbr_copyup = opt->wbr_copyup;
-		sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup;
-		break;
-
-	case Opt_dirwh:
-		sbinfo->si_dirwh = opt->dirwh;
-		break;
-
-	case Opt_rdcache:
-		sbinfo->si_rdcache
-			= msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC);
-		break;
-	case Opt_rdblk:
-		sbinfo->si_rdblk = opt->rdblk;
-		break;
-	case Opt_rdblk_def:
-		sbinfo->si_rdblk = AUFS_RDBLK_DEF;
-		break;
-	case Opt_rdhash:
-		sbinfo->si_rdhash = opt->rdhash;
-		break;
-	case Opt_rdhash_def:
-		sbinfo->si_rdhash = AUFS_RDHASH_DEF;
-		break;
-
-	case Opt_shwh:
-		au_opt_set(sbinfo->si_mntflags, SHWH);
-		break;
-	case Opt_noshwh:
-		au_opt_clr(sbinfo->si_mntflags, SHWH);
-		break;
-
-	case Opt_dirperm1:
-		au_opt_set(sbinfo->si_mntflags, DIRPERM1);
-		break;
-	case Opt_nodirperm1:
-		au_opt_clr(sbinfo->si_mntflags, DIRPERM1);
-		break;
-
-	case Opt_trunc_xino:
-		au_opt_set(sbinfo->si_mntflags, TRUNC_XINO);
-		break;
-	case Opt_notrunc_xino:
-		au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO);
-		break;
-
-	case Opt_trunc_xino_path:
-	case Opt_itrunc_xino:
-		err = au_xino_trunc(sb, opt->xino_itrunc.bindex);
-		if (!err)
-			err = 1;
-		break;
-
-	case Opt_trunc_xib:
-		au_fset_opts(opts->flags, TRUNC_XIB);
-		break;
-	case Opt_notrunc_xib:
-		au_fclr_opts(opts->flags, TRUNC_XIB);
-		break;
-
-	case Opt_acl:
-		sb->s_flags |= MS_POSIXACL;
-		break;
-	case Opt_noacl:
-		sb->s_flags &= ~MS_POSIXACL;
-		break;
-
-	default:
-		err = 0;
-		break;
-	}
-
-	return err;
-}
-
-/*
- * returns tri-state.
- * plus: processed without an error
- * zero: unprocessed
- * minus: error
- */
-static int au_opt_br(struct super_block *sb, struct au_opt *opt,
-		     struct au_opts *opts)
-{
-	int err, do_refresh;
-
-	err = 0;
-	switch (opt->type) {
-	case Opt_append:
-		opt->add.bindex = au_sbend(sb) + 1;
-		if (opt->add.bindex < 0)
-			opt->add.bindex = 0;
-		goto add;
-	case Opt_prepend:
-		opt->add.bindex = 0;
-	add: /* indented label */
-	case Opt_add:
-		err = au_br_add(sb, &opt->add,
-				au_ftest_opts(opts->flags, REMOUNT));
-		if (!err) {
-			err = 1;
-			au_fset_opts(opts->flags, REFRESH);
-		}
-		break;
-
-	case Opt_del:
-	case Opt_idel:
-		err = au_br_del(sb, &opt->del,
-				au_ftest_opts(opts->flags, REMOUNT));
-		if (!err) {
-			err = 1;
-			au_fset_opts(opts->flags, TRUNC_XIB);
-			au_fset_opts(opts->flags, REFRESH);
-		}
-		break;
-
-	case Opt_mod:
-	case Opt_imod:
-		err = au_br_mod(sb, &opt->mod,
-				au_ftest_opts(opts->flags, REMOUNT),
-				&do_refresh);
-		if (!err) {
-			err = 1;
-			if (do_refresh)
-				au_fset_opts(opts->flags, REFRESH);
-		}
-		break;
-	}
-
-	return err;
-}
-
-static int au_opt_xino(struct super_block *sb, struct au_opt *opt,
-		       struct au_opt_xino **opt_xino,
-		       struct au_opts *opts)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct dentry *root, *parent, *h_root;
-
-	err = 0;
-	switch (opt->type) {
-	case Opt_xino:
-		err = au_xino_set(sb, &opt->xino,
-				  !!au_ftest_opts(opts->flags, REMOUNT));
-		if (unlikely(err))
-			break;
-
-		*opt_xino = &opt->xino;
-		au_xino_brid_set(sb, -1);
-
-		/* safe d_parent access */
-		parent = opt->xino.file->f_path.dentry->d_parent;
-		root = sb->s_root;
-		bend = au_sbend(sb);
-		for (bindex = 0; bindex <= bend; bindex++) {
-			h_root = au_h_dptr(root, bindex);
-			if (h_root == parent) {
-				au_xino_brid_set(sb, au_sbr_id(sb, bindex));
-				break;
-			}
-		}
-		break;
-
-	case Opt_noxino:
-		au_xino_clr(sb);
-		au_xino_brid_set(sb, -1);
-		*opt_xino = (void *)-1;
-		break;
-	}
-
-	return err;
-}
-
-int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
-		   unsigned int pending)
-{
-	int err, fhsm;
-	aufs_bindex_t bindex, bend;
-	unsigned char do_plink, skip, do_free;
-	struct au_branch *br;
-	struct au_wbr *wbr;
-	struct dentry *root;
-	struct inode *dir, *h_dir;
-	struct au_sbinfo *sbinfo;
-	struct au_hinode *hdir;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA));
-
-	if (!(sb_flags & MS_RDONLY)) {
-		if (unlikely(!au_br_writable(au_sbr_perm(sb, 0))))
-			pr_warn("first branch should be rw\n");
-		if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH)))
-			pr_warn("shwh should be used with ro\n");
-	}
-
-	if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY)
-	    && !au_opt_test(sbinfo->si_mntflags, XINO))
-		pr_warn("udba=*notify requires xino\n");
-
-	if (au_opt_test(sbinfo->si_mntflags, DIRPERM1))
-		pr_warn("dirperm1 breaks the protection"
-			" by the permission bits on the lower branch\n");
-
-	err = 0;
-	fhsm = 0;
-	root = sb->s_root;
-	dir = d_inode(root);
-	do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK);
-	bend = au_sbend(sb);
-	for (bindex = 0; !err && bindex <= bend; bindex++) {
-		skip = 0;
-		h_dir = au_h_iptr(dir, bindex);
-		br = au_sbr(sb, bindex);
-
-		if ((br->br_perm & AuBrAttr_ICEX)
-		    && !h_dir->i_op->listxattr)
-			br->br_perm &= ~AuBrAttr_ICEX;
-#if 0
-		if ((br->br_perm & AuBrAttr_ICEX_SEC)
-		    && (au_br_sb(br)->s_flags & MS_NOSEC))
-			br->br_perm &= ~AuBrAttr_ICEX_SEC;
-#endif
-
-		do_free = 0;
-		wbr = br->br_wbr;
-		if (wbr)
-			wbr_wh_read_lock(wbr);
-
-		if (!au_br_writable(br->br_perm)) {
-			do_free = !!wbr;
-			skip = (!wbr
-				|| (!wbr->wbr_whbase
-				    && !wbr->wbr_plink
-				    && !wbr->wbr_orph));
-		} else if (!au_br_wh_linkable(br->br_perm)) {
-			/* skip = (!br->br_whbase && !br->br_orph); */
-			skip = (!wbr || !wbr->wbr_whbase);
-			if (skip && wbr) {
-				if (do_plink)
-					skip = !!wbr->wbr_plink;
-				else
-					skip = !wbr->wbr_plink;
-			}
-		} else {
-			/* skip = (br->br_whbase && br->br_ohph); */
-			skip = (wbr && wbr->wbr_whbase);
-			if (skip) {
-				if (do_plink)
-					skip = !!wbr->wbr_plink;
-				else
-					skip = !wbr->wbr_plink;
-			}
-		}
-		if (wbr)
-			wbr_wh_read_unlock(wbr);
-
-		if (au_br_fhsm(br->br_perm)) {
-			fhsm++;
-			AuDebugOn(!br->br_fhsm);
-		}
-
-		if (skip)
-			continue;
-
-		hdir = au_hi(dir, bindex);
-		au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-		if (wbr)
-			wbr_wh_write_lock(wbr);
-		err = au_wh_init(br, sb);
-		if (wbr)
-			wbr_wh_write_unlock(wbr);
-		au_hn_imtx_unlock(hdir);
-
-		if (!err && do_free) {
-			kfree(wbr);
-			br->br_wbr = NULL;
-		}
-	}
-
-	if (fhsm >= 2) {
-		au_fset_si(sbinfo, FHSM);
-		for (bindex = bend; bindex >= 0; bindex--) {
-			br = au_sbr(sb, bindex);
-			if (au_br_fhsm(br->br_perm)) {
-				au_fhsm_set_bottom(sb, bindex);
-				break;
-			}
-		}
-	} else {
-		au_fclr_si(sbinfo, FHSM);
-		au_fhsm_set_bottom(sb, -1);
-	}
-
-	return err;
-}
-
-int au_opts_mount(struct super_block *sb, struct au_opts *opts)
-{
-	int err;
-	unsigned int tmp;
-	aufs_bindex_t bindex, bend;
-	struct au_opt *opt;
-	struct au_opt_xino *opt_xino, xino;
-	struct au_sbinfo *sbinfo;
-	struct au_branch *br;
-	struct inode *dir;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	opt_xino = NULL;
-	opt = opts->opt;
-	while (err >= 0 && opt->type != Opt_tail)
-		err = au_opt_simple(sb, opt++, opts);
-	if (err > 0)
-		err = 0;
-	else if (unlikely(err < 0))
-		goto out;
-
-	/* disable xino and udba temporary */
-	sbinfo = au_sbi(sb);
-	tmp = sbinfo->si_mntflags;
-	au_opt_clr(sbinfo->si_mntflags, XINO);
-	au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL);
-
-	opt = opts->opt;
-	while (err >= 0 && opt->type != Opt_tail)
-		err = au_opt_br(sb, opt++, opts);
-	if (err > 0)
-		err = 0;
-	else if (unlikely(err < 0))
-		goto out;
-
-	bend = au_sbend(sb);
-	if (unlikely(bend < 0)) {
-		err = -EINVAL;
-		pr_err("no branches\n");
-		goto out;
-	}
-
-	if (au_opt_test(tmp, XINO))
-		au_opt_set(sbinfo->si_mntflags, XINO);
-	opt = opts->opt;
-	while (!err && opt->type != Opt_tail)
-		err = au_opt_xino(sb, opt++, &opt_xino, opts);
-	if (unlikely(err))
-		goto out;
-
-	err = au_opts_verify(sb, sb->s_flags, tmp);
-	if (unlikely(err))
-		goto out;
-
-	/* restore xino */
-	if (au_opt_test(tmp, XINO) && !opt_xino) {
-		xino.file = au_xino_def(sb);
-		err = PTR_ERR(xino.file);
-		if (IS_ERR(xino.file))
-			goto out;
-
-		err = au_xino_set(sb, &xino, /*remount*/0);
-		fput(xino.file);
-		if (unlikely(err))
-			goto out;
-	}
-
-	/* restore udba */
-	tmp &= AuOptMask_UDBA;
-	sbinfo->si_mntflags &= ~AuOptMask_UDBA;
-	sbinfo->si_mntflags |= tmp;
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		err = au_hnotify_reset_br(tmp, br, br->br_perm);
-		if (unlikely(err))
-			AuIOErr("hnotify failed on br %d, %d, ignored\n",
-				bindex, err);
-		/* go on even if err */
-	}
-	if (au_opt_test(tmp, UDBA_HNOTIFY)) {
-		dir = d_inode(sb->s_root);
-		au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO);
-	}
-
-out:
-	return err;
-}
-
-int au_opts_remount(struct super_block *sb, struct au_opts *opts)
-{
-	int err, rerr;
-	struct inode *dir;
-	struct au_opt_xino *opt_xino;
-	struct au_opt *opt;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	dir = d_inode(sb->s_root);
-	sbinfo = au_sbi(sb);
-	err = 0;
-	opt_xino = NULL;
-	opt = opts->opt;
-	while (err >= 0 && opt->type != Opt_tail) {
-		err = au_opt_simple(sb, opt, opts);
-		if (!err)
-			err = au_opt_br(sb, opt, opts);
-		if (!err)
-			err = au_opt_xino(sb, opt, &opt_xino, opts);
-		opt++;
-	}
-	if (err > 0)
-		err = 0;
-	AuTraceErr(err);
-	/* go on even err */
-
-	rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0);
-	if (unlikely(rerr && !err))
-		err = rerr;
-
-	if (au_ftest_opts(opts->flags, TRUNC_XIB)) {
-		rerr = au_xib_trunc(sb);
-		if (unlikely(rerr && !err))
-			err = rerr;
-	}
-
-	/* will be handled by the caller */
-	if (!au_ftest_opts(opts->flags, REFRESH)
-	    && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO)))
-		au_fset_opts(opts->flags, REFRESH);
-
-	AuDbg("status 0x%x\n", opts->flags);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-unsigned int au_opt_udba(struct super_block *sb)
-{
-	return au_mntflags(sb) & AuOptMask_UDBA;
-}
diff --git a/fs/aufs/opts.h b/fs/aufs/opts.h
deleted file mode 100644
index b18a4aa00..000000000
--- a/fs/aufs/opts.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * mount options/flags
- */
-
-#ifndef __AUFS_OPTS_H__
-#define __AUFS_OPTS_H__
-
-#ifdef __KERNEL__
-
-#include <linux/path.h>
-
-struct file;
-struct super_block;
-
-/* ---------------------------------------------------------------------- */
-
-/* mount flags */
-#define AuOpt_XINO		1		/* external inode number bitmap
-						   and translation table */
-#define AuOpt_TRUNC_XINO	(1 << 1)	/* truncate xino files */
-#define AuOpt_UDBA_NONE		(1 << 2)	/* users direct branch access */
-#define AuOpt_UDBA_REVAL	(1 << 3)
-#define AuOpt_UDBA_HNOTIFY	(1 << 4)
-#define AuOpt_SHWH		(1 << 5)	/* show whiteout */
-#define AuOpt_PLINK		(1 << 6)	/* pseudo-link */
-#define AuOpt_DIRPERM1		(1 << 7)	/* ignore the lower dir's perm
-						   bits */
-#define AuOpt_ALWAYS_DIROPQ	(1 << 9)	/* policy to creating diropq */
-#define AuOpt_SUM		(1 << 10)	/* summation for statfs(2) */
-#define AuOpt_SUM_W		(1 << 11)	/* unimplemented */
-#define AuOpt_WARN_PERM		(1 << 12)	/* warn when add-branch */
-#define AuOpt_VERBOSE		(1 << 13)	/* busy inode when del-branch */
-#define AuOpt_DIO		(1 << 14)	/* direct io */
-
-#ifndef CONFIG_AUFS_HNOTIFY
-#undef AuOpt_UDBA_HNOTIFY
-#define AuOpt_UDBA_HNOTIFY	0
-#endif
-#ifndef CONFIG_AUFS_SHWH
-#undef AuOpt_SHWH
-#define AuOpt_SHWH		0
-#endif
-
-#define AuOpt_Def	(AuOpt_XINO \
-			 | AuOpt_UDBA_REVAL \
-			 | AuOpt_PLINK \
-			 /* | AuOpt_DIRPERM1 */ \
-			 | AuOpt_WARN_PERM)
-#define AuOptMask_UDBA	(AuOpt_UDBA_NONE \
-			 | AuOpt_UDBA_REVAL \
-			 | AuOpt_UDBA_HNOTIFY)
-
-#define au_opt_test(flags, name)	(flags & AuOpt_##name)
-#define au_opt_set(flags, name) do { \
-	BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \
-	((flags) |= AuOpt_##name); \
-} while (0)
-#define au_opt_set_udba(flags, name) do { \
-	(flags) &= ~AuOptMask_UDBA; \
-	((flags) |= AuOpt_##name); \
-} while (0)
-#define au_opt_clr(flags, name) do { \
-	((flags) &= ~AuOpt_##name); \
-} while (0)
-
-static inline unsigned int au_opts_plink(unsigned int mntflags)
-{
-#ifdef CONFIG_PROC_FS
-	return mntflags;
-#else
-	return mntflags & ~AuOpt_PLINK;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* policies to select one among multiple writable branches */
-enum {
-	AuWbrCreate_TDP,	/* top down parent */
-	AuWbrCreate_RR,		/* round robin */
-	AuWbrCreate_MFS,	/* most free space */
-	AuWbrCreate_MFSV,	/* mfs with seconds */
-	AuWbrCreate_MFSRR,	/* mfs then rr */
-	AuWbrCreate_MFSRRV,	/* mfs then rr with seconds */
-	AuWbrCreate_PMFS,	/* parent and mfs */
-	AuWbrCreate_PMFSV,	/* parent and mfs with seconds */
-	AuWbrCreate_PMFSRR,	/* parent, mfs and round-robin */
-	AuWbrCreate_PMFSRRV,	/* plus seconds */
-
-	AuWbrCreate_Def = AuWbrCreate_TDP
-};
-
-enum {
-	AuWbrCopyup_TDP,	/* top down parent */
-	AuWbrCopyup_BUP,	/* bottom up parent */
-	AuWbrCopyup_BU,		/* bottom up */
-
-	AuWbrCopyup_Def = AuWbrCopyup_TDP
-};
-
-/* ---------------------------------------------------------------------- */
-
-struct au_opt_add {
-	aufs_bindex_t	bindex;
-	char		*pathname;
-	int		perm;
-	struct path	path;
-};
-
-struct au_opt_del {
-	char		*pathname;
-	struct path	h_path;
-};
-
-struct au_opt_mod {
-	char		*path;
-	int		perm;
-	struct dentry	*h_root;
-};
-
-struct au_opt_xino {
-	char		*path;
-	struct file	*file;
-};
-
-struct au_opt_xino_itrunc {
-	aufs_bindex_t	bindex;
-};
-
-struct au_opt_wbr_create {
-	int			wbr_create;
-	int			mfs_second;
-	unsigned long long	mfsrr_watermark;
-};
-
-struct au_opt {
-	int type;
-	union {
-		struct au_opt_xino	xino;
-		struct au_opt_xino_itrunc xino_itrunc;
-		struct au_opt_add	add;
-		struct au_opt_del	del;
-		struct au_opt_mod	mod;
-		int			dirwh;
-		int			rdcache;
-		unsigned int		rdblk;
-		unsigned int		rdhash;
-		int			udba;
-		struct au_opt_wbr_create wbr_create;
-		int			wbr_copyup;
-		unsigned int		fhsm_second;
-	};
-};
-
-/* opts flags */
-#define AuOpts_REMOUNT		1
-#define AuOpts_REFRESH		(1 << 1)
-#define AuOpts_TRUNC_XIB	(1 << 2)
-#define AuOpts_REFRESH_DYAOP	(1 << 3)
-#define au_ftest_opts(flags, name)	((flags) & AuOpts_##name)
-#define au_fset_opts(flags, name) \
-	do { (flags) |= AuOpts_##name; } while (0)
-#define au_fclr_opts(flags, name) \
-	do { (flags) &= ~AuOpts_##name; } while (0)
-
-struct au_opts {
-	struct au_opt	*opt;
-	int		max_opt;
-
-	unsigned int	given_udba;
-	unsigned int	flags;
-	unsigned long	sb_flags;
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* opts.c */
-void au_optstr_br_perm(au_br_perm_str_t *str, int perm);
-const char *au_optstr_udba(int udba);
-const char *au_optstr_wbr_copyup(int wbr_copyup);
-const char *au_optstr_wbr_create(int wbr_create);
-
-void au_opts_free(struct au_opts *opts);
-int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts);
-int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
-		   unsigned int pending);
-int au_opts_mount(struct super_block *sb, struct au_opts *opts);
-int au_opts_remount(struct super_block *sb, struct au_opts *opts);
-
-unsigned int au_opt_udba(struct super_block *sb);
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_OPTS_H__ */
diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c
deleted file mode 100644
index 988fba940..000000000
--- a/fs/aufs/plink.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * pseudo-link
- */
-
-#include "aufs.h"
-
-/*
- * the pseudo-link maintenance mode.
- * during a user process maintains the pseudo-links,
- * prohibit adding a new plink and branch manipulation.
- *
- * Flags
- * NOPLM:
- *	For entry functions which will handle plink, and i_mutex is already held
- *	in VFS.
- *	They cannot wait and should return an error at once.
- *	Callers has to check the error.
- * NOPLMW:
- *	For entry functions which will handle plink, but i_mutex is not held
- *	in VFS.
- *	They can wait the plink maintenance mode to finish.
- *
- * They behave like F_SETLK and F_SETLKW.
- * If the caller never handle plink, then both flags are unnecessary.
- */
-
-int au_plink_maint(struct super_block *sb, int flags)
-{
-	int err;
-	pid_t pid, ppid;
-	struct au_sbinfo *sbi;
-
-	SiMustAnyLock(sb);
-
-	err = 0;
-	if (!au_opt_test(au_mntflags(sb), PLINK))
-		goto out;
-
-	sbi = au_sbi(sb);
-	pid = sbi->si_plink_maint_pid;
-	if (!pid || pid == current->pid)
-		goto out;
-
-	/* todo: it highly depends upon /sbin/mount.aufs */
-	rcu_read_lock();
-	ppid = task_pid_vnr(rcu_dereference(current->real_parent));
-	rcu_read_unlock();
-	if (pid == ppid)
-		goto out;
-
-	if (au_ftest_lock(flags, NOPLMW)) {
-		/* if there is no i_mutex lock in VFS, we don't need to wait */
-		/* AuDebugOn(!lockdep_depth(current)); */
-		while (sbi->si_plink_maint_pid) {
-			si_read_unlock(sb);
-			/* gave up wake_up_bit() */
-			wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid);
-
-			if (au_ftest_lock(flags, FLUSH))
-				au_nwt_flush(&sbi->si_nowait);
-			si_noflush_read_lock(sb);
-		}
-	} else if (au_ftest_lock(flags, NOPLM)) {
-		AuDbg("ppid %d, pid %d\n", ppid, pid);
-		err = -EAGAIN;
-	}
-
-out:
-	return err;
-}
-
-void au_plink_maint_leave(struct au_sbinfo *sbinfo)
-{
-	spin_lock(&sbinfo->si_plink_maint_lock);
-	sbinfo->si_plink_maint_pid = 0;
-	spin_unlock(&sbinfo->si_plink_maint_lock);
-	wake_up_all(&sbinfo->si_plink_wq);
-}
-
-int au_plink_maint_enter(struct super_block *sb)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	/* make sure i am the only one in this fs */
-	si_write_lock(sb, AuLock_FLUSH);
-	if (au_opt_test(au_mntflags(sb), PLINK)) {
-		spin_lock(&sbinfo->si_plink_maint_lock);
-		if (!sbinfo->si_plink_maint_pid)
-			sbinfo->si_plink_maint_pid = current->pid;
-		else
-			err = -EBUSY;
-		spin_unlock(&sbinfo->si_plink_maint_lock);
-	}
-	si_write_unlock(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_DEBUG
-void au_plink_list(struct super_block *sb)
-{
-	int i;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct pseudo_link *plink;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	for (i = 0; i < AuPlink_NHASH; i++) {
-		plink_hlist = &sbinfo->si_plink[i].head;
-		rcu_read_lock();
-		hlist_for_each_entry_rcu(plink, plink_hlist, hlist)
-			AuDbg("%lu\n", plink->inode->i_ino);
-		rcu_read_unlock();
-	}
-}
-#endif
-
-/* is the inode pseudo-linked? */
-int au_plink_test(struct inode *inode)
-{
-	int found, i;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct pseudo_link *plink;
-
-	sbinfo = au_sbi(inode->i_sb);
-	AuRwMustAnyLock(&sbinfo->si_rwsem);
-	AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK));
-	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
-
-	found = 0;
-	i = au_plink_hash(inode->i_ino);
-	plink_hlist = &sbinfo->si_plink[i].head;
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(plink, plink_hlist, hlist)
-		if (plink->inode == inode) {
-			found = 1;
-			break;
-		}
-	rcu_read_unlock();
-	return found;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * generate a name for plink.
- * the file will be stored under AUFS_WH_PLINKDIR.
- */
-/* 20 is max digits length of ulong 64 */
-#define PLINK_NAME_LEN	((20 + 1) * 2)
-
-static int plink_name(char *name, int len, struct inode *inode,
-		      aufs_bindex_t bindex)
-{
-	int rlen;
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, bindex);
-	rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino);
-	return rlen;
-}
-
-struct au_do_plink_lkup_args {
-	struct dentry **errp;
-	struct qstr *tgtname;
-	struct dentry *h_parent;
-	struct au_branch *br;
-};
-
-static struct dentry *au_do_plink_lkup(struct qstr *tgtname,
-				       struct dentry *h_parent,
-				       struct au_branch *br)
-{
-	struct dentry *h_dentry;
-	struct mutex *h_mtx;
-
-	h_mtx = &d_inode(h_parent)->i_mutex;
-	mutex_lock_nested(h_mtx, AuLsc_I_CHILD2);
-	h_dentry = vfsub_lkup_one(tgtname, h_parent);
-	mutex_unlock(h_mtx);
-	return h_dentry;
-}
-
-static void au_call_do_plink_lkup(void *args)
-{
-	struct au_do_plink_lkup_args *a = args;
-	*a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br);
-}
-
-/* lookup the plink-ed @inode under the branch at @bindex */
-struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex)
-{
-	struct dentry *h_dentry, *h_parent;
-	struct au_branch *br;
-	int wkq_err;
-	char a[PLINK_NAME_LEN];
-	struct qstr tgtname = QSTR_INIT(a, 0);
-
-	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
-
-	br = au_sbr(inode->i_sb, bindex);
-	h_parent = br->br_wbr->wbr_plink;
-	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
-
-	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
-		struct au_do_plink_lkup_args args = {
-			.errp		= &h_dentry,
-			.tgtname	= &tgtname,
-			.h_parent	= h_parent,
-			.br		= br
-		};
-
-		wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args);
-		if (unlikely(wkq_err))
-			h_dentry = ERR_PTR(wkq_err);
-	} else
-		h_dentry = au_do_plink_lkup(&tgtname, h_parent, br);
-
-	return h_dentry;
-}
-
-/* create a pseudo-link */
-static int do_whplink(struct qstr *tgt, struct dentry *h_parent,
-		      struct dentry *h_dentry, struct au_branch *br)
-{
-	int err;
-	struct path h_path = {
-		.mnt = au_br_mnt(br)
-	};
-	struct inode *h_dir, *delegated;
-
-	h_dir = d_inode(h_parent);
-	mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2);
-again:
-	h_path.dentry = vfsub_lkup_one(tgt, h_parent);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry))
-		goto out;
-
-	err = 0;
-	/* wh.plink dir is not monitored */
-	/* todo: is it really safe? */
-	if (d_is_positive(h_path.dentry)
-	    && d_inode(h_path.dentry) != d_inode(h_dentry)) {
-		delegated = NULL;
-		err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-		dput(h_path.dentry);
-		h_path.dentry = NULL;
-		if (!err)
-			goto again;
-	}
-	if (!err && d_is_negative(h_path.dentry)) {
-		delegated = NULL;
-		err = vfsub_link(h_dentry, h_dir, &h_path, &delegated);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal link\n");
-			iput(delegated);
-		}
-	}
-	dput(h_path.dentry);
-
-out:
-	mutex_unlock(&h_dir->i_mutex);
-	return err;
-}
-
-struct do_whplink_args {
-	int *errp;
-	struct qstr *tgt;
-	struct dentry *h_parent;
-	struct dentry *h_dentry;
-	struct au_branch *br;
-};
-
-static void call_do_whplink(void *args)
-{
-	struct do_whplink_args *a = args;
-	*a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br);
-}
-
-static int whplink(struct dentry *h_dentry, struct inode *inode,
-		   aufs_bindex_t bindex, struct au_branch *br)
-{
-	int err, wkq_err;
-	struct au_wbr *wbr;
-	struct dentry *h_parent;
-	char a[PLINK_NAME_LEN];
-	struct qstr tgtname = QSTR_INIT(a, 0);
-
-	wbr = au_sbr(inode->i_sb, bindex)->br_wbr;
-	h_parent = wbr->wbr_plink;
-	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
-
-	/* always superio. */
-	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
-		struct do_whplink_args args = {
-			.errp		= &err,
-			.tgt		= &tgtname,
-			.h_parent	= h_parent,
-			.h_dentry	= h_dentry,
-			.br		= br
-		};
-		wkq_err = au_wkq_wait(call_do_whplink, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	} else
-		err = do_whplink(&tgtname, h_parent, h_dentry, br);
-
-	return err;
-}
-
-/* free a single plink */
-static void do_put_plink(struct pseudo_link *plink, int do_del)
-{
-	if (do_del)
-		hlist_del(&plink->hlist);
-	iput(plink->inode);
-	kfree(plink);
-}
-
-static void do_put_plink_rcu(struct rcu_head *rcu)
-{
-	struct pseudo_link *plink;
-
-	plink = container_of(rcu, struct pseudo_link, rcu);
-	iput(plink->inode);
-	kfree(plink);
-}
-
-/*
- * create a new pseudo-link for @h_dentry on @bindex.
- * the linked inode is held in aufs @inode.
- */
-void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
-		     struct dentry *h_dentry)
-{
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct pseudo_link *plink, *tmp;
-	struct au_sphlhead *sphl;
-	int found, err, cnt, i;
-
-	sb = inode->i_sb;
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	found = au_plink_test(inode);
-	if (found)
-		return;
-
-	i = au_plink_hash(inode->i_ino);
-	sphl = sbinfo->si_plink + i;
-	plink_hlist = &sphl->head;
-	tmp = kmalloc(sizeof(*plink), GFP_NOFS);
-	if (tmp)
-		tmp->inode = au_igrab(inode);
-	else {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	spin_lock(&sphl->spin);
-	hlist_for_each_entry(plink, plink_hlist, hlist) {
-		if (plink->inode == inode) {
-			found = 1;
-			break;
-		}
-	}
-	if (!found)
-		hlist_add_head_rcu(&tmp->hlist, plink_hlist);
-	spin_unlock(&sphl->spin);
-	if (!found) {
-		cnt = au_sphl_count(sphl);
-#define msg "unexpectedly unblanced or too many pseudo-links"
-		if (cnt > AUFS_PLINK_WARN)
-			AuWarn1(msg ", %d\n", cnt);
-#undef msg
-		err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex));
-	} else {
-		do_put_plink(tmp, 0);
-		return;
-	}
-
-out:
-	if (unlikely(err)) {
-		pr_warn("err %d, damaged pseudo link.\n", err);
-		if (tmp) {
-			au_sphl_del_rcu(&tmp->hlist, sphl);
-			call_rcu(&tmp->rcu, do_put_plink_rcu);
-		}
-	}
-}
-
-/* free all plinks */
-void au_plink_put(struct super_block *sb, int verbose)
-{
-	int i, warned;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct hlist_node *tmp;
-	struct pseudo_link *plink;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	/* no spin_lock since sbinfo is write-locked */
-	warned = 0;
-	for (i = 0; i < AuPlink_NHASH; i++) {
-		plink_hlist = &sbinfo->si_plink[i].head;
-		if (!warned && verbose && !hlist_empty(plink_hlist)) {
-			pr_warn("pseudo-link is not flushed");
-			warned = 1;
-		}
-		hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist)
-			do_put_plink(plink, 0);
-		INIT_HLIST_HEAD(plink_hlist);
-	}
-}
-
-void au_plink_clean(struct super_block *sb, int verbose)
-{
-	struct dentry *root;
-
-	root = sb->s_root;
-	aufs_write_lock(root);
-	if (au_opt_test(au_mntflags(sb), PLINK))
-		au_plink_put(sb, verbose);
-	aufs_write_unlock(root);
-}
-
-static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id)
-{
-	int do_put;
-	aufs_bindex_t bstart, bend, bindex;
-
-	do_put = 0;
-	bstart = au_ibstart(inode);
-	bend = au_ibend(inode);
-	if (bstart >= 0) {
-		for (bindex = bstart; bindex <= bend; bindex++) {
-			if (!au_h_iptr(inode, bindex)
-			    || au_ii_br_id(inode, bindex) != br_id)
-				continue;
-			au_set_h_iptr(inode, bindex, NULL, 0);
-			do_put = 1;
-			break;
-		}
-		if (do_put)
-			for (bindex = bstart; bindex <= bend; bindex++)
-				if (au_h_iptr(inode, bindex)) {
-					do_put = 0;
-					break;
-				}
-	} else
-		do_put = 1;
-
-	return do_put;
-}
-
-/* free the plinks on a branch specified by @br_id */
-void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id)
-{
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct hlist_node *tmp;
-	struct pseudo_link *plink;
-	struct inode *inode;
-	int i, do_put;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	/* no spin_lock since sbinfo is write-locked */
-	for (i = 0; i < AuPlink_NHASH; i++) {
-		plink_hlist = &sbinfo->si_plink[i].head;
-		hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist) {
-			inode = au_igrab(plink->inode);
-			ii_write_lock_child(inode);
-			do_put = au_plink_do_half_refresh(inode, br_id);
-			if (do_put)
-				do_put_plink(plink, 1);
-			ii_write_unlock(inode);
-			iput(inode);
-		}
-	}
-}
diff --git a/fs/aufs/poll.c b/fs/aufs/poll.c
deleted file mode 100644
index 262a792fd..000000000
--- a/fs/aufs/poll.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * poll operation
- * There is only one filesystem which implements ->poll operation, currently.
- */
-
-#include "aufs.h"
-
-unsigned int aufs_poll(struct file *file, poll_table *wait)
-{
-	unsigned int mask;
-	int err;
-	struct file *h_file;
-	struct super_block *sb;
-
-	/* We should pretend an error happened. */
-	mask = POLLERR /* | POLLIN | POLLOUT */;
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	/* it is not an error if h_file has no operation */
-	mask = DEFAULT_POLLMASK;
-	if (h_file->f_op->poll)
-		mask = h_file->f_op->poll(h_file, wait);
-	fput(h_file); /* instead of au_read_post() */
-
-out:
-	si_read_unlock(sb);
-	AuTraceErr((int)mask);
-	return mask;
-}
diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c
deleted file mode 100644
index d5266ef76..000000000
--- a/fs/aufs/posix_acl.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (C) 2014-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * posix acl operations
- */
-
-#include <linux/fs.h>
-#include <linux/posix_acl.h>
-#include "aufs.h"
-
-struct posix_acl *aufs_get_acl(struct inode *inode, int type)
-{
-	struct posix_acl *acl;
-	int err;
-	aufs_bindex_t bindex;
-	struct inode *h_inode;
-	struct super_block *sb;
-
-	acl = NULL;
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	ii_read_lock_child(inode);
-	if (!(sb->s_flags & MS_POSIXACL))
-		goto out;
-
-	bindex = au_ibstart(inode);
-	h_inode = au_h_iptr(inode, bindex);
-	if (unlikely(!h_inode
-		     || ((h_inode->i_mode & S_IFMT)
-			 != (inode->i_mode & S_IFMT)))) {
-		err = au_busy_or_stale();
-		acl = ERR_PTR(err);
-		goto out;
-	}
-
-	/* always topmost only */
-	acl = get_acl(h_inode, type);
-
-out:
-	ii_read_unlock(inode);
-	si_read_unlock(sb);
-
-	AuTraceErrPtr(acl);
-	return acl;
-}
-
-int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
-	int err;
-	ssize_t ssz;
-	struct dentry *dentry;
-	struct au_srxattr arg = {
-		.type = AU_ACL_SET,
-		.u.acl_set = {
-			.acl	= acl,
-			.type	= type
-		},
-	};
-
-	mutex_lock(&inode->i_mutex);
-	if (inode->i_ino == AUFS_ROOT_INO)
-		dentry = dget(inode->i_sb->s_root);
-	else {
-		dentry = d_find_alias(inode);
-		if (!dentry)
-			dentry = d_find_any_alias(inode);
-		if (!dentry) {
-			pr_warn("cannot handle this inode, "
-				"please report to aufs-users ML\n");
-			err = -ENOENT;
-			goto out;
-		}
-	}
-
-	ssz = au_srxattr(dentry, &arg);
-	dput(dentry);
-	err = ssz;
-	if (ssz >= 0)
-		err = 0;
-
-out:
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c
deleted file mode 100644
index 772be91a9..000000000
--- a/fs/aufs/procfs.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (C) 2010-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * procfs interfaces
- */
-
-#include <linux/proc_fs.h>
-#include "aufs.h"
-
-static int au_procfs_plm_release(struct inode *inode, struct file *file)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = file->private_data;
-	if (sbinfo) {
-		au_plink_maint_leave(sbinfo);
-		kobject_put(&sbinfo->si_kobj);
-	}
-
-	return 0;
-}
-
-static void au_procfs_plm_write_clean(struct file *file)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = file->private_data;
-	if (sbinfo)
-		au_plink_clean(sbinfo->si_sb, /*verbose*/0);
-}
-
-static int au_procfs_plm_write_si(struct file *file, unsigned long id)
-{
-	int err;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-	err = -EBUSY;
-	if (unlikely(file->private_data))
-		goto out;
-
-	sb = NULL;
-	/* don't use au_sbilist_lock() here */
-	spin_lock(&au_sbilist.spin);
-	list_for_each_entry(sbinfo, &au_sbilist.head, si_list)
-		if (id == sysaufs_si_id(sbinfo)) {
-			kobject_get(&sbinfo->si_kobj);
-			sb = sbinfo->si_sb;
-			break;
-		}
-	spin_unlock(&au_sbilist.spin);
-
-	err = -EINVAL;
-	if (unlikely(!sb))
-		goto out;
-
-	err = au_plink_maint_enter(sb);
-	if (!err)
-		/* keep kobject_get() */
-		file->private_data = sbinfo;
-	else
-		kobject_put(&sbinfo->si_kobj);
-out:
-	return err;
-}
-
-/*
- * Accept a valid "si=xxxx" only.
- * Once it is accepted successfully, accept "clean" too.
- */
-static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf,
-				   size_t count, loff_t *ppos)
-{
-	ssize_t err;
-	unsigned long id;
-	/* last newline is allowed */
-	char buf[3 + sizeof(unsigned long) * 2 + 1];
-
-	err = -EACCES;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = -EINVAL;
-	if (unlikely(count > sizeof(buf)))
-		goto out;
-
-	err = copy_from_user(buf, ubuf, count);
-	if (unlikely(err)) {
-		err = -EFAULT;
-		goto out;
-	}
-	buf[count] = 0;
-
-	err = -EINVAL;
-	if (!strcmp("clean", buf)) {
-		au_procfs_plm_write_clean(file);
-		goto out_success;
-	} else if (unlikely(strncmp("si=", buf, 3)))
-		goto out;
-
-	err = kstrtoul(buf + 3, 16, &id);
-	if (unlikely(err))
-		goto out;
-
-	err = au_procfs_plm_write_si(file, id);
-	if (unlikely(err))
-		goto out;
-
-out_success:
-	err = count; /* success */
-out:
-	return err;
-}
-
-static const struct file_operations au_procfs_plm_fop = {
-	.write		= au_procfs_plm_write,
-	.release	= au_procfs_plm_release,
-	.owner		= THIS_MODULE
-};
-
-/* ---------------------------------------------------------------------- */
-
-static struct proc_dir_entry *au_procfs_dir;
-
-void au_procfs_fin(void)
-{
-	remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir);
-	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
-}
-
-int __init au_procfs_init(void)
-{
-	int err;
-	struct proc_dir_entry *entry;
-
-	err = -ENOMEM;
-	au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL);
-	if (unlikely(!au_procfs_dir))
-		goto out;
-
-	entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR,
-			    au_procfs_dir, &au_procfs_plm_fop);
-	if (unlikely(!entry))
-		goto out_dir;
-
-	err = 0;
-	goto out; /* success */
-
-
-out_dir:
-	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
-out:
-	return err;
-}
diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c
deleted file mode 100644
index 3beba1806..000000000
--- a/fs/aufs/rdu.c
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * readdir in userspace.
- */
-
-#include <linux/compat.h>
-#include <linux/fs_stack.h>
-#include <linux/security.h>
-#include "aufs.h"
-
-/* bits for struct aufs_rdu.flags */
-#define	AuRdu_CALLED	1
-#define	AuRdu_CONT	(1 << 1)
-#define	AuRdu_FULL	(1 << 2)
-#define au_ftest_rdu(flags, name)	((flags) & AuRdu_##name)
-#define au_fset_rdu(flags, name) \
-	do { (flags) |= AuRdu_##name; } while (0)
-#define au_fclr_rdu(flags, name) \
-	do { (flags) &= ~AuRdu_##name; } while (0)
-
-struct au_rdu_arg {
-	struct dir_context		ctx;
-	struct aufs_rdu			*rdu;
-	union au_rdu_ent_ul		ent;
-	unsigned long			end;
-
-	struct super_block		*sb;
-	int				err;
-};
-
-static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen,
-		       loff_t offset, u64 h_ino, unsigned int d_type)
-{
-	int err, len;
-	struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx);
-	struct aufs_rdu *rdu = arg->rdu;
-	struct au_rdu_ent ent;
-
-	err = 0;
-	arg->err = 0;
-	au_fset_rdu(rdu->cookie.flags, CALLED);
-	len = au_rdu_len(nlen);
-	if (arg->ent.ul + len  < arg->end) {
-		ent.ino = h_ino;
-		ent.bindex = rdu->cookie.bindex;
-		ent.type = d_type;
-		ent.nlen = nlen;
-		if (unlikely(nlen > AUFS_MAX_NAMELEN))
-			ent.type = DT_UNKNOWN;
-
-		/* unnecessary to support mmap_sem since this is a dir */
-		err = -EFAULT;
-		if (copy_to_user(arg->ent.e, &ent, sizeof(ent)))
-			goto out;
-		if (copy_to_user(arg->ent.e->name, name, nlen))
-			goto out;
-		/* the terminating NULL */
-		if (__put_user(0, arg->ent.e->name + nlen))
-			goto out;
-		err = 0;
-		/* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */
-		arg->ent.ul += len;
-		rdu->rent++;
-	} else {
-		err = -EFAULT;
-		au_fset_rdu(rdu->cookie.flags, FULL);
-		rdu->full = 1;
-		rdu->tail = arg->ent;
-	}
-
-out:
-	/* AuTraceErr(err); */
-	return err;
-}
-
-static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg)
-{
-	int err;
-	loff_t offset;
-	struct au_rdu_cookie *cookie = &arg->rdu->cookie;
-
-	/* we don't have to care (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */
-	offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET);
-	err = offset;
-	if (unlikely(offset != cookie->h_pos))
-		goto out;
-
-	err = 0;
-	do {
-		arg->err = 0;
-		au_fclr_rdu(cookie->flags, CALLED);
-		/* smp_mb(); */
-		err = vfsub_iterate_dir(h_file, &arg->ctx);
-		if (err >= 0)
-			err = arg->err;
-	} while (!err
-		 && au_ftest_rdu(cookie->flags, CALLED)
-		 && !au_ftest_rdu(cookie->flags, FULL));
-	cookie->h_pos = h_file->f_pos;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_rdu(struct file *file, struct aufs_rdu *rdu)
-{
-	int err;
-	aufs_bindex_t bend;
-	struct au_rdu_arg arg = {
-		.ctx = {
-			.actor = au_rdu_fill
-		}
-	};
-	struct dentry *dentry;
-	struct inode *inode;
-	struct file *h_file;
-	struct au_rdu_cookie *cookie = &rdu->cookie;
-
-	err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz);
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-	rdu->rent = 0;
-	rdu->tail = rdu->ent;
-	rdu->full = 0;
-	arg.rdu = rdu;
-	arg.ent = rdu->ent;
-	arg.end = arg.ent.ul;
-	arg.end += rdu->sz;
-
-	err = -ENOTDIR;
-	if (unlikely(!file->f_op->iterate))
-		goto out;
-
-	err = security_file_permission(file, MAY_READ);
-	AuTraceErr(err);
-	if (unlikely(err))
-		goto out;
-
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-#if 1
-	mutex_lock(&inode->i_mutex);
-#else
-	err = mutex_lock_killable(&inode->i_mutex);
-	AuTraceErr(err);
-	if (unlikely(err))
-		goto out;
-#endif
-
-	arg.sb = inode->i_sb;
-	err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_mtx;
-	err = au_alive_dir(dentry);
-	if (unlikely(err))
-		goto out_si;
-	/* todo: reval? */
-	fi_read_lock(file);
-
-	err = -EAGAIN;
-	if (unlikely(au_ftest_rdu(cookie->flags, CONT)
-		     && cookie->generation != au_figen(file)))
-		goto out_unlock;
-
-	err = 0;
-	if (!rdu->blk) {
-		rdu->blk = au_sbi(arg.sb)->si_rdblk;
-		if (!rdu->blk)
-			rdu->blk = au_dir_size(file, /*dentry*/NULL);
-	}
-	bend = au_fbstart(file);
-	if (cookie->bindex < bend)
-		cookie->bindex = bend;
-	bend = au_fbend_dir(file);
-	/* AuDbg("b%d, b%d\n", cookie->bindex, bend); */
-	for (; !err && cookie->bindex <= bend;
-	     cookie->bindex++, cookie->h_pos = 0) {
-		h_file = au_hf_dir(file, cookie->bindex);
-		if (!h_file)
-			continue;
-
-		au_fclr_rdu(cookie->flags, FULL);
-		err = au_rdu_do(h_file, &arg);
-		AuTraceErr(err);
-		if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err))
-			break;
-	}
-	AuDbg("rent %llu\n", rdu->rent);
-
-	if (!err && !au_ftest_rdu(cookie->flags, CONT)) {
-		rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH);
-		au_fset_rdu(cookie->flags, CONT);
-		cookie->generation = au_figen(file);
-	}
-
-	ii_read_lock_child(inode);
-	fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode)));
-	ii_read_unlock(inode);
-
-out_unlock:
-	fi_read_unlock(file);
-out_si:
-	si_read_unlock(arg.sb);
-out_mtx:
-	mutex_unlock(&inode->i_mutex);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu)
-{
-	int err;
-	ino_t ino;
-	unsigned long long nent;
-	union au_rdu_ent_ul *u;
-	struct au_rdu_ent ent;
-	struct super_block *sb;
-
-	err = 0;
-	nent = rdu->nent;
-	u = &rdu->ent;
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	while (nent-- > 0) {
-		/* unnecessary to support mmap_sem since this is a dir */
-		err = copy_from_user(&ent, u->e, sizeof(ent));
-		if (!err)
-			err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino));
-		if (unlikely(err)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-			break;
-		}
-
-		/* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */
-		if (!ent.wh)
-			err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino);
-		else
-			err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type,
-					&ino);
-		if (unlikely(err)) {
-			AuTraceErr(err);
-			break;
-		}
-
-		err = __put_user(ino, &u->e->ino);
-		if (unlikely(err)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-			break;
-		}
-		u->ul += au_rdu_len(ent.nlen);
-	}
-	si_read_unlock(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_rdu_verify(struct aufs_rdu *rdu)
-{
-	AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | "
-	      "%llu, b%d, 0x%x, g%u}\n",
-	      rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ],
-	      rdu->blk,
-	      rdu->rent, rdu->shwh, rdu->full,
-	      rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags,
-	      rdu->cookie.generation);
-
-	if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu))
-		return 0;
-
-	AuDbg("%u:%u\n",
-	      rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu));
-	return -EINVAL;
-}
-
-long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err, e;
-	struct aufs_rdu rdu;
-	void __user *p = (void __user *)arg;
-
-	err = copy_from_user(&rdu, p, sizeof(rdu));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-	err = au_rdu_verify(&rdu);
-	if (unlikely(err))
-		goto out;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-		err = au_rdu(file, &rdu);
-		if (unlikely(err))
-			break;
-
-		e = copy_to_user(p, &rdu, sizeof(rdu));
-		if (unlikely(e)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-		}
-		break;
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_ino(file, &rdu);
-		break;
-
-	default:
-		/* err = -ENOTTY; */
-		err = -EINVAL;
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err, e;
-	struct aufs_rdu rdu;
-	void __user *p = compat_ptr(arg);
-
-	/* todo: get_user()? */
-	err = copy_from_user(&rdu, p, sizeof(rdu));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-	rdu.ent.e = compat_ptr(rdu.ent.ul);
-	err = au_rdu_verify(&rdu);
-	if (unlikely(err))
-		goto out;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-		err = au_rdu(file, &rdu);
-		if (unlikely(err))
-			break;
-
-		rdu.ent.ul = ptr_to_compat(rdu.ent.e);
-		rdu.tail.ul = ptr_to_compat(rdu.tail.e);
-		e = copy_to_user(p, &rdu, sizeof(rdu));
-		if (unlikely(e)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-		}
-		break;
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_ino(file, &rdu);
-		break;
-
-	default:
-		/* err = -ENOTTY; */
-		err = -EINVAL;
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-#endif
diff --git a/fs/aufs/rwsem.h b/fs/aufs/rwsem.h
deleted file mode 100644
index deb813f3e..000000000
--- a/fs/aufs/rwsem.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * simple read-write semaphore wrappers
- */
-
-#ifndef __AUFS_RWSEM_H__
-#define __AUFS_RWSEM_H__
-
-#ifdef __KERNEL__
-
-#include "debug.h"
-
-struct au_rwsem {
-	struct rw_semaphore	rwsem;
-#ifdef CONFIG_AUFS_DEBUG
-	/* just for debugging, not almighty counter */
-	atomic_t		rcnt, wcnt;
-#endif
-};
-
-#ifdef CONFIG_AUFS_DEBUG
-#define AuDbgCntInit(rw) do { \
-	atomic_set(&(rw)->rcnt, 0); \
-	atomic_set(&(rw)->wcnt, 0); \
-	smp_mb(); /* atomic set */ \
-} while (0)
-
-#define AuDbgRcntInc(rw)	atomic_inc(&(rw)->rcnt)
-#define AuDbgRcntDec(rw)	WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0)
-#define AuDbgWcntInc(rw)	atomic_inc(&(rw)->wcnt)
-#define AuDbgWcntDec(rw)	WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0)
-#else
-#define AuDbgCntInit(rw)	do {} while (0)
-#define AuDbgRcntInc(rw)	do {} while (0)
-#define AuDbgRcntDec(rw)	do {} while (0)
-#define AuDbgWcntInc(rw)	do {} while (0)
-#define AuDbgWcntDec(rw)	do {} while (0)
-#endif /* CONFIG_AUFS_DEBUG */
-
-/* to debug easier, do not make them inlined functions */
-#define AuRwMustNoWaiters(rw)	AuDebugOn(!list_empty(&(rw)->rwsem.wait_list))
-/* rwsem_is_locked() is unusable */
-#define AuRwMustReadLock(rw)	AuDebugOn(atomic_read(&(rw)->rcnt) <= 0)
-#define AuRwMustWriteLock(rw)	AuDebugOn(atomic_read(&(rw)->wcnt) <= 0)
-#define AuRwMustAnyLock(rw)	AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \
-					&& atomic_read(&(rw)->wcnt) <= 0)
-#define AuRwDestroy(rw)		AuDebugOn(atomic_read(&(rw)->rcnt) \
-					|| atomic_read(&(rw)->wcnt))
-
-#define au_rw_class(rw, key)	lockdep_set_class(&(rw)->rwsem, key)
-
-static inline void au_rw_init(struct au_rwsem *rw)
-{
-	AuDbgCntInit(rw);
-	init_rwsem(&rw->rwsem);
-}
-
-static inline void au_rw_init_wlock(struct au_rwsem *rw)
-{
-	au_rw_init(rw);
-	down_write(&rw->rwsem);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_init_wlock_nested(struct au_rwsem *rw,
-					   unsigned int lsc)
-{
-	au_rw_init(rw);
-	down_write_nested(&rw->rwsem, lsc);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_read_lock(struct au_rwsem *rw)
-{
-	down_read(&rw->rwsem);
-	AuDbgRcntInc(rw);
-}
-
-static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc)
-{
-	down_read_nested(&rw->rwsem, lsc);
-	AuDbgRcntInc(rw);
-}
-
-static inline void au_rw_read_unlock(struct au_rwsem *rw)
-{
-	AuRwMustReadLock(rw);
-	AuDbgRcntDec(rw);
-	up_read(&rw->rwsem);
-}
-
-static inline void au_rw_dgrade_lock(struct au_rwsem *rw)
-{
-	AuRwMustWriteLock(rw);
-	AuDbgRcntInc(rw);
-	AuDbgWcntDec(rw);
-	downgrade_write(&rw->rwsem);
-}
-
-static inline void au_rw_write_lock(struct au_rwsem *rw)
-{
-	down_write(&rw->rwsem);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_write_lock_nested(struct au_rwsem *rw,
-					   unsigned int lsc)
-{
-	down_write_nested(&rw->rwsem, lsc);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_write_unlock(struct au_rwsem *rw)
-{
-	AuRwMustWriteLock(rw);
-	AuDbgWcntDec(rw);
-	up_write(&rw->rwsem);
-}
-
-/* why is not _nested version defined */
-static inline int au_rw_read_trylock(struct au_rwsem *rw)
-{
-	int ret;
-
-	ret = down_read_trylock(&rw->rwsem);
-	if (ret)
-		AuDbgRcntInc(rw);
-	return ret;
-}
-
-static inline int au_rw_write_trylock(struct au_rwsem *rw)
-{
-	int ret;
-
-	ret = down_write_trylock(&rw->rwsem);
-	if (ret)
-		AuDbgWcntInc(rw);
-	return ret;
-}
-
-#undef AuDbgCntInit
-#undef AuDbgRcntInc
-#undef AuDbgRcntDec
-#undef AuDbgWcntInc
-#undef AuDbgWcntDec
-
-#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
-static inline void prefix##_read_lock(param) \
-{ au_rw_read_lock(rwsem); } \
-static inline void prefix##_write_lock(param) \
-{ au_rw_write_lock(rwsem); } \
-static inline int prefix##_read_trylock(param) \
-{ return au_rw_read_trylock(rwsem); } \
-static inline int prefix##_write_trylock(param) \
-{ return au_rw_write_trylock(rwsem); }
-/* why is not _nested version defined */
-/* static inline void prefix##_read_trylock_nested(param, lsc)
-{ au_rw_read_trylock_nested(rwsem, lsc)); }
-static inline void prefix##_write_trylock_nestd(param, lsc)
-{ au_rw_write_trylock_nested(rwsem, lsc); } */
-
-#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \
-static inline void prefix##_read_unlock(param) \
-{ au_rw_read_unlock(rwsem); } \
-static inline void prefix##_write_unlock(param) \
-{ au_rw_write_unlock(rwsem); } \
-static inline void prefix##_downgrade_lock(param) \
-{ au_rw_dgrade_lock(rwsem); }
-
-#define AuSimpleRwsemFuncs(prefix, param, rwsem) \
-	AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
-	AuSimpleUnlockRwsemFuncs(prefix, param, rwsem)
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_RWSEM_H__ */
diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c
deleted file mode 100644
index 4e59efc08..000000000
--- a/fs/aufs/sbinfo.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * superblock private data
- */
-
-#include "aufs.h"
-
-/*
- * they are necessary regardless sysfs is disabled.
- */
-void au_si_free(struct kobject *kobj)
-{
-	int i;
-	struct au_sbinfo *sbinfo;
-	char *locked __maybe_unused; /* debug only */
-
-	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
-	for (i = 0; i < AuPlink_NHASH; i++)
-		AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head));
-	AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len));
-
-	au_rw_write_lock(&sbinfo->si_rwsem);
-	au_br_free(sbinfo);
-	au_rw_write_unlock(&sbinfo->si_rwsem);
-
-	AuDebugOn(radix_tree_gang_lookup
-		  (&sbinfo->au_si_pid.tree, (void **)&locked,
-		   /*first_index*/PID_MAX_DEFAULT - 1,
-		   /*max_items*/sizeof(locked)/sizeof(*locked)));
-
-	kfree(sbinfo->si_branch);
-	kfree(sbinfo->au_si_pid.bitmap);
-	mutex_destroy(&sbinfo->si_xib_mtx);
-	AuRwDestroy(&sbinfo->si_rwsem);
-
-	kfree(sbinfo);
-}
-
-int au_si_alloc(struct super_block *sb)
-{
-	int err, i;
-	struct au_sbinfo *sbinfo;
-	static struct lock_class_key aufs_si;
-
-	err = -ENOMEM;
-	sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS);
-	if (unlikely(!sbinfo))
-		goto out;
-
-	BUILD_BUG_ON(sizeof(unsigned long) !=
-		     sizeof(*sbinfo->au_si_pid.bitmap));
-	sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT),
-					sizeof(*sbinfo->au_si_pid.bitmap),
-					GFP_NOFS);
-	if (unlikely(!sbinfo->au_si_pid.bitmap))
-		goto out_sbinfo;
-
-	/* will be reallocated separately */
-	sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS);
-	if (unlikely(!sbinfo->si_branch))
-		goto out_pidmap;
-
-	err = sysaufs_si_init(sbinfo);
-	if (unlikely(err))
-		goto out_br;
-
-	au_nwt_init(&sbinfo->si_nowait);
-	au_rw_init_wlock(&sbinfo->si_rwsem);
-	au_rw_class(&sbinfo->si_rwsem, &aufs_si);
-	spin_lock_init(&sbinfo->au_si_pid.tree_lock);
-	INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL);
-
-	atomic_long_set(&sbinfo->si_ninodes, 0);
-	atomic_long_set(&sbinfo->si_nfiles, 0);
-
-	sbinfo->si_bend = -1;
-	sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2;
-
-	sbinfo->si_wbr_copyup = AuWbrCopyup_Def;
-	sbinfo->si_wbr_create = AuWbrCreate_Def;
-	sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup;
-	sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create;
-
-	au_fhsm_init(sbinfo);
-
-	sbinfo->si_mntflags = au_opts_plink(AuOpt_Def);
-
-	sbinfo->si_xino_jiffy = jiffies;
-	sbinfo->si_xino_expire
-		= msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC);
-	mutex_init(&sbinfo->si_xib_mtx);
-	sbinfo->si_xino_brid = -1;
-	/* leave si_xib_last_pindex and si_xib_next_bit */
-
-	au_sphl_init(&sbinfo->si_aopen);
-
-	sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC);
-	sbinfo->si_rdblk = AUFS_RDBLK_DEF;
-	sbinfo->si_rdhash = AUFS_RDHASH_DEF;
-	sbinfo->si_dirwh = AUFS_DIRWH_DEF;
-
-	for (i = 0; i < AuPlink_NHASH; i++)
-		au_sphl_init(sbinfo->si_plink + i);
-	init_waitqueue_head(&sbinfo->si_plink_wq);
-	spin_lock_init(&sbinfo->si_plink_maint_lock);
-
-	au_sphl_init(&sbinfo->si_files);
-
-	/* leave other members for sysaufs and si_mnt. */
-	sbinfo->si_sb = sb;
-	sb->s_fs_info = sbinfo;
-	si_pid_set(sb);
-	return 0; /* success */
-
-out_br:
-	kfree(sbinfo->si_branch);
-out_pidmap:
-	kfree(sbinfo->au_si_pid.bitmap);
-out_sbinfo:
-	kfree(sbinfo);
-out:
-	return err;
-}
-
-int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr)
-{
-	int err, sz;
-	struct au_branch **brp;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	err = -ENOMEM;
-	sz = sizeof(*brp) * (sbinfo->si_bend + 1);
-	if (unlikely(!sz))
-		sz = sizeof(*brp);
-	brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS);
-	if (brp) {
-		sbinfo->si_branch = brp;
-		err = 0;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-unsigned int au_sigen_inc(struct super_block *sb)
-{
-	unsigned int gen;
-	struct inode *inode;
-
-	SiMustWriteLock(sb);
-
-	gen = ++au_sbi(sb)->si_generation;
-	au_update_digen(sb->s_root);
-	inode = d_inode(sb->s_root);
-	au_update_iigen(inode, /*half*/0);
-	inode->i_version++;
-	return gen;
-}
-
-aufs_bindex_t au_new_br_id(struct super_block *sb)
-{
-	aufs_bindex_t br_id;
-	int i;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	for (i = 0; i <= AUFS_BRANCH_MAX; i++) {
-		br_id = ++sbinfo->si_last_br_id;
-		AuDebugOn(br_id < 0);
-		if (br_id && au_br_index(sb, br_id) < 0)
-			return br_id;
-	}
-
-	return -1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* it is ok that new 'nwt' tasks are appended while we are sleeping */
-int si_read_lock(struct super_block *sb, int flags)
-{
-	int err;
-
-	err = 0;
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-
-	si_noflush_read_lock(sb);
-	err = au_plink_maint(sb, flags);
-	if (unlikely(err))
-		si_read_unlock(sb);
-
-	return err;
-}
-
-int si_write_lock(struct super_block *sb, int flags)
-{
-	int err;
-
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-
-	si_noflush_write_lock(sb);
-	err = au_plink_maint(sb, flags);
-	if (unlikely(err))
-		si_write_unlock(sb);
-
-	return err;
-}
-
-/* dentry and super_block lock. call at entry point */
-int aufs_read_lock(struct dentry *dentry, int flags)
-{
-	int err;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, flags);
-	if (unlikely(err))
-		goto out;
-
-	if (au_ftest_lock(flags, DW))
-		di_write_lock_child(dentry);
-	else
-		di_read_lock_child(dentry, flags);
-
-	if (au_ftest_lock(flags, GEN)) {
-		err = au_digen_test(dentry, au_sigen(sb));
-		AuDebugOn(!err && au_dbrange_test(dentry));
-		if (unlikely(err))
-			aufs_read_unlock(dentry, flags);
-	}
-
-out:
-	return err;
-}
-
-void aufs_read_unlock(struct dentry *dentry, int flags)
-{
-	if (au_ftest_lock(flags, DW))
-		di_write_unlock(dentry);
-	else
-		di_read_unlock(dentry, flags);
-	si_read_unlock(dentry->d_sb);
-}
-
-void aufs_write_lock(struct dentry *dentry)
-{
-	si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW);
-	di_write_lock_child(dentry);
-}
-
-void aufs_write_unlock(struct dentry *dentry)
-{
-	di_write_unlock(dentry);
-	si_write_unlock(dentry->d_sb);
-}
-
-int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags)
-{
-	int err;
-	unsigned int sigen;
-	struct super_block *sb;
-
-	sb = d1->d_sb;
-	err = si_read_lock(sb, flags);
-	if (unlikely(err))
-		goto out;
-
-	di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR));
-
-	if (au_ftest_lock(flags, GEN)) {
-		sigen = au_sigen(sb);
-		err = au_digen_test(d1, sigen);
-		AuDebugOn(!err && au_dbrange_test(d1));
-		if (!err) {
-			err = au_digen_test(d2, sigen);
-			AuDebugOn(!err && au_dbrange_test(d2));
-		}
-		if (unlikely(err))
-			aufs_read_and_write_unlock2(d1, d2);
-	}
-
-out:
-	return err;
-}
-
-void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2)
-{
-	di_write_unlock2(d1, d2);
-	si_read_unlock(d1->d_sb);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int si_pid_test_slow(struct super_block *sb)
-{
-	void *p;
-
-	rcu_read_lock();
-	p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid);
-	rcu_read_unlock();
-
-	return (long)!!p;
-}
-
-void si_pid_set_slow(struct super_block *sb)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(si_pid_test_slow(sb));
-
-	sbinfo = au_sbi(sb);
-	err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
-	AuDebugOn(err);
-	spin_lock(&sbinfo->au_si_pid.tree_lock);
-	err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid,
-				/*any valid ptr*/sb);
-	spin_unlock(&sbinfo->au_si_pid.tree_lock);
-	AuDebugOn(err);
-	radix_tree_preload_end();
-}
-
-void si_pid_clr_slow(struct super_block *sb)
-{
-	void *p;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(!si_pid_test_slow(sb));
-
-	sbinfo = au_sbi(sb);
-	spin_lock(&sbinfo->au_si_pid.tree_lock);
-	p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid);
-	spin_unlock(&sbinfo->au_si_pid.tree_lock);
-}
diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h
deleted file mode 100644
index 94f09f298..000000000
--- a/fs/aufs/spl.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * simple list protected by a spinlock
- */
-
-#ifndef __AUFS_SPL_H__
-#define __AUFS_SPL_H__
-
-#ifdef __KERNEL__
-
-struct au_splhead {
-	spinlock_t		spin;
-	struct list_head	head;
-};
-
-static inline void au_spl_init(struct au_splhead *spl)
-{
-	spin_lock_init(&spl->spin);
-	INIT_LIST_HEAD(&spl->head);
-}
-
-static inline void au_spl_add(struct list_head *list, struct au_splhead *spl)
-{
-	spin_lock(&spl->spin);
-	list_add(list, &spl->head);
-	spin_unlock(&spl->spin);
-}
-
-static inline void au_spl_del(struct list_head *list, struct au_splhead *spl)
-{
-	spin_lock(&spl->spin);
-	list_del(list);
-	spin_unlock(&spl->spin);
-}
-
-static inline void au_spl_del_rcu(struct list_head *list,
-				  struct au_splhead *spl)
-{
-	spin_lock(&spl->spin);
-	list_del_rcu(list);
-	spin_unlock(&spl->spin);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_sphlhead {
-	spinlock_t		spin;
-	struct hlist_head	head;
-};
-
-static inline void au_sphl_init(struct au_sphlhead *sphl)
-{
-	spin_lock_init(&sphl->spin);
-	INIT_HLIST_HEAD(&sphl->head);
-}
-
-static inline void au_sphl_add(struct hlist_node *hlist,
-			       struct au_sphlhead *sphl)
-{
-	spin_lock(&sphl->spin);
-	hlist_add_head(hlist, &sphl->head);
-	spin_unlock(&sphl->spin);
-}
-
-static inline void au_sphl_del(struct hlist_node *hlist,
-			       struct au_sphlhead *sphl)
-{
-	spin_lock(&sphl->spin);
-	hlist_del(hlist);
-	spin_unlock(&sphl->spin);
-}
-
-static inline void au_sphl_del_rcu(struct hlist_node *hlist,
-				   struct au_sphlhead *sphl)
-{
-	spin_lock(&sphl->spin);
-	hlist_del_rcu(hlist);
-	spin_unlock(&sphl->spin);
-}
-
-static inline unsigned long au_sphl_count(struct au_sphlhead *sphl)
-{
-	unsigned long cnt;
-	struct hlist_node *pos;
-
-	cnt = 0;
-	spin_lock(&sphl->spin);
-	hlist_for_each(pos, &sphl->head)
-		cnt++;
-	spin_unlock(&sphl->spin);
-	return cnt;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_SPL_H__ */
diff --git a/fs/aufs/super.c b/fs/aufs/super.c
deleted file mode 100644
index 6b7f38e48..000000000
--- a/fs/aufs/super.c
+++ /dev/null
@@ -1,1004 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * mount and super_block operations
- */
-
-#include <linux/mm.h>
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include <linux/vmalloc.h>
-#include "aufs.h"
-
-/*
- * super_operations
- */
-static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused)
-{
-	struct au_icntnr *c;
-
-	c = au_cache_alloc_icntnr();
-	if (c) {
-		au_icntnr_init(c);
-		c->vfs_inode.i_version = 1; /* sigen(sb); */
-		c->iinfo.ii_hinode = NULL;
-		return &c->vfs_inode;
-	}
-	return NULL;
-}
-
-static void aufs_destroy_inode_cb(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
-	INIT_HLIST_HEAD(&inode->i_dentry);
-	au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode));
-}
-
-static void aufs_destroy_inode(struct inode *inode)
-{
-	au_iinfo_fin(inode);
-	call_rcu(&inode->i_rcu, aufs_destroy_inode_cb);
-}
-
-struct inode *au_iget_locked(struct super_block *sb, ino_t ino)
-{
-	struct inode *inode;
-	int err;
-
-	inode = iget_locked(sb, ino);
-	if (unlikely(!inode)) {
-		inode = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	if (!(inode->i_state & I_NEW))
-		goto out;
-
-	err = au_xigen_new(inode);
-	if (!err)
-		err = au_iinfo_init(inode);
-	if (!err)
-		inode->i_version++;
-	else {
-		iget_failed(inode);
-		inode = ERR_PTR(err);
-	}
-
-out:
-	/* never return NULL */
-	AuDebugOn(!inode);
-	AuTraceErrPtr(inode);
-	return inode;
-}
-
-/* lock free root dinfo */
-static int au_show_brs(struct seq_file *seq, struct super_block *sb)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	struct path path;
-	struct au_hdentry *hdp;
-	struct au_branch *br;
-	au_br_perm_str_t perm;
-
-	err = 0;
-	bend = au_sbend(sb);
-	hdp = au_di(sb->s_root)->di_hdentry;
-	for (bindex = 0; !err && bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		path.mnt = au_br_mnt(br);
-		path.dentry = hdp[bindex].hd_dentry;
-		err = au_seq_path(seq, &path);
-		if (err > 0) {
-			au_optstr_br_perm(&perm, br->br_perm);
-			err = seq_printf(seq, "=%s", perm.a);
-			if (err == -1)
-				err = -E2BIG;
-		}
-		if (!err && bindex != bend)
-			err = seq_putc(seq, ':');
-	}
-
-	return err;
-}
-
-static void au_show_wbr_create(struct seq_file *m, int v,
-			       struct au_sbinfo *sbinfo)
-{
-	const char *pat;
-
-	AuRwMustAnyLock(&sbinfo->si_rwsem);
-
-	seq_puts(m, ",create=");
-	pat = au_optstr_wbr_create(v);
-	switch (v) {
-	case AuWbrCreate_TDP:
-	case AuWbrCreate_RR:
-	case AuWbrCreate_MFS:
-	case AuWbrCreate_PMFS:
-		seq_puts(m, pat);
-		break;
-	case AuWbrCreate_MFSV:
-		seq_printf(m, /*pat*/"mfs:%lu",
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	case AuWbrCreate_PMFSV:
-		seq_printf(m, /*pat*/"pmfs:%lu",
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	case AuWbrCreate_MFSRR:
-		seq_printf(m, /*pat*/"mfsrr:%llu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark);
-		break;
-	case AuWbrCreate_MFSRRV:
-		seq_printf(m, /*pat*/"mfsrr:%llu:%lu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark,
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	case AuWbrCreate_PMFSRR:
-		seq_printf(m, /*pat*/"pmfsrr:%llu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark);
-		break;
-	case AuWbrCreate_PMFSRRV:
-		seq_printf(m, /*pat*/"pmfsrr:%llu:%lu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark,
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	}
-}
-
-static int au_show_xino(struct seq_file *seq, struct super_block *sb)
-{
-#ifdef CONFIG_SYSFS
-	return 0;
-#else
-	int err;
-	const int len = sizeof(AUFS_XINO_FNAME) - 1;
-	aufs_bindex_t bindex, brid;
-	struct qstr *name;
-	struct file *f;
-	struct dentry *d, *h_root;
-	struct au_hdentry *hdp;
-
-	AuRwMustAnyLock(&sbinfo->si_rwsem);
-
-	err = 0;
-	f = au_sbi(sb)->si_xib;
-	if (!f)
-		goto out;
-
-	/* stop printing the default xino path on the first writable branch */
-	h_root = NULL;
-	brid = au_xino_brid(sb);
-	if (brid >= 0) {
-		bindex = au_br_index(sb, brid);
-		hdp = au_di(sb->s_root)->di_hdentry;
-		h_root = hdp[0 + bindex].hd_dentry;
-	}
-	d = f->f_path.dentry;
-	name = &d->d_name;
-	/* safe ->d_parent because the file is unlinked */
-	if (d->d_parent == h_root
-	    && name->len == len
-	    && !memcmp(name->name, AUFS_XINO_FNAME, len))
-		goto out;
-
-	seq_puts(seq, ",xino=");
-	err = au_xino_path(seq, f);
-
-out:
-	return err;
-#endif
-}
-
-/* seq_file will re-call me in case of too long string */
-static int aufs_show_options(struct seq_file *m, struct dentry *dentry)
-{
-	int err;
-	unsigned int mnt_flags, v;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-#define AuBool(name, str) do { \
-	v = au_opt_test(mnt_flags, name); \
-	if (v != au_opt_test(AuOpt_Def, name)) \
-		seq_printf(m, ",%s" #str, v ? "" : "no"); \
-} while (0)
-
-#define AuStr(name, str) do { \
-	v = mnt_flags & AuOptMask_##name; \
-	if (v != (AuOpt_Def & AuOptMask_##name)) \
-		seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \
-} while (0)
-
-#define AuUInt(name, str, val) do { \
-	if (val != AUFS_##name##_DEF) \
-		seq_printf(m, "," #str "=%u", val); \
-} while (0)
-
-	sb = dentry->d_sb;
-	if (sb->s_flags & MS_POSIXACL)
-		seq_puts(m, ",acl");
-
-	/* lock free root dinfo */
-	si_noflush_read_lock(sb);
-	sbinfo = au_sbi(sb);
-	seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo));
-
-	mnt_flags = au_mntflags(sb);
-	if (au_opt_test(mnt_flags, XINO)) {
-		err = au_show_xino(m, sb);
-		if (unlikely(err))
-			goto out;
-	} else
-		seq_puts(m, ",noxino");
-
-	AuBool(TRUNC_XINO, trunc_xino);
-	AuStr(UDBA, udba);
-	AuBool(SHWH, shwh);
-	AuBool(PLINK, plink);
-	AuBool(DIO, dio);
-	AuBool(DIRPERM1, dirperm1);
-
-	v = sbinfo->si_wbr_create;
-	if (v != AuWbrCreate_Def)
-		au_show_wbr_create(m, v, sbinfo);
-
-	v = sbinfo->si_wbr_copyup;
-	if (v != AuWbrCopyup_Def)
-		seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v));
-
-	v = au_opt_test(mnt_flags, ALWAYS_DIROPQ);
-	if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ))
-		seq_printf(m, ",diropq=%c", v ? 'a' : 'w');
-
-	AuUInt(DIRWH, dirwh, sbinfo->si_dirwh);
-
-	v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC;
-	AuUInt(RDCACHE, rdcache, v);
-
-	AuUInt(RDBLK, rdblk, sbinfo->si_rdblk);
-	AuUInt(RDHASH, rdhash, sbinfo->si_rdhash);
-
-	au_fhsm_show(m, sbinfo);
-
-	AuBool(SUM, sum);
-	/* AuBool(SUM_W, wsum); */
-	AuBool(WARN_PERM, warn_perm);
-	AuBool(VERBOSE, verbose);
-
-out:
-	/* be sure to print "br:" last */
-	if (!sysaufs_brs) {
-		seq_puts(m, ",br:");
-		au_show_brs(m, sb);
-	}
-	si_read_unlock(sb);
-	return 0;
-
-#undef AuBool
-#undef AuStr
-#undef AuUInt
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* sum mode which returns the summation for statfs(2) */
-
-static u64 au_add_till_max(u64 a, u64 b)
-{
-	u64 old;
-
-	old = a;
-	a += b;
-	if (old <= a)
-		return a;
-	return ULLONG_MAX;
-}
-
-static u64 au_mul_till_max(u64 a, long mul)
-{
-	u64 old;
-
-	old = a;
-	a *= mul;
-	if (old <= a)
-		return a;
-	return ULLONG_MAX;
-}
-
-static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf)
-{
-	int err;
-	long bsize, factor;
-	u64 blocks, bfree, bavail, files, ffree;
-	aufs_bindex_t bend, bindex, i;
-	unsigned char shared;
-	struct path h_path;
-	struct super_block *h_sb;
-
-	err = 0;
-	bsize = LONG_MAX;
-	files = 0;
-	ffree = 0;
-	blocks = 0;
-	bfree = 0;
-	bavail = 0;
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		h_path.mnt = au_sbr_mnt(sb, bindex);
-		h_sb = h_path.mnt->mnt_sb;
-		shared = 0;
-		for (i = 0; !shared && i < bindex; i++)
-			shared = (au_sbr_sb(sb, i) == h_sb);
-		if (shared)
-			continue;
-
-		/* sb->s_root for NFS is unreliable */
-		h_path.dentry = h_path.mnt->mnt_root;
-		err = vfs_statfs(&h_path, buf);
-		if (unlikely(err))
-			goto out;
-
-		if (bsize > buf->f_bsize) {
-			/*
-			 * we will reduce bsize, so we have to expand blocks
-			 * etc. to match them again
-			 */
-			factor = (bsize / buf->f_bsize);
-			blocks = au_mul_till_max(blocks, factor);
-			bfree = au_mul_till_max(bfree, factor);
-			bavail = au_mul_till_max(bavail, factor);
-			bsize = buf->f_bsize;
-		}
-
-		factor = (buf->f_bsize / bsize);
-		blocks = au_add_till_max(blocks,
-				au_mul_till_max(buf->f_blocks, factor));
-		bfree = au_add_till_max(bfree,
-				au_mul_till_max(buf->f_bfree, factor));
-		bavail = au_add_till_max(bavail,
-				au_mul_till_max(buf->f_bavail, factor));
-		files = au_add_till_max(files, buf->f_files);
-		ffree = au_add_till_max(ffree, buf->f_ffree);
-	}
-
-	buf->f_bsize = bsize;
-	buf->f_blocks = blocks;
-	buf->f_bfree = bfree;
-	buf->f_bavail = bavail;
-	buf->f_files = files;
-	buf->f_ffree = ffree;
-	buf->f_frsize = 0;
-
-out:
-	return err;
-}
-
-static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int err;
-	struct path h_path;
-	struct super_block *sb;
-
-	/* lock free root dinfo */
-	sb = dentry->d_sb;
-	si_noflush_read_lock(sb);
-	if (!au_opt_test(au_mntflags(sb), SUM)) {
-		/* sb->s_root for NFS is unreliable */
-		h_path.mnt = au_sbr_mnt(sb, 0);
-		h_path.dentry = h_path.mnt->mnt_root;
-		err = vfs_statfs(&h_path, buf);
-	} else
-		err = au_statfs_sum(sb, buf);
-	si_read_unlock(sb);
-
-	if (!err) {
-		buf->f_type = AUFS_SUPER_MAGIC;
-		buf->f_namelen = AUFS_MAX_NAMELEN;
-		memset(&buf->f_fsid, 0, sizeof(buf->f_fsid));
-	}
-	/* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_sync_fs(struct super_block *sb, int wait)
-{
-	int err, e;
-	aufs_bindex_t bend, bindex;
-	struct au_branch *br;
-	struct super_block *h_sb;
-
-	err = 0;
-	si_noflush_read_lock(sb);
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (!au_br_writable(br->br_perm))
-			continue;
-
-		h_sb = au_sbr_sb(sb, bindex);
-		if (h_sb->s_op->sync_fs) {
-			e = h_sb->s_op->sync_fs(h_sb, wait);
-			if (unlikely(e && !err))
-				err = e;
-			/* go on even if an error happens */
-		}
-	}
-	si_read_unlock(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* final actions when unmounting a file system */
-static void aufs_put_super(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = au_sbi(sb);
-	if (!sbinfo)
-		return;
-
-	dbgaufs_si_fin(sbinfo);
-	kobject_put(&sbinfo->si_kobj);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_array_free(void *array)
-{
-	if (array) {
-		if (!is_vmalloc_addr(array))
-			kfree(array);
-		else
-			vfree(array);
-	}
-}
-
-void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg)
-{
-	void *array;
-	unsigned long long n, sz;
-
-	array = NULL;
-	n = 0;
-	if (!*hint)
-		goto out;
-
-	if (*hint > ULLONG_MAX / sizeof(array)) {
-		array = ERR_PTR(-EMFILE);
-		pr_err("hint %llu\n", *hint);
-		goto out;
-	}
-
-	sz = sizeof(array) * *hint;
-	array = kzalloc(sz, GFP_NOFS);
-	if (unlikely(!array))
-		array = vzalloc(sz);
-	if (unlikely(!array)) {
-		array = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	n = cb(array, *hint, arg);
-	AuDebugOn(n > *hint);
-
-out:
-	*hint = n;
-	return array;
-}
-
-static unsigned long long au_iarray_cb(void *a,
-				       unsigned long long max __maybe_unused,
-				       void *arg)
-{
-	unsigned long long n;
-	struct inode **p, *inode;
-	struct list_head *head;
-
-	n = 0;
-	p = a;
-	head = arg;
-	spin_lock(&inode_sb_list_lock);
-	list_for_each_entry(inode, head, i_sb_list) {
-		if (!is_bad_inode(inode)
-		    && au_ii(inode)->ii_bstart >= 0) {
-			spin_lock(&inode->i_lock);
-			if (atomic_read(&inode->i_count)) {
-				au_igrab(inode);
-				*p++ = inode;
-				n++;
-				AuDebugOn(n > max);
-			}
-			spin_unlock(&inode->i_lock);
-		}
-	}
-	spin_unlock(&inode_sb_list_lock);
-
-	return n;
-}
-
-struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max)
-{
-	*max = atomic_long_read(&au_sbi(sb)->si_ninodes);
-	return au_array_alloc(max, au_iarray_cb, &sb->s_inodes);
-}
-
-void au_iarray_free(struct inode **a, unsigned long long max)
-{
-	unsigned long long ull;
-
-	for (ull = 0; ull < max; ull++)
-		iput(a[ull]);
-	au_array_free(a);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * refresh dentry and inode at remount time.
- */
-/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */
-static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags,
-		      struct dentry *parent)
-{
-	int err;
-
-	di_write_lock_child(dentry);
-	di_read_lock_parent(parent, AuLock_IR);
-	err = au_refresh_dentry(dentry, parent);
-	if (!err && dir_flags)
-		au_hn_reset(d_inode(dentry), dir_flags);
-	di_read_unlock(parent, AuLock_IR);
-	di_write_unlock(dentry);
-
-	return err;
-}
-
-static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen,
-			   struct au_sbinfo *sbinfo,
-			   const unsigned int dir_flags)
-{
-	int err;
-	struct dentry *parent;
-
-	err = 0;
-	parent = dget_parent(dentry);
-	if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) {
-		if (d_really_is_positive(dentry)) {
-			if (!d_is_dir(dentry))
-				err = au_do_refresh(dentry, /*dir_flags*/0,
-						 parent);
-			else {
-				err = au_do_refresh(dentry, dir_flags, parent);
-				if (unlikely(err))
-					au_fset_si(sbinfo, FAILED_REFRESH_DIR);
-			}
-		} else
-			err = au_do_refresh(dentry, /*dir_flags*/0, parent);
-		AuDbgDentry(dentry);
-	}
-	dput(parent);
-
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_refresh_d(struct super_block *sb)
-{
-	int err, i, j, ndentry, e;
-	unsigned int sigen;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries, *d;
-	struct au_sbinfo *sbinfo;
-	struct dentry *root = sb->s_root;
-	const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1);
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, root, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	sigen = au_sigen(sb);
-	sbinfo = au_sbi(sb);
-	for (i = 0; i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		ndentry = dpage->ndentry;
-		for (j = 0; j < ndentry; j++) {
-			d = dentries[j];
-			e = au_do_refresh_d(d, sigen, sbinfo, dir_flags);
-			if (unlikely(e && !err))
-				err = e;
-			/* go on even err */
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static int au_refresh_i(struct super_block *sb)
-{
-	int err, e;
-	unsigned int sigen;
-	unsigned long long max, ull;
-	struct inode *inode, **array;
-
-	array = au_iarray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	err = 0;
-	sigen = au_sigen(sb);
-	for (ull = 0; ull < max; ull++) {
-		inode = array[ull];
-		if (unlikely(!inode))
-			break;
-		if (au_iigen(inode, NULL) != sigen) {
-			ii_write_lock_child(inode);
-			e = au_refresh_hinode_self(inode);
-			ii_write_unlock(inode);
-			if (unlikely(e)) {
-				pr_err("error %d, i%lu\n", e, inode->i_ino);
-				if (!err)
-					err = e;
-				/* go on even if err */
-			}
-		}
-	}
-
-	au_iarray_free(array, max);
-
-out:
-	return err;
-}
-
-static void au_remount_refresh(struct super_block *sb)
-{
-	int err, e;
-	unsigned int udba;
-	aufs_bindex_t bindex, bend;
-	struct dentry *root;
-	struct inode *inode;
-	struct au_branch *br;
-
-	au_sigen_inc(sb);
-	au_fclr_si(au_sbi(sb), FAILED_REFRESH_DIR);
-
-	root = sb->s_root;
-	DiMustNoWaiters(root);
-	inode = d_inode(root);
-	IiMustNoWaiters(inode);
-
-	udba = au_opt_udba(sb);
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		err = au_hnotify_reset_br(udba, br, br->br_perm);
-		if (unlikely(err))
-			AuIOErr("hnotify failed on br %d, %d, ignored\n",
-				bindex, err);
-		/* go on even if err */
-	}
-	au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1));
-
-	di_write_unlock(root);
-	err = au_refresh_d(sb);
-	e = au_refresh_i(sb);
-	if (unlikely(e && !err))
-		err = e;
-	/* aufs_write_lock() calls ..._child() */
-	di_write_lock_child(root);
-
-	au_cpup_attr_all(inode, /*force*/1);
-
-	if (unlikely(err))
-		AuIOErr("refresh failed, ignored, %d\n", err);
-}
-
-/* stop extra interpretation of errno in mount(8), and strange error messages */
-static int cvt_err(int err)
-{
-	AuTraceErr(err);
-
-	switch (err) {
-	case -ENOENT:
-	case -ENOTDIR:
-	case -EEXIST:
-	case -EIO:
-		err = -EINVAL;
-	}
-	return err;
-}
-
-static int aufs_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	int err, do_dx;
-	unsigned int mntflags;
-	struct au_opts opts;
-	struct dentry *root;
-	struct inode *inode;
-	struct au_sbinfo *sbinfo;
-
-	err = 0;
-	root = sb->s_root;
-	if (!data || !*data) {
-		err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-		if (!err) {
-			di_write_lock_child(root);
-			err = au_opts_verify(sb, *flags, /*pending*/0);
-			aufs_write_unlock(root);
-		}
-		goto out;
-	}
-
-	err = -ENOMEM;
-	memset(&opts, 0, sizeof(opts));
-	opts.opt = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!opts.opt))
-		goto out;
-	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
-	opts.flags = AuOpts_REMOUNT;
-	opts.sb_flags = *flags;
-
-	/* parse it before aufs lock */
-	err = au_opts_parse(sb, data, &opts);
-	if (unlikely(err))
-		goto out_opts;
-
-	sbinfo = au_sbi(sb);
-	inode = d_inode(root);
-	mutex_lock(&inode->i_mutex);
-	err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_mtx;
-	di_write_lock_child(root);
-
-	/* au_opts_remount() may return an error */
-	err = au_opts_remount(sb, &opts);
-	au_opts_free(&opts);
-
-	if (au_ftest_opts(opts.flags, REFRESH))
-		au_remount_refresh(sb);
-
-	if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) {
-		mntflags = au_mntflags(sb);
-		do_dx = !!au_opt_test(mntflags, DIO);
-		au_dy_arefresh(do_dx);
-	}
-
-	au_fhsm_wrote_all(sb, /*force*/1); /* ?? */
-	aufs_write_unlock(root);
-
-out_mtx:
-	mutex_unlock(&inode->i_mutex);
-out_opts:
-	free_page((unsigned long)opts.opt);
-out:
-	err = cvt_err(err);
-	AuTraceErr(err);
-	return err;
-}
-
-static const struct super_operations aufs_sop = {
-	.alloc_inode	= aufs_alloc_inode,
-	.destroy_inode	= aufs_destroy_inode,
-	/* always deleting, no clearing */
-	.drop_inode	= generic_delete_inode,
-	.show_options	= aufs_show_options,
-	.statfs		= aufs_statfs,
-	.put_super	= aufs_put_super,
-	.sync_fs	= aufs_sync_fs,
-	.remount_fs	= aufs_remount_fs
-};
-
-/* ---------------------------------------------------------------------- */
-
-static int alloc_root(struct super_block *sb)
-{
-	int err;
-	struct inode *inode;
-	struct dentry *root;
-
-	err = -ENOMEM;
-	inode = au_iget_locked(sb, AUFS_ROOT_INO);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out;
-
-	inode->i_op = &aufs_dir_iop;
-	inode->i_fop = &aufs_dir_fop;
-	inode->i_mode = S_IFDIR;
-	set_nlink(inode, 2);
-	unlock_new_inode(inode);
-
-	root = d_make_root(inode);
-	if (unlikely(!root))
-		goto out;
-	err = PTR_ERR(root);
-	if (IS_ERR(root))
-		goto out;
-
-	err = au_di_init(root);
-	if (!err) {
-		sb->s_root = root;
-		return 0; /* success */
-	}
-	dput(root);
-
-out:
-	return err;
-}
-
-static int aufs_fill_super(struct super_block *sb, void *raw_data,
-			   int silent __maybe_unused)
-{
-	int err;
-	struct au_opts opts;
-	struct dentry *root;
-	struct inode *inode;
-	char *arg = raw_data;
-
-	if (unlikely(!arg || !*arg)) {
-		err = -EINVAL;
-		pr_err("no arg\n");
-		goto out;
-	}
-
-	err = -ENOMEM;
-	memset(&opts, 0, sizeof(opts));
-	opts.opt = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!opts.opt))
-		goto out;
-	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
-	opts.sb_flags = sb->s_flags;
-
-	err = au_si_alloc(sb);
-	if (unlikely(err))
-		goto out_opts;
-
-	/* all timestamps always follow the ones on the branch */
-	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
-	sb->s_op = &aufs_sop;
-	sb->s_d_op = &aufs_dop;
-	sb->s_magic = AUFS_SUPER_MAGIC;
-	sb->s_maxbytes = 0;
-	sb->s_stack_depth = 1;
-	au_export_init(sb);
-	/* au_xattr_init(sb); */
-
-	err = alloc_root(sb);
-	if (unlikely(err)) {
-		si_write_unlock(sb);
-		goto out_info;
-	}
-	root = sb->s_root;
-	inode = d_inode(root);
-
-	/*
-	 * actually we can parse options regardless aufs lock here.
-	 * but at remount time, parsing must be done before aufs lock.
-	 * so we follow the same rule.
-	 */
-	ii_write_lock_parent(inode);
-	aufs_write_unlock(root);
-	err = au_opts_parse(sb, arg, &opts);
-	if (unlikely(err))
-		goto out_root;
-
-	/* lock vfs_inode first, then aufs. */
-	mutex_lock(&inode->i_mutex);
-	aufs_write_lock(root);
-	err = au_opts_mount(sb, &opts);
-	au_opts_free(&opts);
-	aufs_write_unlock(root);
-	mutex_unlock(&inode->i_mutex);
-	if (!err)
-		goto out_opts; /* success */
-
-out_root:
-	dput(root);
-	sb->s_root = NULL;
-out_info:
-	dbgaufs_si_fin(au_sbi(sb));
-	kobject_put(&au_sbi(sb)->si_kobj);
-	sb->s_fs_info = NULL;
-out_opts:
-	free_page((unsigned long)opts.opt);
-out:
-	AuTraceErr(err);
-	err = cvt_err(err);
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags,
-				 const char *dev_name __maybe_unused,
-				 void *raw_data)
-{
-	struct dentry *root;
-	struct super_block *sb;
-
-	/* all timestamps always follow the ones on the branch */
-	/* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */
-	root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super);
-	if (IS_ERR(root))
-		goto out;
-
-	sb = root->d_sb;
-	si_write_lock(sb, !AuLock_FLUSH);
-	sysaufs_brs_add(sb, 0);
-	si_write_unlock(sb);
-	au_sbilist_add(sb);
-
-out:
-	return root;
-}
-
-static void aufs_kill_sb(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = au_sbi(sb);
-	if (sbinfo) {
-		au_sbilist_del(sb);
-		aufs_write_lock(sb->s_root);
-		au_fhsm_fin(sb);
-		if (sbinfo->si_wbr_create_ops->fin)
-			sbinfo->si_wbr_create_ops->fin(sb);
-		if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) {
-			au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE);
-			au_remount_refresh(sb);
-		}
-		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-			au_plink_put(sb, /*verbose*/1);
-		au_xino_clr(sb);
-		sbinfo->si_sb = NULL;
-		aufs_write_unlock(sb->s_root);
-		au_nwt_flush(&sbinfo->si_nowait);
-	}
-	kill_anon_super(sb);
-}
-
-struct file_system_type aufs_fs_type = {
-	.name		= AUFS_FSTYPE,
-	/* a race between rename and others */
-	.fs_flags	= FS_RENAME_DOES_D_MOVE,
-	.mount		= aufs_mount,
-	.kill_sb	= aufs_kill_sb,
-	/* no need to __module_get() and module_put(). */
-	.owner		= THIS_MODULE,
-};
diff --git a/fs/aufs/super.h b/fs/aufs/super.h
deleted file mode 100644
index 95ffbbb29..000000000
--- a/fs/aufs/super.h
+++ /dev/null
@@ -1,635 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * super_block operations
- */
-
-#ifndef __AUFS_SUPER_H__
-#define __AUFS_SUPER_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/kobject.h>
-#include "rwsem.h"
-#include "spl.h"
-#include "wkq.h"
-
-/* policies to select one among multiple writable branches */
-struct au_wbr_copyup_operations {
-	int (*copyup)(struct dentry *dentry);
-};
-
-#define AuWbr_DIR	1		/* target is a dir */
-#define AuWbr_PARENT	(1 << 1)	/* always require a parent */
-
-#define au_ftest_wbr(flags, name)	((flags) & AuWbr_##name)
-#define au_fset_wbr(flags, name)	{ (flags) |= AuWbr_##name; }
-#define au_fclr_wbr(flags, name)	{ (flags) &= ~AuWbr_##name; }
-
-struct au_wbr_create_operations {
-	int (*create)(struct dentry *dentry, unsigned int flags);
-	int (*init)(struct super_block *sb);
-	int (*fin)(struct super_block *sb);
-};
-
-struct au_wbr_mfs {
-	struct mutex	mfs_lock; /* protect this structure */
-	unsigned long	mfs_jiffy;
-	unsigned long	mfs_expire;
-	aufs_bindex_t	mfs_bindex;
-
-	unsigned long long	mfsrr_bytes;
-	unsigned long long	mfsrr_watermark;
-};
-
-struct pseudo_link {
-	union {
-		struct hlist_node hlist;
-		struct rcu_head rcu;
-	};
-	struct inode *inode;
-};
-
-#define AuPlink_NHASH 100
-static inline int au_plink_hash(ino_t ino)
-{
-	return ino % AuPlink_NHASH;
-}
-
-/* File-based Hierarchical Storage Management */
-struct au_fhsm {
-#ifdef CONFIG_AUFS_FHSM
-	/* allow only one process who can receive the notification */
-	spinlock_t		fhsm_spin;
-	pid_t			fhsm_pid;
-	wait_queue_head_t	fhsm_wqh;
-	atomic_t		fhsm_readable;
-
-	/* these are protected by si_rwsem */
-	unsigned long		fhsm_expire;
-	aufs_bindex_t		fhsm_bottom;
-#endif
-};
-
-struct au_branch;
-struct au_sbinfo {
-	/* nowait tasks in the system-wide workqueue */
-	struct au_nowait_tasks	si_nowait;
-
-	/*
-	 * tried sb->s_umount, but failed due to the dependecy between i_mutex.
-	 * rwsem for au_sbinfo is necessary.
-	 */
-	struct au_rwsem		si_rwsem;
-
-	/* prevent recursive locking in deleting inode */
-	struct {
-		unsigned long		*bitmap;
-		spinlock_t		tree_lock;
-		struct radix_tree_root	tree;
-	} au_si_pid;
-
-	/*
-	 * dirty approach to protect sb->sb_inodes and ->s_files (gone) from
-	 * remount.
-	 */
-	atomic_long_t		si_ninodes, si_nfiles;
-
-	/* branch management */
-	unsigned int		si_generation;
-
-	/* see AuSi_ flags */
-	unsigned char		au_si_status;
-
-	aufs_bindex_t		si_bend;
-
-	/* dirty trick to keep br_id plus */
-	unsigned int		si_last_br_id :
-				sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1;
-	struct au_branch	**si_branch;
-
-	/* policy to select a writable branch */
-	unsigned char		si_wbr_copyup;
-	unsigned char		si_wbr_create;
-	struct au_wbr_copyup_operations *si_wbr_copyup_ops;
-	struct au_wbr_create_operations *si_wbr_create_ops;
-
-	/* round robin */
-	atomic_t		si_wbr_rr_next;
-
-	/* most free space */
-	struct au_wbr_mfs	si_wbr_mfs;
-
-	/* File-based Hierarchical Storage Management */
-	struct au_fhsm		si_fhsm;
-
-	/* mount flags */
-	/* include/asm-ia64/siginfo.h defines a macro named si_flags */
-	unsigned int		si_mntflags;
-
-	/* external inode number (bitmap and translation table) */
-	vfs_readf_t		si_xread;
-	vfs_writef_t		si_xwrite;
-	struct file		*si_xib;
-	struct mutex		si_xib_mtx; /* protect xib members */
-	unsigned long		*si_xib_buf;
-	unsigned long		si_xib_last_pindex;
-	int			si_xib_next_bit;
-	aufs_bindex_t		si_xino_brid;
-	unsigned long		si_xino_jiffy;
-	unsigned long		si_xino_expire;
-	/* reserved for future use */
-	/* unsigned long long	si_xib_limit; */	/* Max xib file size */
-
-#ifdef CONFIG_AUFS_EXPORT
-	/* i_generation */
-	struct file		*si_xigen;
-	atomic_t		si_xigen_next;
-#endif
-
-	/* dirty trick to suppoer atomic_open */
-	struct au_sphlhead	si_aopen;
-
-	/* vdir parameters */
-	unsigned long		si_rdcache;	/* max cache time in jiffies */
-	unsigned int		si_rdblk;	/* deblk size */
-	unsigned int		si_rdhash;	/* hash size */
-
-	/*
-	 * If the number of whiteouts are larger than si_dirwh, leave all of
-	 * them after au_whtmp_ren to reduce the cost of rmdir(2).
-	 * future fsck.aufs or kernel thread will remove them later.
-	 * Otherwise, remove all whiteouts and the dir in rmdir(2).
-	 */
-	unsigned int		si_dirwh;
-
-	/* pseudo_link list */
-	struct au_sphlhead	si_plink[AuPlink_NHASH];
-	wait_queue_head_t	si_plink_wq;
-	spinlock_t		si_plink_maint_lock;
-	pid_t			si_plink_maint_pid;
-
-	/* file list */
-	struct au_sphlhead	si_files;
-
-	/*
-	 * sysfs and lifetime management.
-	 * this is not a small structure and it may be a waste of memory in case
-	 * of sysfs is disabled, particulary when many aufs-es are mounted.
-	 * but using sysfs is majority.
-	 */
-	struct kobject		si_kobj;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry		 *si_dbgaufs;
-	struct dentry		 *si_dbgaufs_plink;
-	struct dentry		 *si_dbgaufs_xib;
-#ifdef CONFIG_AUFS_EXPORT
-	struct dentry		 *si_dbgaufs_xigen;
-#endif
-#endif
-
-#ifdef CONFIG_AUFS_SBILIST
-	struct list_head	si_list;
-#endif
-
-	/* dirty, necessary for unmounting, sysfs and sysrq */
-	struct super_block	*si_sb;
-};
-
-/* sbinfo status flags */
-/*
- * set true when refresh_dirs() failed at remount time.
- * then try refreshing dirs at access time again.
- * if it is false, refreshing dirs at access time is unnecesary
- */
-#define AuSi_FAILED_REFRESH_DIR	1
-
-#define AuSi_FHSM		(1 << 1)	/* fhsm is active now */
-
-#ifndef CONFIG_AUFS_FHSM
-#undef AuSi_FHSM
-#define AuSi_FHSM		0
-#endif
-
-static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi,
-					   unsigned int flag)
-{
-	AuRwMustAnyLock(&sbi->si_rwsem);
-	return sbi->au_si_status & flag;
-}
-#define au_ftest_si(sbinfo, name)	au_do_ftest_si(sbinfo, AuSi_##name)
-#define au_fset_si(sbinfo, name) do { \
-	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
-	(sbinfo)->au_si_status |= AuSi_##name; \
-} while (0)
-#define au_fclr_si(sbinfo, name) do { \
-	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
-	(sbinfo)->au_si_status &= ~AuSi_##name; \
-} while (0)
-
-/* ---------------------------------------------------------------------- */
-
-/* policy to select one among writable branches */
-#define AuWbrCopyup(sbinfo, ...) \
-	((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__))
-#define AuWbrCreate(sbinfo, ...) \
-	((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__))
-
-/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */
-#define AuLock_DW		1		/* write-lock dentry */
-#define AuLock_IR		(1 << 1)	/* read-lock inode */
-#define AuLock_IW		(1 << 2)	/* write-lock inode */
-#define AuLock_FLUSH		(1 << 3)	/* wait for 'nowait' tasks */
-#define AuLock_DIR		(1 << 4)	/* target is a dir */
-#define AuLock_NOPLM		(1 << 5)	/* return err in plm mode */
-#define AuLock_NOPLMW		(1 << 6)	/* wait for plm mode ends */
-#define AuLock_GEN		(1 << 7)	/* test digen/iigen */
-#define au_ftest_lock(flags, name)	((flags) & AuLock_##name)
-#define au_fset_lock(flags, name) \
-	do { (flags) |= AuLock_##name; } while (0)
-#define au_fclr_lock(flags, name) \
-	do { (flags) &= ~AuLock_##name; } while (0)
-
-/* ---------------------------------------------------------------------- */
-
-/* super.c */
-extern struct file_system_type aufs_fs_type;
-struct inode *au_iget_locked(struct super_block *sb, ino_t ino);
-typedef unsigned long long (*au_arraycb_t)(void *array, unsigned long long max,
-					   void *arg);
-void au_array_free(void *array);
-void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg);
-struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max);
-void au_iarray_free(struct inode **a, unsigned long long max);
-
-/* sbinfo.c */
-void au_si_free(struct kobject *kobj);
-int au_si_alloc(struct super_block *sb);
-int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr);
-
-unsigned int au_sigen_inc(struct super_block *sb);
-aufs_bindex_t au_new_br_id(struct super_block *sb);
-
-int si_read_lock(struct super_block *sb, int flags);
-int si_write_lock(struct super_block *sb, int flags);
-int aufs_read_lock(struct dentry *dentry, int flags);
-void aufs_read_unlock(struct dentry *dentry, int flags);
-void aufs_write_lock(struct dentry *dentry);
-void aufs_write_unlock(struct dentry *dentry);
-int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags);
-void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2);
-
-int si_pid_test_slow(struct super_block *sb);
-void si_pid_set_slow(struct super_block *sb);
-void si_pid_clr_slow(struct super_block *sb);
-
-/* wbr_policy.c */
-extern struct au_wbr_copyup_operations au_wbr_copyup_ops[];
-extern struct au_wbr_create_operations au_wbr_create_ops[];
-int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst);
-int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex);
-int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart);
-
-/* mvdown.c */
-int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg);
-
-#ifdef CONFIG_AUFS_FHSM
-/* fhsm.c */
-
-static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm)
-{
-	pid_t pid;
-
-	spin_lock(&fhsm->fhsm_spin);
-	pid = fhsm->fhsm_pid;
-	spin_unlock(&fhsm->fhsm_spin);
-
-	return pid;
-}
-
-void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force);
-void au_fhsm_wrote_all(struct super_block *sb, int force);
-int au_fhsm_fd(struct super_block *sb, int oflags);
-int au_fhsm_br_alloc(struct au_branch *br);
-void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex);
-void au_fhsm_fin(struct super_block *sb);
-void au_fhsm_init(struct au_sbinfo *sbinfo);
-void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec);
-void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo);
-#else
-AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex,
-	   int force)
-AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force)
-AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags)
-AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm)
-AuStubInt0(au_fhsm_br_alloc, struct au_branch *br)
-AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(au_fhsm_fin, struct super_block *sb)
-AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo)
-AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec)
-AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo)
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_sbinfo *au_sbi(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_EXPORT
-int au_test_nfsd(void);
-void au_export_init(struct super_block *sb);
-void au_xigen_inc(struct inode *inode);
-int au_xigen_new(struct inode *inode);
-int au_xigen_set(struct super_block *sb, struct file *base);
-void au_xigen_clr(struct super_block *sb);
-
-static inline int au_busy_or_stale(void)
-{
-	if (!au_test_nfsd())
-		return -EBUSY;
-	return -ESTALE;
-}
-#else
-AuStubInt0(au_test_nfsd, void)
-AuStubVoid(au_export_init, struct super_block *sb)
-AuStubVoid(au_xigen_inc, struct inode *inode)
-AuStubInt0(au_xigen_new, struct inode *inode)
-AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base)
-AuStubVoid(au_xigen_clr, struct super_block *sb)
-AuStub(int, au_busy_or_stale, return -EBUSY, void)
-#endif /* CONFIG_AUFS_EXPORT */
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_SBILIST
-/* module.c */
-extern struct au_splhead au_sbilist;
-
-static inline void au_sbilist_init(void)
-{
-	au_spl_init(&au_sbilist);
-}
-
-static inline void au_sbilist_add(struct super_block *sb)
-{
-	au_spl_add(&au_sbi(sb)->si_list, &au_sbilist);
-}
-
-static inline void au_sbilist_del(struct super_block *sb)
-{
-	au_spl_del(&au_sbi(sb)->si_list, &au_sbilist);
-}
-
-#ifdef CONFIG_AUFS_MAGIC_SYSRQ
-static inline void au_sbilist_lock(void)
-{
-	spin_lock(&au_sbilist.spin);
-}
-
-static inline void au_sbilist_unlock(void)
-{
-	spin_unlock(&au_sbilist.spin);
-}
-#define AuGFP_SBILIST	GFP_ATOMIC
-#else
-AuStubVoid(au_sbilist_lock, void)
-AuStubVoid(au_sbilist_unlock, void)
-#define AuGFP_SBILIST	GFP_NOFS
-#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
-#else
-AuStubVoid(au_sbilist_init, void)
-AuStubVoid(au_sbilist_add, struct super_block *sb)
-AuStubVoid(au_sbilist_del, struct super_block *sb)
-AuStubVoid(au_sbilist_lock, void)
-AuStubVoid(au_sbilist_unlock, void)
-#define AuGFP_SBILIST	GFP_NOFS
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo)
-{
-	/*
-	 * This function is a dynamic '__init' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-#ifdef CONFIG_DEBUG_FS
-	sbinfo->si_dbgaufs = NULL;
-	sbinfo->si_dbgaufs_plink = NULL;
-	sbinfo->si_dbgaufs_xib = NULL;
-#ifdef CONFIG_AUFS_EXPORT
-	sbinfo->si_dbgaufs_xigen = NULL;
-#endif
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-static inline pid_t si_pid_bit(void)
-{
-	/* the origin of pid is 1, but the bitmap's is 0 */
-	return current->pid - 1;
-}
-
-static inline int si_pid_test(struct super_block *sb)
-{
-	pid_t bit;
-
-	bit = si_pid_bit();
-	if (bit < PID_MAX_DEFAULT)
-		return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
-	return si_pid_test_slow(sb);
-}
-
-static inline void si_pid_set(struct super_block *sb)
-{
-	pid_t bit;
-
-	bit = si_pid_bit();
-	if (bit < PID_MAX_DEFAULT) {
-		AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap));
-		set_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
-		/* smp_mb(); */
-	} else
-		si_pid_set_slow(sb);
-}
-
-static inline void si_pid_clr(struct super_block *sb)
-{
-	pid_t bit;
-
-	bit = si_pid_bit();
-	if (bit < PID_MAX_DEFAULT) {
-		AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap));
-		clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
-		/* smp_mb(); */
-	} else
-		si_pid_clr_slow(sb);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* lock superblock. mainly for entry point functions */
-/*
- * __si_read_lock, __si_write_lock,
- * __si_read_unlock, __si_write_unlock, __si_downgrade_lock
- */
-AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem);
-
-#define SiMustNoWaiters(sb)	AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem)
-#define SiMustAnyLock(sb)	AuRwMustAnyLock(&au_sbi(sb)->si_rwsem)
-#define SiMustWriteLock(sb)	AuRwMustWriteLock(&au_sbi(sb)->si_rwsem)
-
-static inline void si_noflush_read_lock(struct super_block *sb)
-{
-	__si_read_lock(sb);
-	si_pid_set(sb);
-}
-
-static inline int si_noflush_read_trylock(struct super_block *sb)
-{
-	int locked;
-
-	locked = __si_read_trylock(sb);
-	if (locked)
-		si_pid_set(sb);
-	return locked;
-}
-
-static inline void si_noflush_write_lock(struct super_block *sb)
-{
-	__si_write_lock(sb);
-	si_pid_set(sb);
-}
-
-static inline int si_noflush_write_trylock(struct super_block *sb)
-{
-	int locked;
-
-	locked = __si_write_trylock(sb);
-	if (locked)
-		si_pid_set(sb);
-	return locked;
-}
-
-#if 0 /* reserved */
-static inline int si_read_trylock(struct super_block *sb, int flags)
-{
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-	return si_noflush_read_trylock(sb);
-}
-#endif
-
-static inline void si_read_unlock(struct super_block *sb)
-{
-	si_pid_clr(sb);
-	__si_read_unlock(sb);
-}
-
-#if 0 /* reserved */
-static inline int si_write_trylock(struct super_block *sb, int flags)
-{
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-	return si_noflush_write_trylock(sb);
-}
-#endif
-
-static inline void si_write_unlock(struct super_block *sb)
-{
-	si_pid_clr(sb);
-	__si_write_unlock(sb);
-}
-
-#if 0 /* reserved */
-static inline void si_downgrade_lock(struct super_block *sb)
-{
-	__si_downgrade_lock(sb);
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline aufs_bindex_t au_sbend(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_bend;
-}
-
-static inline unsigned int au_mntflags(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_mntflags;
-}
-
-static inline unsigned int au_sigen(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_generation;
-}
-
-static inline void au_ninodes_inc(struct super_block *sb)
-{
-	atomic_long_inc(&au_sbi(sb)->si_ninodes);
-}
-
-static inline void au_ninodes_dec(struct super_block *sb)
-{
-	AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes));
-	atomic_long_dec(&au_sbi(sb)->si_ninodes);
-}
-
-static inline void au_nfiles_inc(struct super_block *sb)
-{
-	atomic_long_inc(&au_sbi(sb)->si_nfiles);
-}
-
-static inline void au_nfiles_dec(struct super_block *sb)
-{
-	AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles));
-	atomic_long_dec(&au_sbi(sb)->si_nfiles);
-}
-
-static inline struct au_branch *au_sbr(struct super_block *sb,
-				       aufs_bindex_t bindex)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_branch[0 + bindex];
-}
-
-static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid)
-{
-	SiMustWriteLock(sb);
-	au_sbi(sb)->si_xino_brid = brid;
-}
-
-static inline aufs_bindex_t au_xino_brid(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_xino_brid;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_SUPER_H__ */
diff --git a/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c
deleted file mode 100644
index 42dbc2825..000000000
--- a/fs/aufs/sysaufs.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sysfs interface and lifetime management
- * they are necessary regardless sysfs is disabled.
- */
-
-#include <linux/random.h>
-#include "aufs.h"
-
-unsigned long sysaufs_si_mask;
-struct kset *sysaufs_kset;
-
-#define AuSiAttr(_name) { \
-	.attr   = { .name = __stringify(_name), .mode = 0444 },	\
-	.show   = sysaufs_si_##_name,				\
-}
-
-static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path);
-struct attribute *sysaufs_si_attrs[] = {
-	&sysaufs_si_attr_xi_path.attr,
-	NULL,
-};
-
-static const struct sysfs_ops au_sbi_ops = {
-	.show   = sysaufs_si_show
-};
-
-static struct kobj_type au_sbi_ktype = {
-	.release	= au_si_free,
-	.sysfs_ops	= &au_sbi_ops,
-	.default_attrs	= sysaufs_si_attrs
-};
-
-/* ---------------------------------------------------------------------- */
-
-int sysaufs_si_init(struct au_sbinfo *sbinfo)
-{
-	int err;
-
-	sbinfo->si_kobj.kset = sysaufs_kset;
-	/* cf. sysaufs_name() */
-	err = kobject_init_and_add
-		(&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL,
-		 SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo));
-
-	dbgaufs_si_null(sbinfo);
-	if (!err) {
-		err = dbgaufs_si_init(sbinfo);
-		if (unlikely(err))
-			kobject_put(&sbinfo->si_kobj);
-	}
-	return err;
-}
-
-void sysaufs_fin(void)
-{
-	dbgaufs_fin();
-	sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group);
-	kset_unregister(sysaufs_kset);
-}
-
-int __init sysaufs_init(void)
-{
-	int err;
-
-	do {
-		get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask));
-	} while (!sysaufs_si_mask);
-
-	err = -EINVAL;
-	sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj);
-	if (unlikely(!sysaufs_kset))
-		goto out;
-	err = PTR_ERR(sysaufs_kset);
-	if (IS_ERR(sysaufs_kset))
-		goto out;
-	err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group);
-	if (unlikely(err)) {
-		kset_unregister(sysaufs_kset);
-		goto out;
-	}
-
-	err = dbgaufs_init();
-	if (unlikely(err))
-		sysaufs_fin();
-out:
-	return err;
-}
diff --git a/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h
deleted file mode 100644
index 8e0fd02aa..000000000
--- a/fs/aufs/sysaufs.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sysfs interface and mount lifetime management
- */
-
-#ifndef __SYSAUFS_H__
-#define __SYSAUFS_H__
-
-#ifdef __KERNEL__
-
-#include <linux/sysfs.h>
-#include "module.h"
-
-struct super_block;
-struct au_sbinfo;
-
-struct sysaufs_si_attr {
-	struct attribute attr;
-	int (*show)(struct seq_file *seq, struct super_block *sb);
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* sysaufs.c */
-extern unsigned long sysaufs_si_mask;
-extern struct kset *sysaufs_kset;
-extern struct attribute *sysaufs_si_attrs[];
-int sysaufs_si_init(struct au_sbinfo *sbinfo);
-int __init sysaufs_init(void);
-void sysaufs_fin(void);
-
-/* ---------------------------------------------------------------------- */
-
-/* some people doesn't like to show a pointer in kernel */
-static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo)
-{
-	return sysaufs_si_mask ^ (unsigned long)sbinfo;
-}
-
-#define SysaufsSiNamePrefix	"si_"
-#define SysaufsSiNameLen	(sizeof(SysaufsSiNamePrefix) + 16)
-static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name)
-{
-	snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx",
-		 sysaufs_si_id(sbinfo));
-}
-
-struct au_branch;
-#ifdef CONFIG_SYSFS
-/* sysfs.c */
-extern struct attribute_group *sysaufs_attr_group;
-
-int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb);
-ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
-			 char *buf);
-long au_brinfo_ioctl(struct file *file, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long au_brinfo_compat_ioctl(struct file *file, unsigned long arg);
-#endif
-
-void sysaufs_br_init(struct au_branch *br);
-void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
-void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
-
-#define sysaufs_brs_init()	do {} while (0)
-
-#else
-#define sysaufs_attr_group	NULL
-
-AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb)
-AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj,
-       struct attribute *attr, char *buf)
-AuStubVoid(sysaufs_br_init, struct au_branch *br)
-AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
-
-static inline void sysaufs_brs_init(void)
-{
-	sysaufs_brs = 0;
-}
-
-#endif /* CONFIG_SYSFS */
-
-#endif /* __KERNEL__ */
-#endif /* __SYSAUFS_H__ */
diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c
deleted file mode 100644
index 549f71d1a..000000000
--- a/fs/aufs/sysfs.c
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sysfs interface
- */
-
-#include <linux/compat.h>
-#include <linux/seq_file.h>
-#include "aufs.h"
-
-#ifdef CONFIG_AUFS_FS_MODULE
-/* this entry violates the "one line per file" policy of sysfs */
-static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr,
-			   char *buf)
-{
-	ssize_t err;
-	static char *conf =
-/* this file is generated at compiling */
-#include "conf.str"
-		;
-
-	err = snprintf(buf, PAGE_SIZE, conf);
-	if (unlikely(err >= PAGE_SIZE))
-		err = -EFBIG;
-	return err;
-}
-
-static struct kobj_attribute au_config_attr = __ATTR_RO(config);
-#endif
-
-static struct attribute *au_attr[] = {
-#ifdef CONFIG_AUFS_FS_MODULE
-	&au_config_attr.attr,
-#endif
-	NULL,	/* need to NULL terminate the list of attributes */
-};
-
-static struct attribute_group sysaufs_attr_group_body = {
-	.attrs = au_attr
-};
-
-struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body;
-
-/* ---------------------------------------------------------------------- */
-
-int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb)
-{
-	int err;
-
-	SiMustAnyLock(sb);
-
-	err = 0;
-	if (au_opt_test(au_mntflags(sb), XINO)) {
-		err = au_xino_path(seq, au_sbi(sb)->si_xib);
-		seq_putc(seq, '\n');
-	}
-	return err;
-}
-
-/*
- * the lifetime of branch is independent from the entry under sysfs.
- * sysfs handles the lifetime of the entry, and never call ->show() after it is
- * unlinked.
- */
-static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb,
-			 aufs_bindex_t bindex, int idx)
-{
-	int err;
-	struct path path;
-	struct dentry *root;
-	struct au_branch *br;
-	au_br_perm_str_t perm;
-
-	AuDbg("b%d\n", bindex);
-
-	err = 0;
-	root = sb->s_root;
-	di_read_lock_parent(root, !AuLock_IR);
-	br = au_sbr(sb, bindex);
-
-	switch (idx) {
-	case AuBrSysfs_BR:
-		path.mnt = au_br_mnt(br);
-		path.dentry = au_h_dptr(root, bindex);
-		au_seq_path(seq, &path);
-		au_optstr_br_perm(&perm, br->br_perm);
-		err = seq_printf(seq, "=%s\n", perm.a);
-		break;
-	case AuBrSysfs_BRID:
-		err = seq_printf(seq, "%d\n", br->br_id);
-		break;
-	}
-	di_read_unlock(root, !AuLock_IR);
-	if (err == -1)
-		err = -E2BIG;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct seq_file *au_seq(char *p, ssize_t len)
-{
-	struct seq_file *seq;
-
-	seq = kzalloc(sizeof(*seq), GFP_NOFS);
-	if (seq) {
-		/* mutex_init(&seq.lock); */
-		seq->buf = p;
-		seq->size = len;
-		return seq; /* success */
-	}
-
-	seq = ERR_PTR(-ENOMEM);
-	return seq;
-}
-
-#define SysaufsBr_PREFIX	"br"
-#define SysaufsBrid_PREFIX	"brid"
-
-/* todo: file size may exceed PAGE_SIZE */
-ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
-			char *buf)
-{
-	ssize_t err;
-	int idx;
-	long l;
-	aufs_bindex_t bend;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-	struct seq_file *seq;
-	char *name;
-	struct attribute **cattr;
-
-	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
-	sb = sbinfo->si_sb;
-
-	/*
-	 * prevent a race condition between sysfs and aufs.
-	 * for instance, sysfs_file_read() calls sysfs_get_active_two() which
-	 * prohibits maintaining the sysfs entries.
-	 * hew we acquire read lock after sysfs_get_active_two().
-	 * on the other hand, the remount process may maintain the sysfs/aufs
-	 * entries after acquiring write lock.
-	 * it can cause a deadlock.
-	 * simply we gave up processing read here.
-	 */
-	err = -EBUSY;
-	if (unlikely(!si_noflush_read_trylock(sb)))
-		goto out;
-
-	seq = au_seq(buf, PAGE_SIZE);
-	err = PTR_ERR(seq);
-	if (IS_ERR(seq))
-		goto out_unlock;
-
-	name = (void *)attr->name;
-	cattr = sysaufs_si_attrs;
-	while (*cattr) {
-		if (!strcmp(name, (*cattr)->name)) {
-			err = container_of(*cattr, struct sysaufs_si_attr, attr)
-				->show(seq, sb);
-			goto out_seq;
-		}
-		cattr++;
-	}
-
-	if (!strncmp(name, SysaufsBrid_PREFIX,
-		     sizeof(SysaufsBrid_PREFIX) - 1)) {
-		idx = AuBrSysfs_BRID;
-		name += sizeof(SysaufsBrid_PREFIX) - 1;
-	} else if (!strncmp(name, SysaufsBr_PREFIX,
-			    sizeof(SysaufsBr_PREFIX) - 1)) {
-		idx = AuBrSysfs_BR;
-		name += sizeof(SysaufsBr_PREFIX) - 1;
-	} else
-		  BUG();
-
-	err = kstrtol(name, 10, &l);
-	if (!err) {
-		bend = au_sbend(sb);
-		if (l <= bend)
-			err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx);
-		else
-			err = -ENOENT;
-	}
-
-out_seq:
-	if (!err) {
-		err = seq->count;
-		/* sysfs limit */
-		if (unlikely(err == PAGE_SIZE))
-			err = -EFBIG;
-	}
-	kfree(seq);
-out_unlock:
-	si_read_unlock(sb);
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg)
-{
-	int err;
-	int16_t brid;
-	aufs_bindex_t bindex, bend;
-	size_t sz;
-	char *buf;
-	struct seq_file *seq;
-	struct au_branch *br;
-
-	si_read_lock(sb, AuLock_FLUSH);
-	bend = au_sbend(sb);
-	err = bend + 1;
-	if (!arg)
-		goto out;
-
-	err = -ENOMEM;
-	buf = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!buf))
-		goto out;
-
-	seq = au_seq(buf, PAGE_SIZE);
-	err = PTR_ERR(seq);
-	if (IS_ERR(seq))
-		goto out_buf;
-
-	sz = sizeof(*arg) - offsetof(union aufs_brinfo, path);
-	for (bindex = 0; bindex <= bend; bindex++, arg++) {
-		err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg));
-		if (unlikely(err))
-			break;
-
-		br = au_sbr(sb, bindex);
-		brid = br->br_id;
-		BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id));
-		err = __put_user(brid, &arg->id);
-		if (unlikely(err))
-			break;
-
-		BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm));
-		err = __put_user(br->br_perm, &arg->perm);
-		if (unlikely(err))
-			break;
-
-		au_seq_path(seq, &br->br_path);
-		err = seq_putc(seq, '\0');
-		if (!err && seq->count <= sz) {
-			err = copy_to_user(arg->path, seq->buf, seq->count);
-			seq->count = 0;
-			if (unlikely(err))
-				break;
-		} else {
-			err = -E2BIG;
-			goto out_seq;
-		}
-	}
-	if (unlikely(err))
-		err = -EFAULT;
-
-out_seq:
-	kfree(seq);
-out_buf:
-	free_page((unsigned long)buf);
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-long au_brinfo_ioctl(struct file *file, unsigned long arg)
-{
-	return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg);
-}
-
-#ifdef CONFIG_COMPAT
-long au_brinfo_compat_ioctl(struct file *file, unsigned long arg)
-{
-	return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg));
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-void sysaufs_br_init(struct au_branch *br)
-{
-	int i;
-	struct au_brsysfs *br_sysfs;
-	struct attribute *attr;
-
-	br_sysfs = br->br_sysfs;
-	for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
-		attr = &br_sysfs->attr;
-		sysfs_attr_init(attr);
-		attr->name = br_sysfs->name;
-		attr->mode = S_IRUGO;
-		br_sysfs++;
-	}
-}
-
-void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
-{
-	struct au_branch *br;
-	struct kobject *kobj;
-	struct au_brsysfs *br_sysfs;
-	int i;
-	aufs_bindex_t bend;
-
-	dbgaufs_brs_del(sb, bindex);
-
-	if (!sysaufs_brs)
-		return;
-
-	kobj = &au_sbi(sb)->si_kobj;
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		br_sysfs = br->br_sysfs;
-		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
-			sysfs_remove_file(kobj, &br_sysfs->attr);
-			br_sysfs++;
-		}
-	}
-}
-
-void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
-{
-	int err, i;
-	aufs_bindex_t bend;
-	struct kobject *kobj;
-	struct au_branch *br;
-	struct au_brsysfs *br_sysfs;
-
-	dbgaufs_brs_add(sb, bindex);
-
-	if (!sysaufs_brs)
-		return;
-
-	kobj = &au_sbi(sb)->si_kobj;
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		br_sysfs = br->br_sysfs;
-		snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name),
-			 SysaufsBr_PREFIX "%d", bindex);
-		snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name),
-			 SysaufsBrid_PREFIX "%d", bindex);
-		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
-			err = sysfs_create_file(kobj, &br_sysfs->attr);
-			if (unlikely(err))
-				pr_warn("failed %s under sysfs(%d)\n",
-					br_sysfs->name, err);
-			br_sysfs++;
-		}
-	}
-}
diff --git a/fs/aufs/sysrq.c b/fs/aufs/sysrq.c
deleted file mode 100644
index a3e95bce6..000000000
--- a/fs/aufs/sysrq.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * magic sysrq hanlder
- */
-
-/* #include <linux/sysrq.h> */
-#include <linux/writeback.h>
-#include "aufs.h"
-
-/* ---------------------------------------------------------------------- */
-
-static void sysrq_sb(struct super_block *sb)
-{
-	char *plevel;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-	struct au_sphlhead *files;
-	struct au_finfo *finfo;
-
-	plevel = au_plevel;
-	au_plevel = KERN_WARNING;
-
-	/* since we define pr_fmt, call printk directly */
-#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str)
-
-	sbinfo = au_sbi(sb);
-	printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo));
-	pr("superblock\n");
-	au_dpri_sb(sb);
-
-#if 0
-	pr("root dentry\n");
-	au_dpri_dentry(sb->s_root);
-	pr("root inode\n");
-	au_dpri_inode(d_inode(sb->s_root));
-#endif
-
-#if 0
-	do {
-		int err, i, j, ndentry;
-		struct au_dcsub_pages dpages;
-		struct au_dpage *dpage;
-
-		err = au_dpages_init(&dpages, GFP_ATOMIC);
-		if (unlikely(err))
-			break;
-		err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL);
-		if (!err)
-			for (i = 0; i < dpages.ndpage; i++) {
-				dpage = dpages.dpages + i;
-				ndentry = dpage->ndentry;
-				for (j = 0; j < ndentry; j++)
-					au_dpri_dentry(dpage->dentries[j]);
-			}
-		au_dpages_free(&dpages);
-	} while (0);
-#endif
-
-#if 1
-	{
-		struct inode *i;
-
-		pr("isolated inode\n");
-		spin_lock(&inode_sb_list_lock);
-		list_for_each_entry(i, &sb->s_inodes, i_sb_list) {
-			spin_lock(&i->i_lock);
-			if (1 || hlist_empty(&i->i_dentry))
-				au_dpri_inode(i);
-			spin_unlock(&i->i_lock);
-		}
-		spin_unlock(&inode_sb_list_lock);
-	}
-#endif
-	pr("files\n");
-	files = &au_sbi(sb)->si_files;
-	spin_lock(&files->spin);
-	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
-		umode_t mode;
-
-		file = finfo->fi_file;
-		mode = file_inode(file)->i_mode;
-		if (!special_file(mode))
-			au_dpri_file(file);
-	}
-	spin_unlock(&files->spin);
-	pr("done\n");
-
-#undef pr
-	au_plevel = plevel;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* module parameter */
-static char *aufs_sysrq_key = "a";
-module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO);
-MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME);
-
-static void au_sysrq(int key __maybe_unused)
-{
-	struct au_sbinfo *sbinfo;
-
-	lockdep_off();
-	au_sbilist_lock();
-	list_for_each_entry(sbinfo, &au_sbilist.head, si_list)
-		sysrq_sb(sbinfo->si_sb);
-	au_sbilist_unlock();
-	lockdep_on();
-}
-
-static struct sysrq_key_op au_sysrq_op = {
-	.handler	= au_sysrq,
-	.help_msg	= "Aufs",
-	.action_msg	= "Aufs",
-	.enable_mask	= SYSRQ_ENABLE_DUMP
-};
-
-/* ---------------------------------------------------------------------- */
-
-int __init au_sysrq_init(void)
-{
-	int err;
-	char key;
-
-	err = -1;
-	key = *aufs_sysrq_key;
-	if ('a' <= key && key <= 'z')
-		err = register_sysrq_key(key, &au_sysrq_op);
-	if (unlikely(err))
-		pr_err("err %d, sysrq=%c\n", err, key);
-	return err;
-}
-
-void au_sysrq_fin(void)
-{
-	int err;
-
-	err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op);
-	if (unlikely(err))
-		pr_err("err %d (ignored)\n", err);
-}
diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c
deleted file mode 100644
index 0929d393a..000000000
--- a/fs/aufs/vdir.c
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * virtual or vertical directory
- */
-
-#include "aufs.h"
-
-static unsigned int calc_size(int nlen)
-{
-	return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t));
-}
-
-static int set_deblk_end(union au_vdir_deblk_p *p,
-			 union au_vdir_deblk_p *deblk_end)
-{
-	if (calc_size(0) <= deblk_end->deblk - p->deblk) {
-		p->de->de_str.len = 0;
-		/* smp_mb(); */
-		return 0;
-	}
-	return -1; /* error */
-}
-
-/* returns true or false */
-static int is_deblk_end(union au_vdir_deblk_p *p,
-			union au_vdir_deblk_p *deblk_end)
-{
-	if (calc_size(0) <= deblk_end->deblk - p->deblk)
-		return !p->de->de_str.len;
-	return 1;
-}
-
-static unsigned char *last_deblk(struct au_vdir *vdir)
-{
-	return vdir->vd_deblk[vdir->vd_nblk - 1];
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* estimate the apropriate size for name hash table */
-unsigned int au_rdhash_est(loff_t sz)
-{
-	unsigned int n;
-
-	n = UINT_MAX;
-	sz >>= 10;
-	if (sz < n)
-		n = sz;
-	if (sz < AUFS_RDHASH_DEF)
-		n = AUFS_RDHASH_DEF;
-	/* pr_info("n %u\n", n); */
-	return n;
-}
-
-/*
- * the allocated memory has to be freed by
- * au_nhash_wh_free() or au_nhash_de_free().
- */
-int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp)
-{
-	struct hlist_head *head;
-	unsigned int u;
-	size_t sz;
-
-	sz = sizeof(*nhash->nh_head) * num_hash;
-	head = kmalloc(sz, gfp);
-	if (head) {
-		nhash->nh_num = num_hash;
-		nhash->nh_head = head;
-		for (u = 0; u < num_hash; u++)
-			INIT_HLIST_HEAD(head++);
-		return 0; /* success */
-	}
-
-	return -ENOMEM;
-}
-
-static void nhash_count(struct hlist_head *head)
-{
-#if 0
-	unsigned long n;
-	struct hlist_node *pos;
-
-	n = 0;
-	hlist_for_each(pos, head)
-		n++;
-	pr_info("%lu\n", n);
-#endif
-}
-
-static void au_nhash_wh_do_free(struct hlist_head *head)
-{
-	struct au_vdir_wh *pos;
-	struct hlist_node *node;
-
-	hlist_for_each_entry_safe(pos, node, head, wh_hash)
-		kfree(pos);
-}
-
-static void au_nhash_de_do_free(struct hlist_head *head)
-{
-	struct au_vdir_dehstr *pos;
-	struct hlist_node *node;
-
-	hlist_for_each_entry_safe(pos, node, head, hash)
-		au_cache_free_vdir_dehstr(pos);
-}
-
-static void au_nhash_do_free(struct au_nhash *nhash,
-			     void (*free)(struct hlist_head *head))
-{
-	unsigned int n;
-	struct hlist_head *head;
-
-	n = nhash->nh_num;
-	if (!n)
-		return;
-
-	head = nhash->nh_head;
-	while (n-- > 0) {
-		nhash_count(head);
-		free(head++);
-	}
-	kfree(nhash->nh_head);
-}
-
-void au_nhash_wh_free(struct au_nhash *whlist)
-{
-	au_nhash_do_free(whlist, au_nhash_wh_do_free);
-}
-
-static void au_nhash_de_free(struct au_nhash *delist)
-{
-	au_nhash_do_free(delist, au_nhash_de_do_free);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
-			    int limit)
-{
-	int num;
-	unsigned int u, n;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-
-	num = 0;
-	n = whlist->nh_num;
-	head = whlist->nh_head;
-	for (u = 0; u < n; u++, head++)
-		hlist_for_each_entry(pos, head, wh_hash)
-			if (pos->wh_bindex == btgt && ++num > limit)
-				return 1;
-	return 0;
-}
-
-static struct hlist_head *au_name_hash(struct au_nhash *nhash,
-				       unsigned char *name,
-				       unsigned int len)
-{
-	unsigned int v;
-	/* const unsigned int magic_bit = 12; */
-
-	AuDebugOn(!nhash->nh_num || !nhash->nh_head);
-
-	v = 0;
-	while (len--)
-		v += *name++;
-	/* v = hash_long(v, magic_bit); */
-	v %= nhash->nh_num;
-	return nhash->nh_head + v;
-}
-
-static int au_nhash_test_name(struct au_vdir_destr *str, const char *name,
-			      int nlen)
-{
-	return str->len == nlen && !memcmp(str->name, name, nlen);
-}
-
-/* returns found or not */
-int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen)
-{
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-	struct au_vdir_destr *str;
-
-	head = au_name_hash(whlist, name, nlen);
-	hlist_for_each_entry(pos, head, wh_hash) {
-		str = &pos->wh_str;
-		AuDbg("%.*s\n", str->len, str->name);
-		if (au_nhash_test_name(str, name, nlen))
-			return 1;
-	}
-	return 0;
-}
-
-/* returns found(true) or not */
-static int test_known(struct au_nhash *delist, char *name, int nlen)
-{
-	struct hlist_head *head;
-	struct au_vdir_dehstr *pos;
-	struct au_vdir_destr *str;
-
-	head = au_name_hash(delist, name, nlen);
-	hlist_for_each_entry(pos, head, hash) {
-		str = pos->str;
-		AuDbg("%.*s\n", str->len, str->name);
-		if (au_nhash_test_name(str, name, nlen))
-			return 1;
-	}
-	return 0;
-}
-
-static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino,
-			    unsigned char d_type)
-{
-#ifdef CONFIG_AUFS_SHWH
-	wh->wh_ino = ino;
-	wh->wh_type = d_type;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
-		       unsigned int d_type, aufs_bindex_t bindex,
-		       unsigned char shwh)
-{
-	int err;
-	struct au_vdir_destr *str;
-	struct au_vdir_wh *wh;
-
-	AuDbg("%.*s\n", nlen, name);
-	AuDebugOn(!whlist->nh_num || !whlist->nh_head);
-
-	err = -ENOMEM;
-	wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS);
-	if (unlikely(!wh))
-		goto out;
-
-	err = 0;
-	wh->wh_bindex = bindex;
-	if (shwh)
-		au_shwh_init_wh(wh, ino, d_type);
-	str = &wh->wh_str;
-	str->len = nlen;
-	memcpy(str->name, name, nlen);
-	hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen));
-	/* smp_mb(); */
-
-out:
-	return err;
-}
-
-static int append_deblk(struct au_vdir *vdir)
-{
-	int err;
-	unsigned long ul;
-	const unsigned int deblk_sz = vdir->vd_deblk_sz;
-	union au_vdir_deblk_p p, deblk_end;
-	unsigned char **o;
-
-	err = -ENOMEM;
-	o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1),
-		     GFP_NOFS);
-	if (unlikely(!o))
-		goto out;
-
-	vdir->vd_deblk = o;
-	p.deblk = kmalloc(deblk_sz, GFP_NOFS);
-	if (p.deblk) {
-		ul = vdir->vd_nblk++;
-		vdir->vd_deblk[ul] = p.deblk;
-		vdir->vd_last.ul = ul;
-		vdir->vd_last.p.deblk = p.deblk;
-		deblk_end.deblk = p.deblk + deblk_sz;
-		err = set_deblk_end(&p, &deblk_end);
-	}
-
-out:
-	return err;
-}
-
-static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino,
-		     unsigned int d_type, struct au_nhash *delist)
-{
-	int err;
-	unsigned int sz;
-	const unsigned int deblk_sz = vdir->vd_deblk_sz;
-	union au_vdir_deblk_p p, *room, deblk_end;
-	struct au_vdir_dehstr *dehstr;
-
-	p.deblk = last_deblk(vdir);
-	deblk_end.deblk = p.deblk + deblk_sz;
-	room = &vdir->vd_last.p;
-	AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk
-		  || !is_deblk_end(room, &deblk_end));
-
-	sz = calc_size(nlen);
-	if (unlikely(sz > deblk_end.deblk - room->deblk)) {
-		err = append_deblk(vdir);
-		if (unlikely(err))
-			goto out;
-
-		p.deblk = last_deblk(vdir);
-		deblk_end.deblk = p.deblk + deblk_sz;
-		/* smp_mb(); */
-		AuDebugOn(room->deblk != p.deblk);
-	}
-
-	err = -ENOMEM;
-	dehstr = au_cache_alloc_vdir_dehstr();
-	if (unlikely(!dehstr))
-		goto out;
-
-	dehstr->str = &room->de->de_str;
-	hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen));
-	room->de->de_ino = ino;
-	room->de->de_type = d_type;
-	room->de->de_str.len = nlen;
-	memcpy(room->de->de_str.name, name, nlen);
-
-	err = 0;
-	room->deblk += sz;
-	if (unlikely(set_deblk_end(room, &deblk_end)))
-		err = append_deblk(vdir);
-	/* smp_mb(); */
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_vdir_free(struct au_vdir *vdir)
-{
-	unsigned char **deblk;
-
-	deblk = vdir->vd_deblk;
-	while (vdir->vd_nblk--)
-		kfree(*deblk++);
-	kfree(vdir->vd_deblk);
-	au_cache_free_vdir(vdir);
-}
-
-static struct au_vdir *alloc_vdir(struct file *file)
-{
-	struct au_vdir *vdir;
-	struct super_block *sb;
-	int err;
-
-	sb = file->f_path.dentry->d_sb;
-	SiMustAnyLock(sb);
-
-	err = -ENOMEM;
-	vdir = au_cache_alloc_vdir();
-	if (unlikely(!vdir))
-		goto out;
-
-	vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS);
-	if (unlikely(!vdir->vd_deblk))
-		goto out_free;
-
-	vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk;
-	if (!vdir->vd_deblk_sz) {
-		/* estimate the apropriate size for deblk */
-		vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL);
-		/* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */
-	}
-	vdir->vd_nblk = 0;
-	vdir->vd_version = 0;
-	vdir->vd_jiffy = 0;
-	err = append_deblk(vdir);
-	if (!err)
-		return vdir; /* success */
-
-	kfree(vdir->vd_deblk);
-
-out_free:
-	au_cache_free_vdir(vdir);
-out:
-	vdir = ERR_PTR(err);
-	return vdir;
-}
-
-static int reinit_vdir(struct au_vdir *vdir)
-{
-	int err;
-	union au_vdir_deblk_p p, deblk_end;
-
-	while (vdir->vd_nblk > 1) {
-		kfree(vdir->vd_deblk[vdir->vd_nblk - 1]);
-		/* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */
-		vdir->vd_nblk--;
-	}
-	p.deblk = vdir->vd_deblk[0];
-	deblk_end.deblk = p.deblk + vdir->vd_deblk_sz;
-	err = set_deblk_end(&p, &deblk_end);
-	/* keep vd_dblk_sz */
-	vdir->vd_last.ul = 0;
-	vdir->vd_last.p.deblk = vdir->vd_deblk[0];
-	vdir->vd_version = 0;
-	vdir->vd_jiffy = 0;
-	/* smp_mb(); */
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define AuFillVdir_CALLED	1
-#define AuFillVdir_WHABLE	(1 << 1)
-#define AuFillVdir_SHWH		(1 << 2)
-#define au_ftest_fillvdir(flags, name)	((flags) & AuFillVdir_##name)
-#define au_fset_fillvdir(flags, name) \
-	do { (flags) |= AuFillVdir_##name; } while (0)
-#define au_fclr_fillvdir(flags, name) \
-	do { (flags) &= ~AuFillVdir_##name; } while (0)
-
-#ifndef CONFIG_AUFS_SHWH
-#undef AuFillVdir_SHWH
-#define AuFillVdir_SHWH		0
-#endif
-
-struct fillvdir_arg {
-	struct dir_context	ctx;
-	struct file		*file;
-	struct au_vdir		*vdir;
-	struct au_nhash		delist;
-	struct au_nhash		whlist;
-	aufs_bindex_t		bindex;
-	unsigned int		flags;
-	int			err;
-};
-
-static int fillvdir(struct dir_context *ctx, const char *__name, int nlen,
-		    loff_t offset __maybe_unused, u64 h_ino,
-		    unsigned int d_type)
-{
-	struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx);
-	char *name = (void *)__name;
-	struct super_block *sb;
-	ino_t ino;
-	const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH);
-
-	arg->err = 0;
-	sb = arg->file->f_path.dentry->d_sb;
-	au_fset_fillvdir(arg->flags, CALLED);
-	/* smp_mb(); */
-	if (nlen <= AUFS_WH_PFX_LEN
-	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
-		if (test_known(&arg->delist, name, nlen)
-		    || au_nhash_test_known_wh(&arg->whlist, name, nlen))
-			goto out; /* already exists or whiteouted */
-
-		arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino);
-		if (!arg->err) {
-			if (unlikely(nlen > AUFS_MAX_NAMELEN))
-				d_type = DT_UNKNOWN;
-			arg->err = append_de(arg->vdir, name, nlen, ino,
-					     d_type, &arg->delist);
-		}
-	} else if (au_ftest_fillvdir(arg->flags, WHABLE)) {
-		name += AUFS_WH_PFX_LEN;
-		nlen -= AUFS_WH_PFX_LEN;
-		if (au_nhash_test_known_wh(&arg->whlist, name, nlen))
-			goto out; /* already whiteouted */
-
-		if (shwh)
-			arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type,
-					     &ino);
-		if (!arg->err) {
-			if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN)
-				d_type = DT_UNKNOWN;
-			arg->err = au_nhash_append_wh
-				(&arg->whlist, name, nlen, ino, d_type,
-				 arg->bindex, shwh);
-		}
-	}
-
-out:
-	if (!arg->err)
-		arg->vdir->vd_jiffy = jiffies;
-	/* smp_mb(); */
-	AuTraceErr(arg->err);
-	return arg->err;
-}
-
-static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir,
-			  struct au_nhash *whlist, struct au_nhash *delist)
-{
-#ifdef CONFIG_AUFS_SHWH
-	int err;
-	unsigned int nh, u;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-	struct hlist_node *n;
-	char *p, *o;
-	struct au_vdir_destr *destr;
-
-	AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH));
-
-	err = -ENOMEM;
-	o = p = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!p))
-		goto out;
-
-	err = 0;
-	nh = whlist->nh_num;
-	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
-	p += AUFS_WH_PFX_LEN;
-	for (u = 0; u < nh; u++) {
-		head = whlist->nh_head + u;
-		hlist_for_each_entry_safe(pos, n, head, wh_hash) {
-			destr = &pos->wh_str;
-			memcpy(p, destr->name, destr->len);
-			err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN,
-					pos->wh_ino, pos->wh_type, delist);
-			if (unlikely(err))
-				break;
-		}
-	}
-
-	free_page((unsigned long)o);
-
-out:
-	AuTraceErr(err);
-	return err;
-#else
-	return 0;
-#endif
-}
-
-static int au_do_read_vdir(struct fillvdir_arg *arg)
-{
-	int err;
-	unsigned int rdhash;
-	loff_t offset;
-	aufs_bindex_t bend, bindex, bstart;
-	unsigned char shwh;
-	struct file *hf, *file;
-	struct super_block *sb;
-
-	file = arg->file;
-	sb = file->f_path.dentry->d_sb;
-	SiMustAnyLock(sb);
-
-	rdhash = au_sbi(sb)->si_rdhash;
-	if (!rdhash)
-		rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL));
-	err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS);
-	if (unlikely(err))
-		goto out_delist;
-
-	err = 0;
-	arg->flags = 0;
-	shwh = 0;
-	if (au_opt_test(au_mntflags(sb), SHWH)) {
-		shwh = 1;
-		au_fset_fillvdir(arg->flags, SHWH);
-	}
-	bstart = au_fbstart(file);
-	bend = au_fbend_dir(file);
-	for (bindex = bstart; !err && bindex <= bend; bindex++) {
-		hf = au_hf_dir(file, bindex);
-		if (!hf)
-			continue;
-
-		offset = vfsub_llseek(hf, 0, SEEK_SET);
-		err = offset;
-		if (unlikely(offset))
-			break;
-
-		arg->bindex = bindex;
-		au_fclr_fillvdir(arg->flags, WHABLE);
-		if (shwh
-		    || (bindex != bend
-			&& au_br_whable(au_sbr_perm(sb, bindex))))
-			au_fset_fillvdir(arg->flags, WHABLE);
-		do {
-			arg->err = 0;
-			au_fclr_fillvdir(arg->flags, CALLED);
-			/* smp_mb(); */
-			err = vfsub_iterate_dir(hf, &arg->ctx);
-			if (err >= 0)
-				err = arg->err;
-		} while (!err && au_ftest_fillvdir(arg->flags, CALLED));
-
-		/*
-		 * dir_relax() may be good for concurrency, but aufs should not
-		 * use it since it will cause a lockdep problem.
-		 */
-	}
-
-	if (!err && shwh)
-		err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist);
-
-	au_nhash_wh_free(&arg->whlist);
-
-out_delist:
-	au_nhash_de_free(&arg->delist);
-out:
-	return err;
-}
-
-static int read_vdir(struct file *file, int may_read)
-{
-	int err;
-	unsigned long expire;
-	unsigned char do_read;
-	struct fillvdir_arg arg = {
-		.ctx = {
-			.actor = fillvdir
-		}
-	};
-	struct inode *inode;
-	struct au_vdir *vdir, *allocated;
-
-	err = 0;
-	inode = file_inode(file);
-	IMustLock(inode);
-	SiMustAnyLock(inode->i_sb);
-
-	allocated = NULL;
-	do_read = 0;
-	expire = au_sbi(inode->i_sb)->si_rdcache;
-	vdir = au_ivdir(inode);
-	if (!vdir) {
-		do_read = 1;
-		vdir = alloc_vdir(file);
-		err = PTR_ERR(vdir);
-		if (IS_ERR(vdir))
-			goto out;
-		err = 0;
-		allocated = vdir;
-	} else if (may_read
-		   && (inode->i_version != vdir->vd_version
-		       || time_after(jiffies, vdir->vd_jiffy + expire))) {
-		do_read = 1;
-		err = reinit_vdir(vdir);
-		if (unlikely(err))
-			goto out;
-	}
-
-	if (!do_read)
-		return 0; /* success */
-
-	arg.file = file;
-	arg.vdir = vdir;
-	err = au_do_read_vdir(&arg);
-	if (!err) {
-		/* file->f_pos = 0; */ /* todo: ctx->pos? */
-		vdir->vd_version = inode->i_version;
-		vdir->vd_last.ul = 0;
-		vdir->vd_last.p.deblk = vdir->vd_deblk[0];
-		if (allocated)
-			au_set_ivdir(inode, allocated);
-	} else if (allocated)
-		au_vdir_free(allocated);
-
-out:
-	return err;
-}
-
-static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src)
-{
-	int err, rerr;
-	unsigned long ul, n;
-	const unsigned int deblk_sz = src->vd_deblk_sz;
-
-	AuDebugOn(tgt->vd_nblk != 1);
-
-	err = -ENOMEM;
-	if (tgt->vd_nblk < src->vd_nblk) {
-		unsigned char **p;
-
-		p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk,
-			     GFP_NOFS);
-		if (unlikely(!p))
-			goto out;
-		tgt->vd_deblk = p;
-	}
-
-	if (tgt->vd_deblk_sz != deblk_sz) {
-		unsigned char *p;
-
-		tgt->vd_deblk_sz = deblk_sz;
-		p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS);
-		if (unlikely(!p))
-			goto out;
-		tgt->vd_deblk[0] = p;
-	}
-	memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz);
-	tgt->vd_version = src->vd_version;
-	tgt->vd_jiffy = src->vd_jiffy;
-
-	n = src->vd_nblk;
-	for (ul = 1; ul < n; ul++) {
-		tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz,
-					    GFP_NOFS);
-		if (unlikely(!tgt->vd_deblk[ul]))
-			goto out;
-		tgt->vd_nblk++;
-	}
-	tgt->vd_nblk = n;
-	tgt->vd_last.ul = tgt->vd_last.ul;
-	tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul];
-	tgt->vd_last.p.deblk += src->vd_last.p.deblk
-		- src->vd_deblk[src->vd_last.ul];
-	/* smp_mb(); */
-	return 0; /* success */
-
-out:
-	rerr = reinit_vdir(tgt);
-	BUG_ON(rerr);
-	return err;
-}
-
-int au_vdir_init(struct file *file)
-{
-	int err;
-	struct inode *inode;
-	struct au_vdir *vdir_cache, *allocated;
-
-	/* test file->f_pos here instead of ctx->pos */
-	err = read_vdir(file, !file->f_pos);
-	if (unlikely(err))
-		goto out;
-
-	allocated = NULL;
-	vdir_cache = au_fvdir_cache(file);
-	if (!vdir_cache) {
-		vdir_cache = alloc_vdir(file);
-		err = PTR_ERR(vdir_cache);
-		if (IS_ERR(vdir_cache))
-			goto out;
-		allocated = vdir_cache;
-	} else if (!file->f_pos && vdir_cache->vd_version != file->f_version) {
-		/* test file->f_pos here instead of ctx->pos */
-		err = reinit_vdir(vdir_cache);
-		if (unlikely(err))
-			goto out;
-	} else
-		return 0; /* success */
-
-	inode = file_inode(file);
-	err = copy_vdir(vdir_cache, au_ivdir(inode));
-	if (!err) {
-		file->f_version = inode->i_version;
-		if (allocated)
-			au_set_fvdir_cache(file, allocated);
-	} else if (allocated)
-		au_vdir_free(allocated);
-
-out:
-	return err;
-}
-
-static loff_t calc_offset(struct au_vdir *vdir)
-{
-	loff_t offset;
-	union au_vdir_deblk_p p;
-
-	p.deblk = vdir->vd_deblk[vdir->vd_last.ul];
-	offset = vdir->vd_last.p.deblk - p.deblk;
-	offset += vdir->vd_deblk_sz * vdir->vd_last.ul;
-	return offset;
-}
-
-/* returns true or false */
-static int seek_vdir(struct file *file, struct dir_context *ctx)
-{
-	int valid;
-	unsigned int deblk_sz;
-	unsigned long ul, n;
-	loff_t offset;
-	union au_vdir_deblk_p p, deblk_end;
-	struct au_vdir *vdir_cache;
-
-	valid = 1;
-	vdir_cache = au_fvdir_cache(file);
-	offset = calc_offset(vdir_cache);
-	AuDbg("offset %lld\n", offset);
-	if (ctx->pos == offset)
-		goto out;
-
-	vdir_cache->vd_last.ul = 0;
-	vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0];
-	if (!ctx->pos)
-		goto out;
-
-	valid = 0;
-	deblk_sz = vdir_cache->vd_deblk_sz;
-	ul = div64_u64(ctx->pos, deblk_sz);
-	AuDbg("ul %lu\n", ul);
-	if (ul >= vdir_cache->vd_nblk)
-		goto out;
-
-	n = vdir_cache->vd_nblk;
-	for (; ul < n; ul++) {
-		p.deblk = vdir_cache->vd_deblk[ul];
-		deblk_end.deblk = p.deblk + deblk_sz;
-		offset = ul;
-		offset *= deblk_sz;
-		while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) {
-			unsigned int l;
-
-			l = calc_size(p.de->de_str.len);
-			offset += l;
-			p.deblk += l;
-		}
-		if (!is_deblk_end(&p, &deblk_end)) {
-			valid = 1;
-			vdir_cache->vd_last.ul = ul;
-			vdir_cache->vd_last.p = p;
-			break;
-		}
-	}
-
-out:
-	/* smp_mb(); */
-	AuTraceErr(!valid);
-	return valid;
-}
-
-int au_vdir_fill_de(struct file *file, struct dir_context *ctx)
-{
-	unsigned int l, deblk_sz;
-	union au_vdir_deblk_p deblk_end;
-	struct au_vdir *vdir_cache;
-	struct au_vdir_de *de;
-
-	vdir_cache = au_fvdir_cache(file);
-	if (!seek_vdir(file, ctx))
-		return 0;
-
-	deblk_sz = vdir_cache->vd_deblk_sz;
-	while (1) {
-		deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
-		deblk_end.deblk += deblk_sz;
-		while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) {
-			de = vdir_cache->vd_last.p.de;
-			AuDbg("%.*s, off%lld, i%lu, dt%d\n",
-			      de->de_str.len, de->de_str.name, ctx->pos,
-			      (unsigned long)de->de_ino, de->de_type);
-			if (unlikely(!dir_emit(ctx, de->de_str.name,
-					       de->de_str.len, de->de_ino,
-					       de->de_type))) {
-				/* todo: ignore the error caused by udba? */
-				/* return err; */
-				return 0;
-			}
-
-			l = calc_size(de->de_str.len);
-			vdir_cache->vd_last.p.deblk += l;
-			ctx->pos += l;
-		}
-		if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) {
-			vdir_cache->vd_last.ul++;
-			vdir_cache->vd_last.p.deblk
-				= vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
-			ctx->pos = deblk_sz * vdir_cache->vd_last.ul;
-			continue;
-		}
-		break;
-	}
-
-	/* smp_mb(); */
-	return 0;
-}
diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c
deleted file mode 100644
index d2af7ce8d..000000000
--- a/fs/aufs/vfsub.c
+++ /dev/null
@@ -1,848 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sub-routines for VFS
- */
-
-#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/splice.h>
-#include "aufs.h"
-
-int vfsub_update_h_iattr(struct path *h_path, int *did)
-{
-	int err;
-	struct kstat st;
-	struct super_block *h_sb;
-
-	/* for remote fs, leave work for its getattr or d_revalidate */
-	/* for bad i_attr fs, handle them in aufs_getattr() */
-	/* still some fs may acquire i_mutex. we need to skip them */
-	err = 0;
-	if (!did)
-		did = &err;
-	h_sb = h_path->dentry->d_sb;
-	*did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb));
-	if (*did)
-		err = vfs_getattr(h_path, &st);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct file *vfsub_dentry_open(struct path *path, int flags)
-{
-	struct file *file;
-
-	file = dentry_open(path, flags /* | __FMODE_NONOTIFY */,
-			   current_cred());
-	if (!IS_ERR_OR_NULL(file)
-	    && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
-		i_readcount_inc(d_inode(path->dentry));
-
-	return file;
-}
-
-struct file *vfsub_filp_open(const char *path, int oflags, int mode)
-{
-	struct file *file;
-
-	lockdep_off();
-	file = filp_open(path,
-			 oflags /* | __FMODE_NONOTIFY */,
-			 mode);
-	lockdep_on();
-	if (IS_ERR(file))
-		goto out;
-	vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-
-out:
-	return file;
-}
-
-/*
- * Ideally this function should call VFS:do_last() in order to keep all its
- * checkings. But it is very hard for aufs to regenerate several VFS internal
- * structure such as nameidata. This is a second (or third) best approach.
- * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open().
- */
-int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
-		      struct vfsub_aopen_args *args, struct au_branch *br)
-{
-	int err;
-	struct file *file = args->file;
-	/* copied from linux/fs/namei.c:atomic_open() */
-	struct dentry *const DENTRY_NOT_SET = (void *)-1UL;
-
-	IMustLock(dir);
-	AuDebugOn(!dir->i_op->atomic_open);
-
-	err = au_br_test_oflag(args->open_flag, br);
-	if (unlikely(err))
-		goto out;
-
-	args->file->f_path.dentry = DENTRY_NOT_SET;
-	args->file->f_path.mnt = au_br_mnt(br);
-	err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag,
-				     args->create_mode, args->opened);
-	if (err >= 0) {
-		/* some filesystems don't set FILE_CREATED while succeeded? */
-		if (*args->opened & FILE_CREATED)
-			fsnotify_create(dir, dentry);
-	} else
-		goto out;
-
-
-	if (!err) {
-		/* todo: call VFS:may_open() here */
-		err = open_check_o_direct(file);
-		/* todo: ima_file_check() too? */
-		if (!err && (args->open_flag & __FMODE_EXEC))
-			err = deny_write_access(file);
-		if (unlikely(err))
-			/* note that the file is created and still opened */
-			goto out;
-	}
-
-	atomic_inc(&br->br_count);
-	fsnotify_open(file);
-
-out:
-	return err;
-}
-
-int vfsub_kern_path(const char *name, unsigned int flags, struct path *path)
-{
-	int err;
-
-	err = kern_path(name, flags, path);
-	if (!err && d_is_positive(path->dentry))
-		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
-				    int len)
-{
-	struct path path = {
-		.mnt = NULL
-	};
-
-	/* VFS checks it too, but by WARN_ON_ONCE() */
-	IMustLock(d_inode(parent));
-
-	path.dentry = lookup_one_len(name, parent, len);
-	if (IS_ERR(path.dentry))
-		goto out;
-	if (d_is_positive(path.dentry))
-		vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/
-
-out:
-	AuTraceErrPtr(path.dentry);
-	return path.dentry;
-}
-
-void vfsub_call_lkup_one(void *args)
-{
-	struct vfsub_lkup_one_args *a = args;
-	*a->errp = vfsub_lkup_one(a->name, a->parent);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
-				 struct dentry *d2, struct au_hinode *hdir2)
-{
-	struct dentry *d;
-
-	lockdep_off();
-	d = lock_rename(d1, d2);
-	lockdep_on();
-	au_hn_suspend(hdir1);
-	if (hdir1 != hdir2)
-		au_hn_suspend(hdir2);
-
-	return d;
-}
-
-void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
-			 struct dentry *d2, struct au_hinode *hdir2)
-{
-	au_hn_resume(hdir1);
-	if (hdir1 != hdir2)
-		au_hn_resume(hdir2);
-	lockdep_off();
-	unlock_rename(d1, d2);
-	lockdep_on();
-}
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_mknod(path, d, mode, 0);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_create(dir, path->dentry, mode, want_excl);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_symlink(struct inode *dir, struct path *path, const char *symname)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_symlink(path, d, symname);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_symlink(dir, path->dentry, symname);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_mknod(path, d, mode, new_encode_dev(dev));
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_mknod(dir, path->dentry, mode, dev);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-static int au_test_nlink(struct inode *inode)
-{
-	const unsigned int link_max = UINT_MAX >> 1; /* rough margin */
-
-	if (!au_test_fs_no_limit_nlink(inode->i_sb)
-	    || inode->i_nlink < link_max)
-		return 0;
-	return -EMLINK;
-}
-
-int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path,
-	       struct inode **delegated_inode)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	err = au_test_nlink(d_inode(src_dentry));
-	if (unlikely(err))
-		return err;
-
-	/* we don't call may_linkat() */
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_link(src_dentry, path, d);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_link(src_dentry, dir, path->dentry, delegated_inode);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		/* fuse has different memory inode for the same inumber */
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-			tmp.dentry = src_dentry;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry,
-		 struct inode *dir, struct path *path,
-		 struct inode **delegated_inode)
-{
-	int err;
-	struct path tmp = {
-		.mnt	= path->mnt
-	};
-	struct dentry *d;
-
-	IMustLock(dir);
-	IMustLock(src_dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	tmp.dentry = src_dentry->d_parent;
-	err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_rename(src_dir, src_dentry, dir, path->dentry,
-			 delegated_inode, /*flags*/0);
-	lockdep_on();
-	if (!err) {
-		int did;
-
-		tmp.dentry = d->d_parent;
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = src_dentry;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-			tmp.dentry = src_dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_mkdir(struct inode *dir, struct path *path, int mode)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_mkdir(path, d, mode);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_mkdir(dir, path->dentry, mode);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_rmdir(struct inode *dir, struct path *path)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_rmdir(path, d);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_rmdir(dir, path->dentry);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = {
-			.dentry	= path->dentry->d_parent,
-			.mnt	= path->mnt
-		};
-
-		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: support mmap_sem? */
-ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
-		     loff_t *ppos)
-{
-	ssize_t err;
-
-	lockdep_off();
-	err = vfs_read(file, ubuf, count, ppos);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-/* todo: kernel_read()? */
-ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
-		     loff_t *ppos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = vfsub_read_u(file, buf.u, count, ppos);
-	set_fs(oldfs);
-	return err;
-}
-
-ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
-		      loff_t *ppos)
-{
-	ssize_t err;
-
-	lockdep_off();
-	err = vfs_write(file, ubuf, count, ppos);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		const char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = vfsub_write_u(file, buf.u, count, ppos);
-	set_fs(oldfs);
-	return err;
-}
-
-int vfsub_flush(struct file *file, fl_owner_t id)
-{
-	int err;
-
-	err = 0;
-	if (file->f_op->flush) {
-		if (!au_test_nfs(file->f_path.dentry->d_sb))
-			err = file->f_op->flush(file, id);
-		else {
-			lockdep_off();
-			err = file->f_op->flush(file, id);
-			lockdep_on();
-		}
-		if (!err)
-			vfsub_update_h_iattr(&file->f_path, /*did*/NULL);
-		/*ignore*/
-	}
-	return err;
-}
-
-int vfsub_iterate_dir(struct file *file, struct dir_context *ctx)
-{
-	int err;
-
-	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
-
-	lockdep_off();
-	err = iterate_dir(file, ctx);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-long vfsub_splice_to(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags)
-{
-	long err;
-
-	lockdep_off();
-	err = do_splice_to(in, ppos, pipe, len, flags);
-	lockdep_on();
-	file_accessed(in);
-	if (err >= 0)
-		vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
-		       loff_t *ppos, size_t len, unsigned int flags)
-{
-	long err;
-
-	lockdep_off();
-	err = do_splice_from(pipe, out, ppos, len, flags);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-int vfsub_fsync(struct file *file, struct path *path, int datasync)
-{
-	int err;
-
-	/* file can be NULL */
-	lockdep_off();
-	err = vfs_fsync(file, datasync);
-	lockdep_on();
-	if (!err) {
-		if (!path) {
-			AuDebugOn(!file);
-			path = &file->f_path;
-		}
-		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
-	}
-	return err;
-}
-
-/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */
-int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
-		struct file *h_file)
-{
-	int err;
-	struct inode *h_inode;
-	struct super_block *h_sb;
-
-	if (!h_file) {
-		err = vfsub_truncate(h_path, length);
-		goto out;
-	}
-
-	h_inode = d_inode(h_path->dentry);
-	h_sb = h_inode->i_sb;
-	lockdep_off();
-	sb_start_write(h_sb);
-	lockdep_on();
-	err = locks_verify_truncate(h_inode, h_file, length);
-	if (!err)
-		err = security_path_truncate(h_path);
-	if (!err) {
-		lockdep_off();
-		err = do_truncate(h_path->dentry, length, attr, h_file);
-		lockdep_on();
-	}
-	lockdep_off();
-	sb_end_write(h_sb);
-	lockdep_on();
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_vfsub_mkdir_args {
-	int *errp;
-	struct inode *dir;
-	struct path *path;
-	int mode;
-};
-
-static void au_call_vfsub_mkdir(void *args)
-{
-	struct au_vfsub_mkdir_args *a = args;
-	*a->errp = vfsub_mkdir(a->dir, a->path, a->mode);
-}
-
-int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode)
-{
-	int err, do_sio, wkq_err;
-
-	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
-	if (!do_sio) {
-		lockdep_off();
-		err = vfsub_mkdir(dir, path, mode);
-		lockdep_on();
-	} else {
-		struct au_vfsub_mkdir_args args = {
-			.errp	= &err,
-			.dir	= dir,
-			.path	= path,
-			.mode	= mode
-		};
-		wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-
-struct au_vfsub_rmdir_args {
-	int *errp;
-	struct inode *dir;
-	struct path *path;
-};
-
-static void au_call_vfsub_rmdir(void *args)
-{
-	struct au_vfsub_rmdir_args *a = args;
-	*a->errp = vfsub_rmdir(a->dir, a->path);
-}
-
-int vfsub_sio_rmdir(struct inode *dir, struct path *path)
-{
-	int err, do_sio, wkq_err;
-
-	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
-	if (!do_sio) {
-		lockdep_off();
-		err = vfsub_rmdir(dir, path);
-		lockdep_on();
-	} else {
-		struct au_vfsub_rmdir_args args = {
-			.errp	= &err,
-			.dir	= dir,
-			.path	= path
-		};
-		wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct notify_change_args {
-	int *errp;
-	struct path *path;
-	struct iattr *ia;
-	struct inode **delegated_inode;
-};
-
-static void call_notify_change(void *args)
-{
-	struct notify_change_args *a = args;
-	struct inode *h_inode;
-
-	h_inode = d_inode(a->path->dentry);
-	IMustLock(h_inode);
-
-	*a->errp = -EPERM;
-	if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) {
-		lockdep_off();
-		*a->errp = notify_change(a->path->dentry, a->ia,
-					 a->delegated_inode);
-		lockdep_on();
-		if (!*a->errp)
-			vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/
-	}
-	AuTraceErr(*a->errp);
-}
-
-int vfsub_notify_change(struct path *path, struct iattr *ia,
-			struct inode **delegated_inode)
-{
-	int err;
-	struct notify_change_args args = {
-		.errp			= &err,
-		.path			= path,
-		.ia			= ia,
-		.delegated_inode	= delegated_inode
-	};
-
-	call_notify_change(&args);
-
-	return err;
-}
-
-int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
-			    struct inode **delegated_inode)
-{
-	int err, wkq_err;
-	struct notify_change_args args = {
-		.errp			= &err,
-		.path			= path,
-		.ia			= ia,
-		.delegated_inode	= delegated_inode
-	};
-
-	wkq_err = au_wkq_wait(call_notify_change, &args);
-	if (unlikely(wkq_err))
-		err = wkq_err;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct unlink_args {
-	int *errp;
-	struct inode *dir;
-	struct path *path;
-	struct inode **delegated_inode;
-};
-
-static void call_unlink(void *args)
-{
-	struct unlink_args *a = args;
-	struct dentry *d = a->path->dentry;
-	struct inode *h_inode;
-	const int stop_sillyrename = (au_test_nfs(d->d_sb)
-				      && au_dcount(d) == 1);
-
-	IMustLock(a->dir);
-
-	a->path->dentry = d->d_parent;
-	*a->errp = security_path_unlink(a->path, d);
-	a->path->dentry = d;
-	if (unlikely(*a->errp))
-		return;
-
-	if (!stop_sillyrename)
-		dget(d);
-	h_inode = NULL;
-	if (d_is_positive(d)) {
-		h_inode = d_inode(d);
-		ihold(h_inode);
-	}
-
-	lockdep_off();
-	*a->errp = vfs_unlink(a->dir, d, a->delegated_inode);
-	lockdep_on();
-	if (!*a->errp) {
-		struct path tmp = {
-			.dentry = d->d_parent,
-			.mnt	= a->path->mnt
-		};
-		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
-	}
-
-	if (!stop_sillyrename)
-		dput(d);
-	if (h_inode)
-		iput(h_inode);
-
-	AuTraceErr(*a->errp);
-}
-
-/*
- * @dir: must be locked.
- * @dentry: target dentry.
- */
-int vfsub_unlink(struct inode *dir, struct path *path,
-		 struct inode **delegated_inode, int force)
-{
-	int err;
-	struct unlink_args args = {
-		.errp			= &err,
-		.dir			= dir,
-		.path			= path,
-		.delegated_inode	= delegated_inode
-	};
-
-	if (!force)
-		call_unlink(&args);
-	else {
-		int wkq_err;
-
-		wkq_err = au_wkq_wait(call_unlink, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h
deleted file mode 100644
index 5c89f8e56..000000000
--- a/fs/aufs/vfsub.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * sub-routines for VFS
- */
-
-#ifndef __AUFS_VFSUB_H__
-#define __AUFS_VFSUB_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/xattr.h>
-#include "debug.h"
-
-/* copied from linux/fs/internal.h */
-/* todo: BAD approach!! */
-extern void __mnt_drop_write(struct vfsmount *);
-extern spinlock_t inode_sb_list_lock;
-extern int open_check_o_direct(struct file *f);
-
-/* ---------------------------------------------------------------------- */
-
-/* lock subclass for lower inode */
-/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */
-/* reduce? gave up. */
-enum {
-	AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */
-	AuLsc_I_PARENT,		/* lower inode, parent first */
-	AuLsc_I_PARENT2,	/* copyup dirs */
-	AuLsc_I_PARENT3,	/* copyup wh */
-	AuLsc_I_CHILD,
-	AuLsc_I_CHILD2,
-	AuLsc_I_End
-};
-
-/* to debug easier, do not make them inlined functions */
-#define MtxMustLock(mtx)	AuDebugOn(!mutex_is_locked(mtx))
-#define IMustLock(i)		MtxMustLock(&(i)->i_mutex)
-
-/* ---------------------------------------------------------------------- */
-
-static inline void vfsub_drop_nlink(struct inode *inode)
-{
-	AuDebugOn(!inode->i_nlink);
-	drop_nlink(inode);
-}
-
-static inline void vfsub_dead_dir(struct inode *inode)
-{
-	AuDebugOn(!S_ISDIR(inode->i_mode));
-	inode->i_flags |= S_DEAD;
-	clear_nlink(inode);
-}
-
-static inline int vfsub_native_ro(struct inode *inode)
-{
-	return (inode->i_sb->s_flags & MS_RDONLY)
-		|| IS_RDONLY(inode)
-		/* || IS_APPEND(inode) */
-		|| IS_IMMUTABLE(inode);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_update_h_iattr(struct path *h_path, int *did);
-struct file *vfsub_dentry_open(struct path *path, int flags);
-struct file *vfsub_filp_open(const char *path, int oflags, int mode);
-struct vfsub_aopen_args {
-	struct file	*file;
-	unsigned int	open_flag;
-	umode_t		create_mode;
-	int		*opened;
-};
-struct au_branch;
-int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
-		      struct vfsub_aopen_args *args, struct au_branch *br);
-int vfsub_kern_path(const char *name, unsigned int flags, struct path *path);
-
-struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
-				    int len);
-
-struct vfsub_lkup_one_args {
-	struct dentry **errp;
-	struct qstr *name;
-	struct dentry *parent;
-};
-
-static inline struct dentry *vfsub_lkup_one(struct qstr *name,
-					    struct dentry *parent)
-{
-	return vfsub_lookup_one_len(name->name, parent, name->len);
-}
-
-void vfsub_call_lkup_one(void *args);
-
-/* ---------------------------------------------------------------------- */
-
-static inline int vfsub_mnt_want_write(struct vfsmount *mnt)
-{
-	int err;
-
-	lockdep_off();
-	err = mnt_want_write(mnt);
-	lockdep_on();
-	return err;
-}
-
-static inline void vfsub_mnt_drop_write(struct vfsmount *mnt)
-{
-	lockdep_off();
-	mnt_drop_write(mnt);
-	lockdep_on();
-}
-
-#if 0 /* reserved */
-static inline void vfsub_mnt_drop_write_file(struct file *file)
-{
-	lockdep_off();
-	mnt_drop_write_file(file);
-	lockdep_on();
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-struct au_hinode;
-struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
-				 struct dentry *d2, struct au_hinode *hdir2);
-void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
-			 struct dentry *d2, struct au_hinode *hdir2);
-
-int vfsub_create(struct inode *dir, struct path *path, int mode,
-		 bool want_excl);
-int vfsub_symlink(struct inode *dir, struct path *path,
-		  const char *symname);
-int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev);
-int vfsub_link(struct dentry *src_dentry, struct inode *dir,
-	       struct path *path, struct inode **delegated_inode);
-int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry,
-		 struct inode *hdir, struct path *path,
-		 struct inode **delegated_inode);
-int vfsub_mkdir(struct inode *dir, struct path *path, int mode);
-int vfsub_rmdir(struct inode *dir, struct path *path);
-
-/* ---------------------------------------------------------------------- */
-
-ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
-		     loff_t *ppos);
-ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
-			loff_t *ppos);
-ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
-		      loff_t *ppos);
-ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count,
-		      loff_t *ppos);
-int vfsub_flush(struct file *file, fl_owner_t id);
-int vfsub_iterate_dir(struct file *file, struct dir_context *ctx);
-
-static inline loff_t vfsub_f_size_read(struct file *file)
-{
-	return i_size_read(file_inode(file));
-}
-
-static inline unsigned int vfsub_file_flags(struct file *file)
-{
-	unsigned int flags;
-
-	spin_lock(&file->f_lock);
-	flags = file->f_flags;
-	spin_unlock(&file->f_lock);
-
-	return flags;
-}
-
-#if 0 /* reserved */
-static inline void vfsub_file_accessed(struct file *h_file)
-{
-	file_accessed(h_file);
-	vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/
-}
-#endif
-
-static inline void vfsub_touch_atime(struct vfsmount *h_mnt,
-				     struct dentry *h_dentry)
-{
-	struct path h_path = {
-		.dentry	= h_dentry,
-		.mnt	= h_mnt
-	};
-	touch_atime(&h_path);
-	vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/
-}
-
-static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts,
-				    int flags)
-{
-	return generic_update_time(h_inode, ts, flags);
-	/* no vfsub_update_h_iattr() since we don't have struct path */
-}
-
-long vfsub_splice_to(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags);
-long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
-		       loff_t *ppos, size_t len, unsigned int flags);
-
-static inline long vfsub_truncate(struct path *path, loff_t length)
-{
-	long err;
-
-	lockdep_off();
-	err = vfs_truncate(path, length);
-	lockdep_on();
-	return err;
-}
-
-int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
-		struct file *h_file);
-int vfsub_fsync(struct file *file, struct path *path, int datasync);
-
-/* ---------------------------------------------------------------------- */
-
-static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t err;
-
-	lockdep_off();
-	err = vfs_llseek(file, offset, origin);
-	lockdep_on();
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode);
-int vfsub_sio_rmdir(struct inode *dir, struct path *path);
-int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
-			    struct inode **delegated_inode);
-int vfsub_notify_change(struct path *path, struct iattr *ia,
-			struct inode **delegated_inode);
-int vfsub_unlink(struct inode *dir, struct path *path,
-		 struct inode **delegated_inode, int force);
-
-/* ---------------------------------------------------------------------- */
-
-static inline int vfsub_setxattr(struct dentry *dentry, const char *name,
-				 const void *value, size_t size, int flags)
-{
-	int err;
-
-	lockdep_off();
-	err = vfs_setxattr(dentry, name, value, size, flags);
-	lockdep_on();
-
-	return err;
-}
-
-static inline int vfsub_removexattr(struct dentry *dentry, const char *name)
-{
-	int err;
-
-	lockdep_off();
-	err = vfs_removexattr(dentry, name);
-	lockdep_on();
-
-	return err;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_VFSUB_H__ */
diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c
deleted file mode 100644
index 3949efb50..000000000
--- a/fs/aufs/wbr_policy.c
+++ /dev/null
@@ -1,765 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * policies for selecting one among multiple writable branches
- */
-
-#include <linux/statfs.h>
-#include "aufs.h"
-
-/* subset of cpup_attr() */
-static noinline_for_stack
-int au_cpdown_attr(struct path *h_path, struct dentry *h_src)
-{
-	int err, sbits;
-	struct iattr ia;
-	struct inode *h_isrc;
-
-	h_isrc = d_inode(h_src);
-	ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID;
-	ia.ia_mode = h_isrc->i_mode;
-	ia.ia_uid = h_isrc->i_uid;
-	ia.ia_gid = h_isrc->i_gid;
-	sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID));
-	au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags);
-	/* no delegation since it is just created */
-	err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
-
-	/* is this nfs only? */
-	if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) {
-		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
-		ia.ia_mode = h_isrc->i_mode;
-		err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
-	}
-
-	return err;
-}
-
-#define AuCpdown_PARENT_OPQ	1
-#define AuCpdown_WHED		(1 << 1)
-#define AuCpdown_MADE_DIR	(1 << 2)
-#define AuCpdown_DIROPQ		(1 << 3)
-#define au_ftest_cpdown(flags, name)	((flags) & AuCpdown_##name)
-#define au_fset_cpdown(flags, name) \
-	do { (flags) |= AuCpdown_##name; } while (0)
-#define au_fclr_cpdown(flags, name) \
-	do { (flags) &= ~AuCpdown_##name; } while (0)
-
-static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst,
-			     unsigned int *flags)
-{
-	int err;
-	struct dentry *opq_dentry;
-
-	opq_dentry = au_diropq_create(dentry, bdst);
-	err = PTR_ERR(opq_dentry);
-	if (IS_ERR(opq_dentry))
-		goto out;
-	dput(opq_dentry);
-	au_fset_cpdown(*flags, DIROPQ);
-
-out:
-	return err;
-}
-
-static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent,
-			    struct inode *dir, aufs_bindex_t bdst)
-{
-	int err;
-	struct path h_path;
-	struct au_branch *br;
-
-	br = au_sbr(dentry->d_sb, bdst);
-	h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry))
-		goto out;
-
-	err = 0;
-	if (d_is_positive(h_path.dentry)) {
-		h_path.mnt = au_br_mnt(br);
-		err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path,
-					  dentry);
-	}
-	dput(h_path.dentry);
-
-out:
-	return err;
-}
-
-static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst,
-			 struct au_pin *pin,
-			 struct dentry *h_parent, void *arg)
-{
-	int err, rerr;
-	aufs_bindex_t bopq, bstart;
-	struct path h_path;
-	struct dentry *parent;
-	struct inode *h_dir, *h_inode, *inode, *dir;
-	unsigned int *flags = arg;
-
-	bstart = au_dbstart(dentry);
-	/* dentry is di-locked */
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	h_dir = d_inode(h_parent);
-	AuDebugOn(h_dir != au_h_iptr(dir, bdst));
-	IMustLock(h_dir);
-
-	err = au_lkup_neg(dentry, bdst, /*wh*/0);
-	if (unlikely(err < 0))
-		goto out;
-	h_path.dentry = au_h_dptr(dentry, bdst);
-	h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst);
-	err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path,
-			      S_IRWXU | S_IRUGO | S_IXUGO);
-	if (unlikely(err))
-		goto out_put;
-	au_fset_cpdown(*flags, MADE_DIR);
-
-	bopq = au_dbdiropq(dentry);
-	au_fclr_cpdown(*flags, WHED);
-	au_fclr_cpdown(*flags, DIROPQ);
-	if (au_dbwh(dentry) == bdst)
-		au_fset_cpdown(*flags, WHED);
-	if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst)
-		au_fset_cpdown(*flags, PARENT_OPQ);
-	h_inode = d_inode(h_path.dentry);
-	mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-	if (au_ftest_cpdown(*flags, WHED)) {
-		err = au_cpdown_dir_opq(dentry, bdst, flags);
-		if (unlikely(err)) {
-			mutex_unlock(&h_inode->i_mutex);
-			goto out_dir;
-		}
-	}
-
-	err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart));
-	mutex_unlock(&h_inode->i_mutex);
-	if (unlikely(err))
-		goto out_opq;
-
-	if (au_ftest_cpdown(*flags, WHED)) {
-		err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst);
-		if (unlikely(err))
-			goto out_opq;
-	}
-
-	inode = d_inode(dentry);
-	if (au_ibend(inode) < bdst)
-		au_set_ibend(inode, bdst);
-	au_set_h_iptr(inode, bdst, au_igrab(h_inode),
-		      au_hi_flags(inode, /*isdir*/1));
-	au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0);
-	goto out; /* success */
-
-	/* revert */
-out_opq:
-	if (au_ftest_cpdown(*flags, DIROPQ)) {
-		mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-		rerr = au_diropq_remove(dentry, bdst);
-		mutex_unlock(&h_inode->i_mutex);
-		if (unlikely(rerr)) {
-			AuIOErr("failed removing diropq for %pd b%d (%d)\n",
-				dentry, bdst, rerr);
-			err = -EIO;
-			goto out;
-		}
-	}
-out_dir:
-	if (au_ftest_cpdown(*flags, MADE_DIR)) {
-		rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path);
-		if (unlikely(rerr)) {
-			AuIOErr("failed removing %pd b%d (%d)\n",
-				dentry, bdst, rerr);
-			err = -EIO;
-		}
-	}
-out_put:
-	au_set_h_dptr(dentry, bdst, NULL);
-	if (au_dbend(dentry) == bdst)
-		au_update_dbend(dentry);
-out:
-	dput(parent);
-	return err;
-}
-
-int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst)
-{
-	int err;
-	unsigned int flags;
-
-	flags = 0;
-	err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* policies for create */
-
-int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	int err, i, j, ndentry;
-	aufs_bindex_t bopq;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries, *parent, *d;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	parent = dget_parent(dentry);
-	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0);
-	if (unlikely(err))
-		goto out_free;
-
-	err = bindex;
-	for (i = 0; i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		ndentry = dpage->ndentry;
-		for (j = 0; j < ndentry; j++) {
-			d = dentries[j];
-			di_read_lock_parent2(d, !AuLock_IR);
-			bopq = au_dbdiropq(d);
-			di_read_unlock(d, !AuLock_IR);
-			if (bopq >= 0 && bopq < err)
-				err = bopq;
-		}
-	}
-
-out_free:
-	dput(parent);
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex)
-{
-	for (; bindex >= 0; bindex--)
-		if (!au_br_rdonly(au_sbr(sb, bindex)))
-			return bindex;
-	return -EROFS;
-}
-
-/* top down parent */
-static int au_wbr_create_tdp(struct dentry *dentry,
-			     unsigned int flags __maybe_unused)
-{
-	int err;
-	aufs_bindex_t bstart, bindex;
-	struct super_block *sb;
-	struct dentry *parent, *h_parent;
-
-	sb = dentry->d_sb;
-	bstart = au_dbstart(dentry);
-	err = bstart;
-	if (!au_br_rdonly(au_sbr(sb, bstart)))
-		goto out;
-
-	err = -EROFS;
-	parent = dget_parent(dentry);
-	for (bindex = au_dbstart(parent); bindex < bstart; bindex++) {
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || d_is_negative(h_parent))
-			continue;
-
-		if (!au_br_rdonly(au_sbr(sb, bindex))) {
-			err = bindex;
-			break;
-		}
-	}
-	dput(parent);
-
-	/* bottom up here */
-	if (unlikely(err < 0)) {
-		err = au_wbr_bu(sb, bstart - 1);
-		if (err >= 0)
-			err = au_wbr_nonopq(dentry, err);
-	}
-
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* an exception for the policy other than tdp */
-static int au_wbr_create_exp(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bwh, bdiropq;
-	struct dentry *parent;
-
-	err = -1;
-	bwh = au_dbwh(dentry);
-	parent = dget_parent(dentry);
-	bdiropq = au_dbdiropq(parent);
-	if (bwh >= 0) {
-		if (bdiropq >= 0)
-			err = min(bdiropq, bwh);
-		else
-			err = bwh;
-		AuDbg("%d\n", err);
-	} else if (bdiropq >= 0) {
-		err = bdiropq;
-		AuDbg("%d\n", err);
-	}
-	dput(parent);
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-	if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err)))
-		err = -1;
-
-	AuDbg("%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* round robin */
-static int au_wbr_create_init_rr(struct super_block *sb)
-{
-	int err;
-
-	err = au_wbr_bu(sb, au_sbend(sb));
-	atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */
-	/* smp_mb(); */
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags)
-{
-	int err, nbr;
-	unsigned int u;
-	aufs_bindex_t bindex, bend;
-	struct super_block *sb;
-	atomic_t *next;
-
-	err = au_wbr_create_exp(dentry);
-	if (err >= 0)
-		goto out;
-
-	sb = dentry->d_sb;
-	next = &au_sbi(sb)->si_wbr_rr_next;
-	bend = au_sbend(sb);
-	nbr = bend + 1;
-	for (bindex = 0; bindex <= bend; bindex++) {
-		if (!au_ftest_wbr(flags, DIR)) {
-			err = atomic_dec_return(next) + 1;
-			/* modulo for 0 is meaningless */
-			if (unlikely(!err))
-				err = atomic_dec_return(next) + 1;
-		} else
-			err = atomic_read(next);
-		AuDbg("%d\n", err);
-		u = err;
-		err = u % nbr;
-		AuDbg("%d\n", err);
-		if (!au_br_rdonly(au_sbr(sb, err)))
-			break;
-		err = -EROFS;
-	}
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-out:
-	AuDbg("%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* most free space */
-static void au_mfs(struct dentry *dentry, struct dentry *parent)
-{
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_wbr_mfs *mfs;
-	struct dentry *h_parent;
-	aufs_bindex_t bindex, bend;
-	int err;
-	unsigned long long b, bavail;
-	struct path h_path;
-	/* reduce the stack usage */
-	struct kstatfs *st;
-
-	st = kmalloc(sizeof(*st), GFP_NOFS);
-	if (unlikely(!st)) {
-		AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM);
-		return;
-	}
-
-	bavail = 0;
-	sb = dentry->d_sb;
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	MtxMustLock(&mfs->mfs_lock);
-	mfs->mfs_bindex = -EROFS;
-	mfs->mfsrr_bytes = 0;
-	if (!parent) {
-		bindex = 0;
-		bend = au_sbend(sb);
-	} else {
-		bindex = au_dbstart(parent);
-		bend = au_dbtaildir(parent);
-	}
-
-	for (; bindex <= bend; bindex++) {
-		if (parent) {
-			h_parent = au_h_dptr(parent, bindex);
-			if (!h_parent || d_is_negative(h_parent))
-				continue;
-		}
-		br = au_sbr(sb, bindex);
-		if (au_br_rdonly(br))
-			continue;
-
-		/* sb->s_root for NFS is unreliable */
-		h_path.mnt = au_br_mnt(br);
-		h_path.dentry = h_path.mnt->mnt_root;
-		err = vfs_statfs(&h_path, st);
-		if (unlikely(err)) {
-			AuWarn1("failed statfs, b%d, %d\n", bindex, err);
-			continue;
-		}
-
-		/* when the available size is equal, select the lower one */
-		BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail)
-			     || sizeof(b) < sizeof(st->f_bsize));
-		b = st->f_bavail * st->f_bsize;
-		br->br_wbr->wbr_bytes = b;
-		if (b >= bavail) {
-			bavail = b;
-			mfs->mfs_bindex = bindex;
-			mfs->mfs_jiffy = jiffies;
-		}
-	}
-
-	mfs->mfsrr_bytes = bavail;
-	AuDbg("b%d\n", mfs->mfs_bindex);
-	kfree(st);
-}
-
-static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags)
-{
-	int err;
-	struct dentry *parent;
-	struct super_block *sb;
-	struct au_wbr_mfs *mfs;
-
-	err = au_wbr_create_exp(dentry);
-	if (err >= 0)
-		goto out;
-
-	sb = dentry->d_sb;
-	parent = NULL;
-	if (au_ftest_wbr(flags, PARENT))
-		parent = dget_parent(dentry);
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	mutex_lock(&mfs->mfs_lock);
-	if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire)
-	    || mfs->mfs_bindex < 0
-	    || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex)))
-		au_mfs(dentry, parent);
-	mutex_unlock(&mfs->mfs_lock);
-	err = mfs->mfs_bindex;
-	dput(parent);
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_create_init_mfs(struct super_block *sb)
-{
-	struct au_wbr_mfs *mfs;
-
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	mutex_init(&mfs->mfs_lock);
-	mfs->mfs_jiffy = 0;
-	mfs->mfs_bindex = -EROFS;
-
-	return 0;
-}
-
-static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused)
-{
-	mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock);
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* most free space and then round robin */
-static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags)
-{
-	int err;
-	struct au_wbr_mfs *mfs;
-
-	err = au_wbr_create_mfs(dentry, flags);
-	if (err >= 0) {
-		mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs;
-		mutex_lock(&mfs->mfs_lock);
-		if (mfs->mfsrr_bytes < mfs->mfsrr_watermark)
-			err = au_wbr_create_rr(dentry, flags);
-		mutex_unlock(&mfs->mfs_lock);
-	}
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_create_init_mfsrr(struct super_block *sb)
-{
-	int err;
-
-	au_wbr_create_init_mfs(sb); /* ignore */
-	err = au_wbr_create_init_rr(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* top down parent and most free space */
-static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags)
-{
-	int err, e2;
-	unsigned long long b;
-	aufs_bindex_t bindex, bstart, bend;
-	struct super_block *sb;
-	struct dentry *parent, *h_parent;
-	struct au_branch *br;
-
-	err = au_wbr_create_tdp(dentry, flags);
-	if (unlikely(err < 0))
-		goto out;
-	parent = dget_parent(dentry);
-	bstart = au_dbstart(parent);
-	bend = au_dbtaildir(parent);
-	if (bstart == bend)
-		goto out_parent; /* success */
-
-	e2 = au_wbr_create_mfs(dentry, flags);
-	if (e2 < 0)
-		goto out_parent; /* success */
-
-	/* when the available size is equal, select upper one */
-	sb = dentry->d_sb;
-	br = au_sbr(sb, err);
-	b = br->br_wbr->wbr_bytes;
-	AuDbg("b%d, %llu\n", err, b);
-
-	for (bindex = bstart; bindex <= bend; bindex++) {
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || d_is_negative(h_parent))
-			continue;
-
-		br = au_sbr(sb, bindex);
-		if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) {
-			b = br->br_wbr->wbr_bytes;
-			err = bindex;
-			AuDbg("b%d, %llu\n", err, b);
-		}
-	}
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-out_parent:
-	dput(parent);
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * - top down parent
- * - most free space with parent
- * - most free space round-robin regardless parent
- */
-static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags)
-{
-	int err;
-	unsigned long long watermark;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_wbr_mfs *mfs;
-
-	err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT);
-	if (unlikely(err < 0))
-		goto out;
-
-	sb = dentry->d_sb;
-	br = au_sbr(sb, err);
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	mutex_lock(&mfs->mfs_lock);
-	watermark = mfs->mfsrr_watermark;
-	mutex_unlock(&mfs->mfs_lock);
-	if (br->br_wbr->wbr_bytes < watermark)
-		/* regardless the parent dir */
-		err = au_wbr_create_mfsrr(dentry, flags);
-
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* policies for copyup */
-
-/* top down parent */
-static int au_wbr_copyup_tdp(struct dentry *dentry)
-{
-	return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0);
-}
-
-/* bottom up parent */
-static int au_wbr_copyup_bup(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bindex, bstart;
-	struct dentry *parent, *h_parent;
-	struct super_block *sb;
-
-	err = -EROFS;
-	sb = dentry->d_sb;
-	parent = dget_parent(dentry);
-	bstart = au_dbstart(parent);
-	for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) {
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || d_is_negative(h_parent))
-			continue;
-
-		if (!au_br_rdonly(au_sbr(sb, bindex))) {
-			err = bindex;
-			break;
-		}
-	}
-	dput(parent);
-
-	/* bottom up here */
-	if (unlikely(err < 0))
-		err = au_wbr_bu(sb, bstart - 1);
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* bottom up */
-int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart)
-{
-	int err;
-
-	err = au_wbr_bu(dentry->d_sb, bstart);
-	AuDbg("b%d\n", err);
-	if (err > bstart)
-		err = au_wbr_nonopq(dentry, err);
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_copyup_bu(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bstart;
-
-	bstart = au_dbstart(dentry);
-	err = au_wbr_do_copyup_bu(dentry, bstart);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_wbr_copyup_operations au_wbr_copyup_ops[] = {
-	[AuWbrCopyup_TDP] = {
-		.copyup	= au_wbr_copyup_tdp
-	},
-	[AuWbrCopyup_BUP] = {
-		.copyup	= au_wbr_copyup_bup
-	},
-	[AuWbrCopyup_BU] = {
-		.copyup	= au_wbr_copyup_bu
-	}
-};
-
-struct au_wbr_create_operations au_wbr_create_ops[] = {
-	[AuWbrCreate_TDP] = {
-		.create	= au_wbr_create_tdp
-	},
-	[AuWbrCreate_RR] = {
-		.create	= au_wbr_create_rr,
-		.init	= au_wbr_create_init_rr
-	},
-	[AuWbrCreate_MFS] = {
-		.create	= au_wbr_create_mfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_MFSV] = {
-		.create	= au_wbr_create_mfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_MFSRR] = {
-		.create	= au_wbr_create_mfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_MFSRRV] = {
-		.create	= au_wbr_create_mfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFS] = {
-		.create	= au_wbr_create_pmfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFSV] = {
-		.create	= au_wbr_create_pmfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFSRR] = {
-		.create	= au_wbr_create_pmfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFSRRV] = {
-		.create	= au_wbr_create_pmfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	}
-};
diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c
deleted file mode 100644
index c3fc2d4bd..000000000
--- a/fs/aufs/whout.c
+++ /dev/null
@@ -1,1063 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * whiteout for logical deletion and opaque directory
- */
-
-#include "aufs.h"
-
-#define WH_MASK			S_IRUGO
-
-/*
- * If a directory contains this file, then it is opaque.  We start with the
- * .wh. flag so that it is blocked by lookup.
- */
-static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ,
-					   sizeof(AUFS_WH_DIROPQ) - 1);
-
-/*
- * generate whiteout name, which is NOT terminated by NULL.
- * @name: original d_name.name
- * @len: original d_name.len
- * @wh: whiteout qstr
- * returns zero when succeeds, otherwise error.
- * succeeded value as wh->name should be freed by kfree().
- */
-int au_wh_name_alloc(struct qstr *wh, const struct qstr *name)
-{
-	char *p;
-
-	if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN))
-		return -ENAMETOOLONG;
-
-	wh->len = name->len + AUFS_WH_PFX_LEN;
-	p = kmalloc(wh->len, GFP_NOFS);
-	wh->name = p;
-	if (p) {
-		memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
-		memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len);
-		/* smp_mb(); */
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * test if the @wh_name exists under @h_parent.
- * @try_sio specifies the necessary of super-io.
- */
-int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio)
-{
-	int err;
-	struct dentry *wh_dentry;
-
-	if (!try_sio)
-		wh_dentry = vfsub_lkup_one(wh_name, h_parent);
-	else
-		wh_dentry = au_sio_lkup_one(wh_name, h_parent);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry)) {
-		if (err == -ENAMETOOLONG)
-			err = 0;
-		goto out;
-	}
-
-	err = 0;
-	if (d_is_negative(wh_dentry))
-		goto out_wh; /* success */
-
-	err = 1;
-	if (d_is_reg(wh_dentry))
-		goto out_wh; /* success */
-
-	err = -EIO;
-	AuIOErr("%pd Invalid whiteout entry type 0%o.\n",
-		wh_dentry, d_inode(wh_dentry)->i_mode);
-
-out_wh:
-	dput(wh_dentry);
-out:
-	return err;
-}
-
-/*
- * test if the @h_dentry sets opaque or not.
- */
-int au_diropq_test(struct dentry *h_dentry)
-{
-	int err;
-	struct inode *h_dir;
-
-	h_dir = d_inode(h_dentry);
-	err = au_wh_test(h_dentry, &diropq_name,
-			 au_test_h_perm_sio(h_dir, MAY_EXEC));
-	return err;
-}
-
-/*
- * returns a negative dentry whose name is unique and temporary.
- */
-struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
-			     struct qstr *prefix)
-{
-	struct dentry *dentry;
-	int i;
-	char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1],
-		*name, *p;
-	/* strict atomic_t is unnecessary here */
-	static unsigned short cnt;
-	struct qstr qs;
-
-	BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN);
-
-	name = defname;
-	qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1;
-	if (unlikely(prefix->len > DNAME_INLINE_LEN)) {
-		dentry = ERR_PTR(-ENAMETOOLONG);
-		if (unlikely(qs.len > NAME_MAX))
-			goto out;
-		dentry = ERR_PTR(-ENOMEM);
-		name = kmalloc(qs.len + 1, GFP_NOFS);
-		if (unlikely(!name))
-			goto out;
-	}
-
-	/* doubly whiteout-ed */
-	memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2);
-	p = name + AUFS_WH_PFX_LEN * 2;
-	memcpy(p, prefix->name, prefix->len);
-	p += prefix->len;
-	*p++ = '.';
-	AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN);
-
-	qs.name = name;
-	for (i = 0; i < 3; i++) {
-		sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++);
-		dentry = au_sio_lkup_one(&qs, h_parent);
-		if (IS_ERR(dentry) || d_is_negative(dentry))
-			goto out_name;
-		dput(dentry);
-	}
-	/* pr_warn("could not get random name\n"); */
-	dentry = ERR_PTR(-EEXIST);
-	AuDbg("%.*s\n", AuLNPair(&qs));
-	BUG();
-
-out_name:
-	if (name != defname)
-		kfree(name);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/*
- * rename the @h_dentry on @br to the whiteouted temporary name.
- */
-int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br)
-{
-	int err;
-	struct path h_path = {
-		.mnt = au_br_mnt(br)
-	};
-	struct inode *h_dir, *delegated;
-	struct dentry *h_parent;
-
-	h_parent = h_dentry->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-
-	h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry))
-		goto out;
-
-	/* under the same dir, no need to lock_rename() */
-	delegated = NULL;
-	err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated);
-	AuTraceErr(err);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal rename\n");
-		iput(delegated);
-	}
-	dput(h_path.dentry);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * functions for removing a whiteout
- */
-
-static int do_unlink_wh(struct inode *h_dir, struct path *h_path)
-{
-	int err, force;
-	struct inode *delegated;
-
-	/*
-	 * forces superio when the dir has a sticky bit.
-	 * this may be a violation of unix fs semantics.
-	 */
-	force = (h_dir->i_mode & S_ISVTX)
-		&& !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid);
-	delegated = NULL;
-	err = vfsub_unlink(h_dir, h_path, &delegated, force);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	return err;
-}
-
-int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
-			struct dentry *dentry)
-{
-	int err;
-
-	err = do_unlink_wh(h_dir, h_path);
-	if (!err && dentry)
-		au_set_dbwh(dentry, -1);
-
-	return err;
-}
-
-static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh,
-			  struct au_branch *br)
-{
-	int err;
-	struct path h_path = {
-		.mnt = au_br_mnt(br)
-	};
-
-	err = 0;
-	h_path.dentry = vfsub_lkup_one(wh, h_parent);
-	if (IS_ERR(h_path.dentry))
-		err = PTR_ERR(h_path.dentry);
-	else {
-		if (d_is_reg(h_path.dentry))
-			err = do_unlink_wh(d_inode(h_parent), &h_path);
-		dput(h_path.dentry);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * initialize/clean whiteout for a branch
- */
-
-static void au_wh_clean(struct inode *h_dir, struct path *whpath,
-			const int isdir)
-{
-	int err;
-	struct inode *delegated;
-
-	if (d_is_negative(whpath->dentry))
-		return;
-
-	if (isdir)
-		err = vfsub_rmdir(h_dir, whpath);
-	else {
-		delegated = NULL;
-		err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-	}
-	if (unlikely(err))
-		pr_warn("failed removing %pd (%d), ignored.\n",
-			whpath->dentry, err);
-}
-
-static int test_linkable(struct dentry *h_root)
-{
-	struct inode *h_dir = d_inode(h_root);
-
-	if (h_dir->i_op->link)
-		return 0;
-
-	pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n",
-	       h_root, au_sbtype(h_root->d_sb));
-	return -ENOSYS;
-}
-
-/* todo: should this mkdir be done in /sbin/mount.aufs helper? */
-static int au_whdir(struct inode *h_dir, struct path *path)
-{
-	int err;
-
-	err = -EEXIST;
-	if (d_is_negative(path->dentry)) {
-		int mode = S_IRWXU;
-
-		if (au_test_nfs(path->dentry->d_sb))
-			mode |= S_IXUGO;
-		err = vfsub_mkdir(h_dir, path, mode);
-	} else if (d_is_dir(path->dentry))
-		err = 0;
-	else
-		pr_err("unknown %pd exists\n", path->dentry);
-
-	return err;
-}
-
-struct au_wh_base {
-	const struct qstr *name;
-	struct dentry *dentry;
-};
-
-static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[],
-			  struct path *h_path)
-{
-	h_path->dentry = base[AuBrWh_BASE].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/0);
-	h_path->dentry = base[AuBrWh_PLINK].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/1);
-	h_path->dentry = base[AuBrWh_ORPH].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/1);
-}
-
-/*
- * returns tri-state,
- * minus: error, caller should print the message
- * zero: succuess
- * plus: error, caller should NOT print the message
- */
-static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr,
-				int do_plink, struct au_wh_base base[],
-				struct path *h_path)
-{
-	int err;
-	struct inode *h_dir;
-
-	h_dir = d_inode(h_root);
-	h_path->dentry = base[AuBrWh_BASE].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/0);
-	h_path->dentry = base[AuBrWh_PLINK].dentry;
-	if (do_plink) {
-		err = test_linkable(h_root);
-		if (unlikely(err)) {
-			err = 1;
-			goto out;
-		}
-
-		err = au_whdir(h_dir, h_path);
-		if (unlikely(err))
-			goto out;
-		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
-	} else
-		au_wh_clean(h_dir, h_path, /*isdir*/1);
-	h_path->dentry = base[AuBrWh_ORPH].dentry;
-	err = au_whdir(h_dir, h_path);
-	if (unlikely(err))
-		goto out;
-	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
-
-out:
-	return err;
-}
-
-/*
- * for the moment, aufs supports the branch filesystem which does not support
- * link(2). testing on FAT which does not support i_op->setattr() fully either,
- * copyup failed. finally, such filesystem will not be used as the writable
- * branch.
- *
- * returns tri-state, see above.
- */
-static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr,
-			 int do_plink, struct au_wh_base base[],
-			 struct path *h_path)
-{
-	int err;
-	struct inode *h_dir;
-
-	WbrWhMustWriteLock(wbr);
-
-	err = test_linkable(h_root);
-	if (unlikely(err)) {
-		err = 1;
-		goto out;
-	}
-
-	/*
-	 * todo: should this create be done in /sbin/mount.aufs helper?
-	 */
-	err = -EEXIST;
-	h_dir = d_inode(h_root);
-	if (d_is_negative(base[AuBrWh_BASE].dentry)) {
-		h_path->dentry = base[AuBrWh_BASE].dentry;
-		err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true);
-	} else if (d_is_reg(base[AuBrWh_BASE].dentry))
-		err = 0;
-	else
-		pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry);
-	if (unlikely(err))
-		goto out;
-
-	h_path->dentry = base[AuBrWh_PLINK].dentry;
-	if (do_plink) {
-		err = au_whdir(h_dir, h_path);
-		if (unlikely(err))
-			goto out;
-		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
-	} else
-		au_wh_clean(h_dir, h_path, /*isdir*/1);
-	wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry);
-
-	h_path->dentry = base[AuBrWh_ORPH].dentry;
-	err = au_whdir(h_dir, h_path);
-	if (unlikely(err))
-		goto out;
-	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
-
-out:
-	return err;
-}
-
-/*
- * initialize the whiteout base file/dir for @br.
- */
-int au_wh_init(struct au_branch *br, struct super_block *sb)
-{
-	int err, i;
-	const unsigned char do_plink
-		= !!au_opt_test(au_mntflags(sb), PLINK);
-	struct inode *h_dir;
-	struct path path = br->br_path;
-	struct dentry *h_root = path.dentry;
-	struct au_wbr *wbr = br->br_wbr;
-	static const struct qstr base_name[] = {
-		[AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME,
-					  sizeof(AUFS_BASE_NAME) - 1),
-		[AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME,
-					   sizeof(AUFS_PLINKDIR_NAME) - 1),
-		[AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME,
-					  sizeof(AUFS_ORPHDIR_NAME) - 1)
-	};
-	struct au_wh_base base[] = {
-		[AuBrWh_BASE] = {
-			.name	= base_name + AuBrWh_BASE,
-			.dentry	= NULL
-		},
-		[AuBrWh_PLINK] = {
-			.name	= base_name + AuBrWh_PLINK,
-			.dentry	= NULL
-		},
-		[AuBrWh_ORPH] = {
-			.name	= base_name + AuBrWh_ORPH,
-			.dentry	= NULL
-		}
-	};
-
-	if (wbr)
-		WbrWhMustWriteLock(wbr);
-
-	for (i = 0; i < AuBrWh_Last; i++) {
-		/* doubly whiteouted */
-		struct dentry *d;
-
-		d = au_wh_lkup(h_root, (void *)base[i].name, br);
-		err = PTR_ERR(d);
-		if (IS_ERR(d))
-			goto out;
-
-		base[i].dentry = d;
-		AuDebugOn(wbr
-			  && wbr->wbr_wh[i]
-			  && wbr->wbr_wh[i] != base[i].dentry);
-	}
-
-	if (wbr)
-		for (i = 0; i < AuBrWh_Last; i++) {
-			dput(wbr->wbr_wh[i]);
-			wbr->wbr_wh[i] = NULL;
-		}
-
-	err = 0;
-	if (!au_br_writable(br->br_perm)) {
-		h_dir = d_inode(h_root);
-		au_wh_init_ro(h_dir, base, &path);
-	} else if (!au_br_wh_linkable(br->br_perm)) {
-		err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path);
-		if (err > 0)
-			goto out;
-		else if (err)
-			goto out_err;
-	} else {
-		err = au_wh_init_rw(h_root, wbr, do_plink, base, &path);
-		if (err > 0)
-			goto out;
-		else if (err)
-			goto out_err;
-	}
-	goto out; /* success */
-
-out_err:
-	pr_err("an error(%d) on the writable branch %pd(%s)\n",
-	       err, h_root, au_sbtype(h_root->d_sb));
-out:
-	for (i = 0; i < AuBrWh_Last; i++)
-		dput(base[i].dentry);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * whiteouts are all hard-linked usually.
- * when its link count reaches a ceiling, we create a new whiteout base
- * asynchronously.
- */
-
-struct reinit_br_wh {
-	struct super_block *sb;
-	struct au_branch *br;
-};
-
-static void reinit_br_wh(void *arg)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct path h_path;
-	struct reinit_br_wh *a = arg;
-	struct au_wbr *wbr;
-	struct inode *dir, *delegated;
-	struct dentry *h_root;
-	struct au_hinode *hdir;
-
-	err = 0;
-	wbr = a->br->br_wbr;
-	/* big aufs lock */
-	si_noflush_write_lock(a->sb);
-	if (!au_br_writable(a->br->br_perm))
-		goto out;
-	bindex = au_br_index(a->sb, a->br->br_id);
-	if (unlikely(bindex < 0))
-		goto out;
-
-	di_read_lock_parent(a->sb->s_root, AuLock_IR);
-	dir = d_inode(a->sb->s_root);
-	hdir = au_hi(dir, bindex);
-	h_root = au_h_dptr(a->sb->s_root, bindex);
-	AuDebugOn(h_root != au_br_dentry(a->br));
-
-	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	wbr_wh_write_lock(wbr);
-	err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode,
-			  h_root, a->br);
-	if (!err) {
-		h_path.dentry = wbr->wbr_whbase;
-		h_path.mnt = au_br_mnt(a->br);
-		delegated = NULL;
-		err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated,
-				   /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-	} else {
-		pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase);
-		err = 0;
-	}
-	dput(wbr->wbr_whbase);
-	wbr->wbr_whbase = NULL;
-	if (!err)
-		err = au_wh_init(a->br, a->sb);
-	wbr_wh_write_unlock(wbr);
-	au_hn_imtx_unlock(hdir);
-	di_read_unlock(a->sb->s_root, AuLock_IR);
-	if (!err)
-		au_fhsm_wrote(a->sb, bindex, /*force*/0);
-
-out:
-	if (wbr)
-		atomic_dec(&wbr->wbr_wh_running);
-	atomic_dec(&a->br->br_count);
-	si_write_unlock(a->sb);
-	au_nwt_done(&au_sbi(a->sb)->si_nowait);
-	kfree(arg);
-	if (unlikely(err))
-		AuIOErr("err %d\n", err);
-}
-
-static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br)
-{
-	int do_dec, wkq_err;
-	struct reinit_br_wh *arg;
-
-	do_dec = 1;
-	if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1)
-		goto out;
-
-	/* ignore ENOMEM */
-	arg = kmalloc(sizeof(*arg), GFP_NOFS);
-	if (arg) {
-		/*
-		 * dec(wh_running), kfree(arg) and dec(br_count)
-		 * in reinit function
-		 */
-		arg->sb = sb;
-		arg->br = br;
-		atomic_inc(&br->br_count);
-		wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0);
-		if (unlikely(wkq_err)) {
-			atomic_dec(&br->br_wbr->wbr_wh_running);
-			atomic_dec(&br->br_count);
-			kfree(arg);
-		}
-		do_dec = 0;
-	}
-
-out:
-	if (do_dec)
-		atomic_dec(&br->br_wbr->wbr_wh_running);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create the whiteout @wh.
- */
-static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex,
-			     struct dentry *wh)
-{
-	int err;
-	struct path h_path = {
-		.dentry = wh
-	};
-	struct au_branch *br;
-	struct au_wbr *wbr;
-	struct dentry *h_parent;
-	struct inode *h_dir, *delegated;
-
-	h_parent = wh->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-
-	br = au_sbr(sb, bindex);
-	h_path.mnt = au_br_mnt(br);
-	wbr = br->br_wbr;
-	wbr_wh_read_lock(wbr);
-	if (wbr->wbr_whbase) {
-		delegated = NULL;
-		err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal link\n");
-			iput(delegated);
-		}
-		if (!err || err != -EMLINK)
-			goto out;
-
-		/* link count full. re-initialize br_whbase. */
-		kick_reinit_br_wh(sb, br);
-	}
-
-	/* return this error in this context */
-	err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true);
-	if (!err)
-		au_fhsm_wrote(sb, bindex, /*force*/0);
-
-out:
-	wbr_wh_read_unlock(wbr);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create or remove the diropq.
- */
-static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex,
-				unsigned int flags)
-{
-	struct dentry *opq_dentry, *h_dentry;
-	struct super_block *sb;
-	struct au_branch *br;
-	int err;
-
-	sb = dentry->d_sb;
-	br = au_sbr(sb, bindex);
-	h_dentry = au_h_dptr(dentry, bindex);
-	opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry);
-	if (IS_ERR(opq_dentry))
-		goto out;
-
-	if (au_ftest_diropq(flags, CREATE)) {
-		err = link_or_create_wh(sb, bindex, opq_dentry);
-		if (!err) {
-			au_set_dbdiropq(dentry, bindex);
-			goto out; /* success */
-		}
-	} else {
-		struct path tmp = {
-			.dentry = opq_dentry,
-			.mnt	= au_br_mnt(br)
-		};
-		err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp);
-		if (!err)
-			au_set_dbdiropq(dentry, -1);
-	}
-	dput(opq_dentry);
-	opq_dentry = ERR_PTR(err);
-
-out:
-	return opq_dentry;
-}
-
-struct do_diropq_args {
-	struct dentry **errp;
-	struct dentry *dentry;
-	aufs_bindex_t bindex;
-	unsigned int flags;
-};
-
-static void call_do_diropq(void *args)
-{
-	struct do_diropq_args *a = args;
-	*a->errp = do_diropq(a->dentry, a->bindex, a->flags);
-}
-
-struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
-			     unsigned int flags)
-{
-	struct dentry *diropq, *h_dentry;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE))
-		diropq = do_diropq(dentry, bindex, flags);
-	else {
-		int wkq_err;
-		struct do_diropq_args args = {
-			.errp		= &diropq,
-			.dentry		= dentry,
-			.bindex		= bindex,
-			.flags		= flags
-		};
-
-		wkq_err = au_wkq_wait(call_do_diropq, &args);
-		if (unlikely(wkq_err))
-			diropq = ERR_PTR(wkq_err);
-	}
-
-	return diropq;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * lookup whiteout dentry.
- * @h_parent: lower parent dentry which must exist and be locked
- * @base_name: name of dentry which will be whiteouted
- * returns dentry for whiteout.
- */
-struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
-			  struct au_branch *br)
-{
-	int err;
-	struct qstr wh_name;
-	struct dentry *wh_dentry;
-
-	err = au_wh_name_alloc(&wh_name, base_name);
-	wh_dentry = ERR_PTR(err);
-	if (!err) {
-		wh_dentry = vfsub_lkup_one(&wh_name, h_parent);
-		kfree(wh_name.name);
-	}
-	return wh_dentry;
-}
-
-/*
- * link/create a whiteout for @dentry on @bindex.
- */
-struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
-			    struct dentry *h_parent)
-{
-	struct dentry *wh_dentry;
-	struct super_block *sb;
-	int err;
-
-	sb = dentry->d_sb;
-	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex));
-	if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) {
-		err = link_or_create_wh(sb, bindex, wh_dentry);
-		if (!err) {
-			au_set_dbwh(dentry, bindex);
-			au_fhsm_wrote(sb, bindex, /*force*/0);
-		} else {
-			dput(wh_dentry);
-			wh_dentry = ERR_PTR(err);
-		}
-	}
-
-	return wh_dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* Delete all whiteouts in this directory on branch bindex. */
-static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist,
-			   aufs_bindex_t bindex, struct au_branch *br)
-{
-	int err;
-	unsigned long ul, n;
-	struct qstr wh_name;
-	char *p;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-	struct au_vdir_destr *str;
-
-	err = -ENOMEM;
-	p = (void *)__get_free_page(GFP_NOFS);
-	wh_name.name = p;
-	if (unlikely(!wh_name.name))
-		goto out;
-
-	err = 0;
-	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
-	p += AUFS_WH_PFX_LEN;
-	n = whlist->nh_num;
-	head = whlist->nh_head;
-	for (ul = 0; !err && ul < n; ul++, head++) {
-		hlist_for_each_entry(pos, head, wh_hash) {
-			if (pos->wh_bindex != bindex)
-				continue;
-
-			str = &pos->wh_str;
-			if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) {
-				memcpy(p, str->name, str->len);
-				wh_name.len = AUFS_WH_PFX_LEN + str->len;
-				err = unlink_wh_name(h_dentry, &wh_name, br);
-				if (!err)
-					continue;
-				break;
-			}
-			AuIOErr("whiteout name too long %.*s\n",
-				str->len, str->name);
-			err = -EIO;
-			break;
-		}
-	}
-	free_page((unsigned long)wh_name.name);
-
-out:
-	return err;
-}
-
-struct del_wh_children_args {
-	int *errp;
-	struct dentry *h_dentry;
-	struct au_nhash *whlist;
-	aufs_bindex_t bindex;
-	struct au_branch *br;
-};
-
-static void call_del_wh_children(void *args)
-{
-	struct del_wh_children_args *a = args;
-	*a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp)
-{
-	struct au_whtmp_rmdir *whtmp;
-	int err;
-	unsigned int rdhash;
-
-	SiMustAnyLock(sb);
-
-	whtmp = kmalloc(sizeof(*whtmp), gfp);
-	if (unlikely(!whtmp)) {
-		whtmp = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	whtmp->dir = NULL;
-	whtmp->br = NULL;
-	whtmp->wh_dentry = NULL;
-	/* no estimation for dir size */
-	rdhash = au_sbi(sb)->si_rdhash;
-	if (!rdhash)
-		rdhash = AUFS_RDHASH_DEF;
-	err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp);
-	if (unlikely(err)) {
-		kfree(whtmp);
-		whtmp = ERR_PTR(err);
-	}
-
-out:
-	return whtmp;
-}
-
-void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp)
-{
-	if (whtmp->br)
-		atomic_dec(&whtmp->br->br_count);
-	dput(whtmp->wh_dentry);
-	iput(whtmp->dir);
-	au_nhash_wh_free(&whtmp->whlist);
-	kfree(whtmp);
-}
-
-/*
- * rmdir the whiteouted temporary named dir @h_dentry.
- * @whlist: whiteouted children.
- */
-int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
-		   struct dentry *wh_dentry, struct au_nhash *whlist)
-{
-	int err;
-	unsigned int h_nlink;
-	struct path h_tmp;
-	struct inode *wh_inode, *h_dir;
-	struct au_branch *br;
-
-	h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
-	IMustLock(h_dir);
-
-	br = au_sbr(dir->i_sb, bindex);
-	wh_inode = d_inode(wh_dentry);
-	mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD);
-
-	/*
-	 * someone else might change some whiteouts while we were sleeping.
-	 * it means this whlist may have an obsoleted entry.
-	 */
-	if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE))
-		err = del_wh_children(wh_dentry, whlist, bindex, br);
-	else {
-		int wkq_err;
-		struct del_wh_children_args args = {
-			.errp		= &err,
-			.h_dentry	= wh_dentry,
-			.whlist		= whlist,
-			.bindex		= bindex,
-			.br		= br
-		};
-
-		wkq_err = au_wkq_wait(call_del_wh_children, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-	mutex_unlock(&wh_inode->i_mutex);
-
-	if (!err) {
-		h_tmp.dentry = wh_dentry;
-		h_tmp.mnt = au_br_mnt(br);
-		h_nlink = h_dir->i_nlink;
-		err = vfsub_rmdir(h_dir, &h_tmp);
-		/* some fs doesn't change the parent nlink in some cases */
-		h_nlink -= h_dir->i_nlink;
-	}
-
-	if (!err) {
-		if (au_ibstart(dir) == bindex) {
-			/* todo: dir->i_mutex is necessary */
-			au_cpup_attr_timesizes(dir);
-			if (h_nlink)
-				vfsub_drop_nlink(dir);
-		}
-		return 0; /* success */
-	}
-
-	pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err);
-	return err;
-}
-
-static void call_rmdir_whtmp(void *args)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct au_whtmp_rmdir *a = args;
-	struct super_block *sb;
-	struct dentry *h_parent;
-	struct inode *h_dir;
-	struct au_hinode *hdir;
-
-	/* rmdir by nfsd may cause deadlock with this i_mutex */
-	/* mutex_lock(&a->dir->i_mutex); */
-	err = -EROFS;
-	sb = a->dir->i_sb;
-	si_read_lock(sb, !AuLock_FLUSH);
-	if (!au_br_writable(a->br->br_perm))
-		goto out;
-	bindex = au_br_index(sb, a->br->br_id);
-	if (unlikely(bindex < 0))
-		goto out;
-
-	err = -EIO;
-	ii_write_lock_parent(a->dir);
-	h_parent = dget_parent(a->wh_dentry);
-	h_dir = d_inode(h_parent);
-	hdir = au_hi(a->dir, bindex);
-	err = vfsub_mnt_want_write(au_br_mnt(a->br));
-	if (unlikely(err))
-		goto out_mnt;
-	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent,
-			  a->br);
-	if (!err)
-		err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist);
-	au_hn_imtx_unlock(hdir);
-	vfsub_mnt_drop_write(au_br_mnt(a->br));
-
-out_mnt:
-	dput(h_parent);
-	ii_write_unlock(a->dir);
-out:
-	/* mutex_unlock(&a->dir->i_mutex); */
-	au_whtmp_rmdir_free(a);
-	si_read_unlock(sb);
-	au_nwt_done(&au_sbi(sb)->si_nowait);
-	if (unlikely(err))
-		AuIOErr("err %d\n", err);
-}
-
-void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
-			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args)
-{
-	int wkq_err;
-	struct super_block *sb;
-
-	IMustLock(dir);
-
-	/* all post-process will be done in do_rmdir_whtmp(). */
-	sb = dir->i_sb;
-	args->dir = au_igrab(dir);
-	args->br = au_sbr(sb, bindex);
-	atomic_inc(&args->br->br_count);
-	args->wh_dentry = dget(wh_dentry);
-	wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0);
-	if (unlikely(wkq_err)) {
-		pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err);
-		au_whtmp_rmdir_free(args);
-	}
-}
diff --git a/fs/aufs/whout.h b/fs/aufs/whout.h
deleted file mode 100644
index 81652ce2f..000000000
--- a/fs/aufs/whout.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * whiteout for logical deletion and opaque directory
- */
-
-#ifndef __AUFS_WHOUT_H__
-#define __AUFS_WHOUT_H__
-
-#ifdef __KERNEL__
-
-#include "dir.h"
-
-/* whout.c */
-int au_wh_name_alloc(struct qstr *wh, const struct qstr *name);
-int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio);
-int au_diropq_test(struct dentry *h_dentry);
-struct au_branch;
-struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
-			     struct qstr *prefix);
-int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br);
-int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
-			struct dentry *dentry);
-int au_wh_init(struct au_branch *br, struct super_block *sb);
-
-/* diropq flags */
-#define AuDiropq_CREATE	1
-#define au_ftest_diropq(flags, name)	((flags) & AuDiropq_##name)
-#define au_fset_diropq(flags, name) \
-	do { (flags) |= AuDiropq_##name; } while (0)
-#define au_fclr_diropq(flags, name) \
-	do { (flags) &= ~AuDiropq_##name; } while (0)
-
-struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
-			     unsigned int flags);
-struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
-			  struct au_branch *br);
-struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
-			    struct dentry *h_parent);
-
-/* real rmdir for the whiteout-ed dir */
-struct au_whtmp_rmdir {
-	struct inode *dir;
-	struct au_branch *br;
-	struct dentry *wh_dentry;
-	struct au_nhash whlist;
-};
-
-struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp);
-void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp);
-int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
-		   struct dentry *wh_dentry, struct au_nhash *whlist);
-void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
-			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args);
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct dentry *au_diropq_create(struct dentry *dentry,
-					      aufs_bindex_t bindex)
-{
-	return au_diropq_sio(dentry, bindex, AuDiropq_CREATE);
-}
-
-static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE));
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_WHOUT_H__ */
diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c
deleted file mode 100644
index 3ffedae93..000000000
--- a/fs/aufs/wkq.c
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * workqueue for asynchronous/super-io operations
- * todo: try new dredential scheme
- */
-
-#include <linux/module.h>
-#include "aufs.h"
-
-/* internal workqueue named AUFS_WKQ_NAME */
-
-static struct workqueue_struct *au_wkq;
-
-struct au_wkinfo {
-	struct work_struct wk;
-	struct kobject *kobj;
-
-	unsigned int flags; /* see wkq.h */
-
-	au_wkq_func_t func;
-	void *args;
-
-	struct completion *comp;
-};
-
-/* ---------------------------------------------------------------------- */
-
-static void wkq_func(struct work_struct *wk)
-{
-	struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk);
-
-	AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID));
-	AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY);
-
-	wkinfo->func(wkinfo->args);
-	if (au_ftest_wkq(wkinfo->flags, WAIT))
-		complete(wkinfo->comp);
-	else {
-		kobject_put(wkinfo->kobj);
-		module_put(THIS_MODULE); /* todo: ?? */
-		kfree(wkinfo);
-	}
-}
-
-/*
- * Since struct completion is large, try allocating it dynamically.
- */
-#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */
-#define AuWkqCompDeclare(name)	struct completion *comp = NULL
-
-static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
-{
-	*comp = kmalloc(sizeof(**comp), GFP_NOFS);
-	if (*comp) {
-		init_completion(*comp);
-		wkinfo->comp = *comp;
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-static void au_wkq_comp_free(struct completion *comp)
-{
-	kfree(comp);
-}
-
-#else
-
-/* no braces */
-#define AuWkqCompDeclare(name) \
-	DECLARE_COMPLETION_ONSTACK(_ ## name); \
-	struct completion *comp = &_ ## name
-
-static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
-{
-	wkinfo->comp = *comp;
-	return 0;
-}
-
-static void au_wkq_comp_free(struct completion *comp __maybe_unused)
-{
-	/* empty */
-}
-#endif /* 4KSTACKS */
-
-static void au_wkq_run(struct au_wkinfo *wkinfo)
-{
-	if (au_ftest_wkq(wkinfo->flags, NEST)) {
-		if (au_wkq_test()) {
-			AuWarn1("wkq from wkq, unless silly-rename on NFS,"
-				" due to a dead dir by UDBA?\n");
-			AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT));
-		}
-	} else
-		au_dbg_verify_kthread();
-
-	if (au_ftest_wkq(wkinfo->flags, WAIT)) {
-		INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func);
-		queue_work(au_wkq, &wkinfo->wk);
-	} else {
-		INIT_WORK(&wkinfo->wk, wkq_func);
-		schedule_work(&wkinfo->wk);
-	}
-}
-
-/*
- * Be careful. It is easy to make deadlock happen.
- * processA: lock, wkq and wait
- * processB: wkq and wait, lock in wkq
- * --> deadlock
- */
-int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args)
-{
-	int err;
-	AuWkqCompDeclare(comp);
-	struct au_wkinfo wkinfo = {
-		.flags	= flags,
-		.func	= func,
-		.args	= args
-	};
-
-	err = au_wkq_comp_alloc(&wkinfo, &comp);
-	if (!err) {
-		au_wkq_run(&wkinfo);
-		/* no timeout, no interrupt */
-		wait_for_completion(wkinfo.comp);
-		au_wkq_comp_free(comp);
-		destroy_work_on_stack(&wkinfo.wk);
-	}
-
-	return err;
-
-}
-
-/*
- * Note: dget/dput() in func for aufs dentries are not supported. It will be a
- * problem in a concurrent umounting.
- */
-int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
-		  unsigned int flags)
-{
-	int err;
-	struct au_wkinfo *wkinfo;
-
-	atomic_inc(&au_sbi(sb)->si_nowait.nw_len);
-
-	/*
-	 * wkq_func() must free this wkinfo.
-	 * it highly depends upon the implementation of workqueue.
-	 */
-	err = 0;
-	wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS);
-	if (wkinfo) {
-		wkinfo->kobj = &au_sbi(sb)->si_kobj;
-		wkinfo->flags = flags & ~AuWkq_WAIT;
-		wkinfo->func = func;
-		wkinfo->args = args;
-		wkinfo->comp = NULL;
-		kobject_get(wkinfo->kobj);
-		__module_get(THIS_MODULE); /* todo: ?? */
-
-		au_wkq_run(wkinfo);
-	} else {
-		err = -ENOMEM;
-		au_nwt_done(&au_sbi(sb)->si_nowait);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_nwt_init(struct au_nowait_tasks *nwt)
-{
-	atomic_set(&nwt->nw_len, 0);
-	/* smp_mb(); */ /* atomic_set */
-	init_waitqueue_head(&nwt->nw_wq);
-}
-
-void au_wkq_fin(void)
-{
-	destroy_workqueue(au_wkq);
-}
-
-int __init au_wkq_init(void)
-{
-	int err;
-
-	err = 0;
-	au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE);
-	if (IS_ERR(au_wkq))
-		err = PTR_ERR(au_wkq);
-	else if (!au_wkq)
-		err = -ENOMEM;
-
-	return err;
-}
diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h
deleted file mode 100644
index e57bfb3d9..000000000
--- a/fs/aufs/wkq.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * workqueue for asynchronous/super-io operations
- * todo: try new credentials management scheme
- */
-
-#ifndef __AUFS_WKQ_H__
-#define __AUFS_WKQ_H__
-
-#ifdef __KERNEL__
-
-struct super_block;
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * in the next operation, wait for the 'nowait' tasks in system-wide workqueue
- */
-struct au_nowait_tasks {
-	atomic_t		nw_len;
-	wait_queue_head_t	nw_wq;
-};
-
-/* ---------------------------------------------------------------------- */
-
-typedef void (*au_wkq_func_t)(void *args);
-
-/* wkq flags */
-#define AuWkq_WAIT	1
-#define AuWkq_NEST	(1 << 1)
-#define au_ftest_wkq(flags, name)	((flags) & AuWkq_##name)
-#define au_fset_wkq(flags, name) \
-	do { (flags) |= AuWkq_##name; } while (0)
-#define au_fclr_wkq(flags, name) \
-	do { (flags) &= ~AuWkq_##name; } while (0)
-
-#ifndef CONFIG_AUFS_HNOTIFY
-#undef AuWkq_NEST
-#define AuWkq_NEST	0
-#endif
-
-/* wkq.c */
-int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args);
-int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
-		  unsigned int flags);
-void au_nwt_init(struct au_nowait_tasks *nwt);
-int __init au_wkq_init(void);
-void au_wkq_fin(void);
-
-/* ---------------------------------------------------------------------- */
-
-static inline int au_wkq_test(void)
-{
-	return current->flags & PF_WQ_WORKER;
-}
-
-static inline int au_wkq_wait(au_wkq_func_t func, void *args)
-{
-	return au_wkq_do_wait(AuWkq_WAIT, func, args);
-}
-
-static inline void au_nwt_done(struct au_nowait_tasks *nwt)
-{
-	if (atomic_dec_and_test(&nwt->nw_len))
-		wake_up_all(&nwt->nw_wq);
-}
-
-static inline int au_nwt_flush(struct au_nowait_tasks *nwt)
-{
-	wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len));
-	return 0;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_WKQ_H__ */
diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c
deleted file mode 100644
index b7e13ca3e..000000000
--- a/fs/aufs/xattr.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (C) 2014-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * handling xattr functions
- */
-
-#include <linux/xattr.h>
-#include "aufs.h"
-
-static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags)
-{
-	if (!ignore_flags)
-		goto out;
-	switch (err) {
-	case -ENOMEM:
-	case -EDQUOT:
-		goto out;
-	}
-
-	if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) {
-		err = 0;
-		goto out;
-	}
-
-#define cmp(brattr, prefix) do {					\
-		if (!strncmp(name, XATTR_##prefix##_PREFIX,		\
-			     XATTR_##prefix##_PREFIX_LEN)) {		\
-			if (ignore_flags & AuBrAttr_ICEX_##brattr)	\
-				err = 0;				\
-			goto out;					\
-		}							\
-	} while (0)
-
-	cmp(SEC, SECURITY);
-	cmp(SYS, SYSTEM);
-	cmp(TR, TRUSTED);
-	cmp(USR, USER);
-#undef cmp
-
-	if (ignore_flags & AuBrAttr_ICEX_OTH)
-		err = 0;
-
-out:
-	return err;
-}
-
-static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1;
-
-static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src,
-			    char *name, char **buf, unsigned int ignore_flags,
-			    unsigned int verbose)
-{
-	int err;
-	ssize_t ssz;
-	struct inode *h_idst;
-
-	ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS);
-	err = ssz;
-	if (unlikely(err <= 0)) {
-		if (err == -ENODATA
-		    || (err == -EOPNOTSUPP
-			&& ((ignore_flags & au_xattr_out_of_list)
-			    || (au_test_nfs_noacl(d_inode(h_src))
-				&& (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)
-				    || !strcmp(name,
-					       XATTR_NAME_POSIX_ACL_DEFAULT))))
-			    ))
-			err = 0;
-		if (err && (verbose || au_debug_test()))
-			pr_err("%s, err %d\n", name, err);
-		goto out;
-	}
-
-	/* unlock it temporary */
-	h_idst = d_inode(h_dst);
-	mutex_unlock(&h_idst->i_mutex);
-	err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0);
-	mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2);
-	if (unlikely(err)) {
-		if (verbose || au_debug_test())
-			pr_err("%s, err %d\n", name, err);
-		err = au_xattr_ignore(err, name, ignore_flags);
-	}
-
-out:
-	return err;
-}
-
-int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
-		  unsigned int verbose)
-{
-	int err, unlocked, acl_access, acl_default;
-	ssize_t ssz;
-	struct inode *h_isrc, *h_idst;
-	char *value, *p, *o, *e;
-
-	/* try stopping to update the source inode while we are referencing */
-	/* there should not be the parent-child relationship between them */
-	h_isrc = d_inode(h_src);
-	h_idst = d_inode(h_dst);
-	mutex_unlock(&h_idst->i_mutex);
-	mutex_lock_nested(&h_isrc->i_mutex, AuLsc_I_CHILD);
-	mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2);
-	unlocked = 0;
-
-	/* some filesystems don't list POSIX ACL, for example tmpfs */
-	ssz = vfs_listxattr(h_src, NULL, 0);
-	err = ssz;
-	if (unlikely(err < 0)) {
-		AuTraceErr(err);
-		if (err == -ENODATA
-		    || err == -EOPNOTSUPP)
-			err = 0;	/* ignore */
-		goto out;
-	}
-
-	err = 0;
-	p = NULL;
-	o = NULL;
-	if (ssz) {
-		err = -ENOMEM;
-		p = kmalloc(ssz, GFP_NOFS);
-		o = p;
-		if (unlikely(!p))
-			goto out;
-		err = vfs_listxattr(h_src, p, ssz);
-	}
-	mutex_unlock(&h_isrc->i_mutex);
-	unlocked = 1;
-	AuDbg("err %d, ssz %zd\n", err, ssz);
-	if (unlikely(err < 0))
-		goto out_free;
-
-	err = 0;
-	e = p + ssz;
-	value = NULL;
-	acl_access = 0;
-	acl_default = 0;
-	while (!err && p < e) {
-		acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS,
-				       sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1);
-		acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT,
-					sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)
-					- 1);
-		err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags,
-				       verbose);
-		p += strlen(p) + 1;
-	}
-	AuTraceErr(err);
-	ignore_flags |= au_xattr_out_of_list;
-	if (!err && !acl_access) {
-		err = au_do_cpup_xattr(h_dst, h_src,
-				       XATTR_NAME_POSIX_ACL_ACCESS, &value,
-				       ignore_flags, verbose);
-		AuTraceErr(err);
-	}
-	if (!err && !acl_default) {
-		err = au_do_cpup_xattr(h_dst, h_src,
-				       XATTR_NAME_POSIX_ACL_DEFAULT, &value,
-				       ignore_flags, verbose);
-		AuTraceErr(err);
-	}
-
-	kfree(value);
-
-out_free:
-	kfree(o);
-out:
-	if (!unlocked)
-		mutex_unlock(&h_isrc->i_mutex);
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-enum {
-	AU_XATTR_LIST,
-	AU_XATTR_GET
-};
-
-struct au_lgxattr {
-	int type;
-	union {
-		struct {
-			char	*list;
-			size_t	size;
-		} list;
-		struct {
-			const char	*name;
-			void		*value;
-			size_t		size;
-		} get;
-	} u;
-};
-
-static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg)
-{
-	ssize_t err;
-	struct path h_path;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out;
-	err = au_h_path_getattr(dentry, /*force*/1, &h_path);
-	if (unlikely(err))
-		goto out_si;
-	if (unlikely(!h_path.dentry))
-		/* illegally overlapped or something */
-		goto out_di; /* pretending success */
-
-	/* always topmost entry only */
-	switch (arg->type) {
-	case AU_XATTR_LIST:
-		err = vfs_listxattr(h_path.dentry,
-				    arg->u.list.list, arg->u.list.size);
-		break;
-	case AU_XATTR_GET:
-		err = vfs_getxattr(h_path.dentry,
-				   arg->u.get.name, arg->u.get.value,
-				   arg->u.get.size);
-		break;
-	}
-
-out_di:
-	di_read_unlock(dentry, AuLock_IR);
-out_si:
-	si_read_unlock(sb);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size)
-{
-	struct au_lgxattr arg = {
-		.type = AU_XATTR_LIST,
-		.u.list = {
-			.list	= list,
-			.size	= size
-		},
-	};
-
-	return au_lgxattr(dentry, &arg);
-}
-
-ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value,
-		      size_t size)
-{
-	struct au_lgxattr arg = {
-		.type = AU_XATTR_GET,
-		.u.get = {
-			.name	= name,
-			.value	= value,
-			.size	= size
-		},
-	};
-
-	return au_lgxattr(dentry, &arg);
-}
-
-int aufs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		  size_t size, int flags)
-{
-	struct au_srxattr arg = {
-		.type = AU_XATTR_SET,
-		.u.set = {
-			.name	= name,
-			.value	= value,
-			.size	= size,
-			.flags	= flags
-		},
-	};
-
-	return au_srxattr(dentry, &arg);
-}
-
-int aufs_removexattr(struct dentry *dentry, const char *name)
-{
-	struct au_srxattr arg = {
-		.type = AU_XATTR_REMOVE,
-		.u.remove = {
-			.name	= name
-		},
-	};
-
-	return au_srxattr(dentry, &arg);
-}
-
-/* ---------------------------------------------------------------------- */
-
-#if 0
-static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size,
-			    const char *name, size_t name_len, int type)
-{
-	return aufs_listxattr(dentry, list, list_size);
-}
-
-static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer,
-			size_t size, int type)
-{
-	return aufs_getxattr(dentry, name, buffer, size);
-}
-
-static int au_xattr_set(struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags, int type)
-{
-	return aufs_setxattr(dentry, name, value, size, flags);
-}
-
-static const struct xattr_handler au_xattr_handler = {
-	/* no prefix, no flags */
-	.list	= au_xattr_list,
-	.get	= au_xattr_get,
-	.set	= au_xattr_set
-	/* why no remove? */
-};
-
-static const struct xattr_handler *au_xattr_handlers[] = {
-	&au_xattr_handler
-};
-
-void au_xattr_init(struct super_block *sb)
-{
-	/* sb->s_xattr = au_xattr_handlers; */
-}
-#endif
diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c
deleted file mode 100644
index 053594410..000000000
--- a/fs/aufs/xino.c
+++ /dev/null
@@ -1,1297 +0,0 @@
-/*
- * Copyright (C) 2005-2015 Junjiro R. Okajima
- *
- * This program, aufs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * external inode number translation table and bitmap
- */
-
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include "aufs.h"
-
-/* todo: unnecessary to support mmap_sem since kernel-space? */
-ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size,
-		   loff_t *pos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	do {
-		/* todo: signal_pending? */
-		err = func(file, buf.u, size, pos);
-	} while (err == -EAGAIN || err == -EINTR);
-	set_fs(oldfs);
-
-#if 0 /* reserved for future use */
-	if (err > 0)
-		fsnotify_access(file->f_path.dentry);
-#endif
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf,
-			      size_t size, loff_t *pos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		const char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	do {
-		/* todo: signal_pending? */
-		err = func(file, buf.u, size, pos);
-	} while (err == -EAGAIN || err == -EINTR);
-	set_fs(oldfs);
-
-#if 0 /* reserved for future use */
-	if (err > 0)
-		fsnotify_modify(file->f_path.dentry);
-#endif
-
-	return err;
-}
-
-struct do_xino_fwrite_args {
-	ssize_t *errp;
-	vfs_writef_t func;
-	struct file *file;
-	void *buf;
-	size_t size;
-	loff_t *pos;
-};
-
-static void call_do_xino_fwrite(void *args)
-{
-	struct do_xino_fwrite_args *a = args;
-	*a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos);
-}
-
-ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
-		    size_t size, loff_t *pos)
-{
-	ssize_t err;
-
-	/* todo: signal block and no wkq? */
-	if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) {
-		lockdep_off();
-		err = do_xino_fwrite(func, file, buf, size, pos);
-		lockdep_on();
-	} else {
-		/*
-		 * it breaks RLIMIT_FSIZE and normal user's limit,
-		 * users should care about quota and real 'filesystem full.'
-		 */
-		int wkq_err;
-		struct do_xino_fwrite_args args = {
-			.errp	= &err,
-			.func	= func,
-			.file	= file,
-			.buf	= buf,
-			.size	= size,
-			.pos	= pos
-		};
-
-		wkq_err = au_wkq_wait(call_do_xino_fwrite, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create a new xinofile at the same place/path as @base_file.
- */
-struct file *au_xino_create2(struct file *base_file, struct file *copy_src)
-{
-	struct file *file;
-	struct dentry *base, *parent;
-	struct inode *dir, *delegated;
-	struct qstr *name;
-	struct path path;
-	int err;
-
-	base = base_file->f_path.dentry;
-	parent = base->d_parent; /* dir inode is locked */
-	dir = d_inode(parent);
-	IMustLock(dir);
-
-	file = ERR_PTR(-EINVAL);
-	name = &base->d_name;
-	path.dentry = vfsub_lookup_one_len(name->name, parent, name->len);
-	if (IS_ERR(path.dentry)) {
-		file = (void *)path.dentry;
-		pr_err("%pd lookup err %ld\n",
-		       base, PTR_ERR(path.dentry));
-		goto out;
-	}
-
-	/* no need to mnt_want_write() since we call dentry_open() later */
-	err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL);
-	if (unlikely(err)) {
-		file = ERR_PTR(err);
-		pr_err("%pd create err %d\n", base, err);
-		goto out_dput;
-	}
-
-	path.mnt = base_file->f_path.mnt;
-	file = vfsub_dentry_open(&path,
-				 O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
-				 /* | __FMODE_NONOTIFY */);
-	if (IS_ERR(file)) {
-		pr_err("%pd open err %ld\n", base, PTR_ERR(file));
-		goto out_dput;
-	}
-
-	delegated = NULL;
-	err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	if (unlikely(err)) {
-		pr_err("%pd unlink err %d\n", base, err);
-		goto out_fput;
-	}
-
-	if (copy_src) {
-		/* no one can touch copy_src xino */
-		err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src));
-		if (unlikely(err)) {
-			pr_err("%pd copy err %d\n", base, err);
-			goto out_fput;
-		}
-	}
-	goto out_dput; /* success */
-
-out_fput:
-	fput(file);
-	file = ERR_PTR(err);
-out_dput:
-	dput(path.dentry);
-out:
-	return file;
-}
-
-struct au_xino_lock_dir {
-	struct au_hinode *hdir;
-	struct dentry *parent;
-	struct mutex *mtx;
-};
-
-static void au_xino_lock_dir(struct super_block *sb, struct file *xino,
-			     struct au_xino_lock_dir *ldir)
-{
-	aufs_bindex_t brid, bindex;
-
-	ldir->hdir = NULL;
-	bindex = -1;
-	brid = au_xino_brid(sb);
-	if (brid >= 0)
-		bindex = au_br_index(sb, brid);
-	if (bindex >= 0) {
-		ldir->hdir = au_hi(d_inode(sb->s_root), bindex);
-		au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT);
-	} else {
-		ldir->parent = dget_parent(xino->f_path.dentry);
-		ldir->mtx = &d_inode(ldir->parent)->i_mutex;
-		mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT);
-	}
-}
-
-static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir)
-{
-	if (ldir->hdir)
-		au_hn_imtx_unlock(ldir->hdir);
-	else {
-		mutex_unlock(ldir->mtx);
-		dput(ldir->parent);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* trucate xino files asynchronously */
-
-int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex)
-{
-	int err;
-	unsigned long jiffy;
-	blkcnt_t blocks;
-	aufs_bindex_t bi, bend;
-	struct kstatfs *st;
-	struct au_branch *br;
-	struct file *new_xino, *file;
-	struct super_block *h_sb;
-	struct au_xino_lock_dir ldir;
-
-	err = -ENOMEM;
-	st = kzalloc(sizeof(*st), GFP_NOFS);
-	if (unlikely(!st))
-		goto out;
-
-	err = -EINVAL;
-	bend = au_sbend(sb);
-	if (unlikely(bindex < 0 || bend < bindex))
-		goto out_st;
-	br = au_sbr(sb, bindex);
-	file = br->br_xino.xi_file;
-	if (!file)
-		goto out_st;
-
-	err = vfs_statfs(&file->f_path, st);
-	if (unlikely(err))
-		AuErr1("statfs err %d, ignored\n", err);
-	jiffy = jiffies;
-	blocks = file_inode(file)->i_blocks;
-	pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
-		bindex, (u64)blocks, st->f_bfree, st->f_blocks);
-
-	au_xino_lock_dir(sb, file, &ldir);
-	/* mnt_want_write() is unnecessary here */
-	new_xino = au_xino_create2(file, file);
-	au_xino_unlock_dir(&ldir);
-	err = PTR_ERR(new_xino);
-	if (IS_ERR(new_xino)) {
-		pr_err("err %d, ignored\n", err);
-		goto out_st;
-	}
-	err = 0;
-	fput(file);
-	br->br_xino.xi_file = new_xino;
-
-	h_sb = au_br_sb(br);
-	for (bi = 0; bi <= bend; bi++) {
-		if (unlikely(bi == bindex))
-			continue;
-		br = au_sbr(sb, bi);
-		if (au_br_sb(br) != h_sb)
-			continue;
-
-		fput(br->br_xino.xi_file);
-		br->br_xino.xi_file = new_xino;
-		get_file(new_xino);
-	}
-
-	err = vfs_statfs(&new_xino->f_path, st);
-	if (!err) {
-		pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
-			bindex, (u64)file_inode(new_xino)->i_blocks,
-			st->f_bfree, st->f_blocks);
-		if (file_inode(new_xino)->i_blocks < blocks)
-			au_sbi(sb)->si_xino_jiffy = jiffy;
-	} else
-		AuErr1("statfs err %d, ignored\n", err);
-
-out_st:
-	kfree(st);
-out:
-	return err;
-}
-
-struct xino_do_trunc_args {
-	struct super_block *sb;
-	struct au_branch *br;
-};
-
-static void xino_do_trunc(void *_args)
-{
-	struct xino_do_trunc_args *args = _args;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct inode *dir;
-	int err;
-	aufs_bindex_t bindex;
-
-	err = 0;
-	sb = args->sb;
-	dir = d_inode(sb->s_root);
-	br = args->br;
-
-	si_noflush_write_lock(sb);
-	ii_read_lock_parent(dir);
-	bindex = au_br_index(sb, br->br_id);
-	err = au_xino_trunc(sb, bindex);
-	ii_read_unlock(dir);
-	if (unlikely(err))
-		pr_warn("err b%d, (%d)\n", bindex, err);
-	atomic_dec(&br->br_xino_running);
-	atomic_dec(&br->br_count);
-	si_write_unlock(sb);
-	au_nwt_done(&au_sbi(sb)->si_nowait);
-	kfree(args);
-}
-
-static int xino_trunc_test(struct super_block *sb, struct au_branch *br)
-{
-	int err;
-	struct kstatfs st;
-	struct au_sbinfo *sbinfo;
-
-	/* todo: si_xino_expire and the ratio should be customizable */
-	sbinfo = au_sbi(sb);
-	if (time_before(jiffies,
-			sbinfo->si_xino_jiffy + sbinfo->si_xino_expire))
-		return 0;
-
-	/* truncation border */
-	err = vfs_statfs(&br->br_xino.xi_file->f_path, &st);
-	if (unlikely(err)) {
-		AuErr1("statfs err %d, ignored\n", err);
-		return 0;
-	}
-	if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC)
-		return 0;
-
-	return 1;
-}
-
-static void xino_try_trunc(struct super_block *sb, struct au_branch *br)
-{
-	struct xino_do_trunc_args *args;
-	int wkq_err;
-
-	if (!xino_trunc_test(sb, br))
-		return;
-
-	if (atomic_inc_return(&br->br_xino_running) > 1)
-		goto out;
-
-	/* lock and kfree() will be called in trunc_xino() */
-	args = kmalloc(sizeof(*args), GFP_NOFS);
-	if (unlikely(!args)) {
-		AuErr1("no memory\n");
-		goto out_args;
-	}
-
-	atomic_inc(&br->br_count);
-	args->sb = sb;
-	args->br = br;
-	wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0);
-	if (!wkq_err)
-		return; /* success */
-
-	pr_err("wkq %d\n", wkq_err);
-	atomic_dec(&br->br_count);
-
-out_args:
-	kfree(args);
-out:
-	atomic_dec(&br->br_xino_running);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_xino_do_write(vfs_writef_t write, struct file *file,
-			    ino_t h_ino, ino_t ino)
-{
-	loff_t pos;
-	ssize_t sz;
-
-	pos = h_ino;
-	if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) {
-		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
-		return -EFBIG;
-	}
-	pos *= sizeof(ino);
-	sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos);
-	if (sz == sizeof(ino))
-		return 0; /* success */
-
-	AuIOErr("write failed (%zd)\n", sz);
-	return -EIO;
-}
-
-/*
- * write @ino to the xinofile for the specified branch{@sb, @bindex}
- * at the position of @h_ino.
- * even if @ino is zero, it is written to the xinofile and means no entry.
- * if the size of the xino file on a specific filesystem exceeds the watermark,
- * try truncating it.
- */
-int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		  ino_t ino)
-{
-	int err;
-	unsigned int mnt_flags;
-	struct au_branch *br;
-
-	BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max)
-		     || ((loff_t)-1) > 0);
-	SiMustAnyLock(sb);
-
-	mnt_flags = au_mntflags(sb);
-	if (!au_opt_test(mnt_flags, XINO))
-		return 0;
-
-	br = au_sbr(sb, bindex);
-	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
-			       h_ino, ino);
-	if (!err) {
-		if (au_opt_test(mnt_flags, TRUNC_XINO)
-		    && au_test_fs_trunc_xino(au_br_sb(br)))
-			xino_try_trunc(sb, br);
-		return 0; /* success */
-	}
-
-	AuIOErr("write failed (%d)\n", err);
-	return -EIO;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* aufs inode number bitmap */
-
-static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE;
-static ino_t xib_calc_ino(unsigned long pindex, int bit)
-{
-	ino_t ino;
-
-	AuDebugOn(bit < 0 || page_bits <= bit);
-	ino = AUFS_FIRST_INO + pindex * page_bits + bit;
-	return ino;
-}
-
-static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit)
-{
-	AuDebugOn(ino < AUFS_FIRST_INO);
-	ino -= AUFS_FIRST_INO;
-	*pindex = ino / page_bits;
-	*bit = ino % page_bits;
-}
-
-static int xib_pindex(struct super_block *sb, unsigned long pindex)
-{
-	int err;
-	loff_t pos;
-	ssize_t sz;
-	struct au_sbinfo *sbinfo;
-	struct file *xib;
-	unsigned long *p;
-
-	sbinfo = au_sbi(sb);
-	MtxMustLock(&sbinfo->si_xib_mtx);
-	AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE
-		  || !au_opt_test(sbinfo->si_mntflags, XINO));
-
-	if (pindex == sbinfo->si_xib_last_pindex)
-		return 0;
-
-	xib = sbinfo->si_xib;
-	p = sbinfo->si_xib_buf;
-	pos = sbinfo->si_xib_last_pindex;
-	pos *= PAGE_SIZE;
-	sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
-	if (unlikely(sz != PAGE_SIZE))
-		goto out;
-
-	pos = pindex;
-	pos *= PAGE_SIZE;
-	if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE)
-		sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos);
-	else {
-		memset(p, 0, PAGE_SIZE);
-		sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
-	}
-	if (sz == PAGE_SIZE) {
-		sbinfo->si_xib_last_pindex = pindex;
-		return 0; /* success */
-	}
-
-out:
-	AuIOErr1("write failed (%zd)\n", sz);
-	err = sz;
-	if (sz >= 0)
-		err = -EIO;
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_xib_clear_bit(struct inode *inode)
-{
-	int err, bit;
-	unsigned long pindex;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(inode->i_nlink);
-
-	sb = inode->i_sb;
-	xib_calc_bit(inode->i_ino, &pindex, &bit);
-	AuDebugOn(page_bits <= bit);
-	sbinfo = au_sbi(sb);
-	mutex_lock(&sbinfo->si_xib_mtx);
-	err = xib_pindex(sb, pindex);
-	if (!err) {
-		clear_bit(bit, sbinfo->si_xib_buf);
-		sbinfo->si_xib_next_bit = bit;
-	}
-	mutex_unlock(&sbinfo->si_xib_mtx);
-}
-
-/* for s_op->delete_inode() */
-void au_xino_delete_inode(struct inode *inode, const int unlinked)
-{
-	int err;
-	unsigned int mnt_flags;
-	aufs_bindex_t bindex, bend, bi;
-	unsigned char try_trunc;
-	struct au_iinfo *iinfo;
-	struct super_block *sb;
-	struct au_hinode *hi;
-	struct inode *h_inode;
-	struct au_branch *br;
-	vfs_writef_t xwrite;
-
-	sb = inode->i_sb;
-	mnt_flags = au_mntflags(sb);
-	if (!au_opt_test(mnt_flags, XINO)
-	    || inode->i_ino == AUFS_ROOT_INO)
-		return;
-
-	if (unlinked) {
-		au_xigen_inc(inode);
-		au_xib_clear_bit(inode);
-	}
-
-	iinfo = au_ii(inode);
-	if (!iinfo)
-		return;
-
-	bindex = iinfo->ii_bstart;
-	if (bindex < 0)
-		return;
-
-	xwrite = au_sbi(sb)->si_xwrite;
-	try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO);
-	hi = iinfo->ii_hinode + bindex;
-	bend = iinfo->ii_bend;
-	for (; bindex <= bend; bindex++, hi++) {
-		h_inode = hi->hi_inode;
-		if (!h_inode
-		    || (!unlinked && h_inode->i_nlink))
-			continue;
-
-		/* inode may not be revalidated */
-		bi = au_br_index(sb, hi->hi_id);
-		if (bi < 0)
-			continue;
-
-		br = au_sbr(sb, bi);
-		err = au_xino_do_write(xwrite, br->br_xino.xi_file,
-				       h_inode->i_ino, /*ino*/0);
-		if (!err && try_trunc
-		    && au_test_fs_trunc_xino(au_br_sb(br)))
-			xino_try_trunc(sb, br);
-	}
-}
-
-/* get an unused inode number from bitmap */
-ino_t au_xino_new_ino(struct super_block *sb)
-{
-	ino_t ino;
-	unsigned long *p, pindex, ul, pend;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-	int free_bit, err;
-
-	if (!au_opt_test(au_mntflags(sb), XINO))
-		return iunique(sb, AUFS_FIRST_INO);
-
-	sbinfo = au_sbi(sb);
-	mutex_lock(&sbinfo->si_xib_mtx);
-	p = sbinfo->si_xib_buf;
-	free_bit = sbinfo->si_xib_next_bit;
-	if (free_bit < page_bits && !test_bit(free_bit, p))
-		goto out; /* success */
-	free_bit = find_first_zero_bit(p, page_bits);
-	if (free_bit < page_bits)
-		goto out; /* success */
-
-	pindex = sbinfo->si_xib_last_pindex;
-	for (ul = pindex - 1; ul < ULONG_MAX; ul--) {
-		err = xib_pindex(sb, ul);
-		if (unlikely(err))
-			goto out_err;
-		free_bit = find_first_zero_bit(p, page_bits);
-		if (free_bit < page_bits)
-			goto out; /* success */
-	}
-
-	file = sbinfo->si_xib;
-	pend = vfsub_f_size_read(file) / PAGE_SIZE;
-	for (ul = pindex + 1; ul <= pend; ul++) {
-		err = xib_pindex(sb, ul);
-		if (unlikely(err))
-			goto out_err;
-		free_bit = find_first_zero_bit(p, page_bits);
-		if (free_bit < page_bits)
-			goto out; /* success */
-	}
-	BUG();
-
-out:
-	set_bit(free_bit, p);
-	sbinfo->si_xib_next_bit = free_bit + 1;
-	pindex = sbinfo->si_xib_last_pindex;
-	mutex_unlock(&sbinfo->si_xib_mtx);
-	ino = xib_calc_ino(pindex, free_bit);
-	AuDbg("i%lu\n", (unsigned long)ino);
-	return ino;
-out_err:
-	mutex_unlock(&sbinfo->si_xib_mtx);
-	AuDbg("i0\n");
-	return 0;
-}
-
-/*
- * read @ino from xinofile for the specified branch{@sb, @bindex}
- * at the position of @h_ino.
- * if @ino does not exist and @do_new is true, get new one.
- */
-int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		 ino_t *ino)
-{
-	int err;
-	ssize_t sz;
-	loff_t pos;
-	struct file *file;
-	struct au_sbinfo *sbinfo;
-
-	*ino = 0;
-	if (!au_opt_test(au_mntflags(sb), XINO))
-		return 0; /* no xino */
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	pos = h_ino;
-	if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) {
-		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
-		return -EFBIG;
-	}
-	pos *= sizeof(*ino);
-
-	file = au_sbr(sb, bindex)->br_xino.xi_file;
-	if (vfsub_f_size_read(file) < pos + sizeof(*ino))
-		return 0; /* no ino */
-
-	sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos);
-	if (sz == sizeof(*ino))
-		return 0; /* success */
-
-	err = sz;
-	if (unlikely(sz >= 0)) {
-		err = -EIO;
-		AuIOErr("xino read error (%zd)\n", sz);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* create and set a new xino file */
-
-struct file *au_xino_create(struct super_block *sb, char *fname, int silent)
-{
-	struct file *file;
-	struct dentry *h_parent, *d;
-	struct inode *h_dir, *inode;
-	int err;
-
-	/*
-	 * at mount-time, and the xino file is the default path,
-	 * hnotify is disabled so we have no notify events to ignore.
-	 * when a user specified the xino, we cannot get au_hdir to be ignored.
-	 */
-	file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
-			       /* | __FMODE_NONOTIFY */,
-			       S_IRUGO | S_IWUGO);
-	if (IS_ERR(file)) {
-		if (!silent)
-			pr_err("open %s(%ld)\n", fname, PTR_ERR(file));
-		return file;
-	}
-
-	/* keep file count */
-	err = 0;
-	inode = file_inode(file);
-	h_parent = dget_parent(file->f_path.dentry);
-	h_dir = d_inode(h_parent);
-	mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
-	/* mnt_want_write() is unnecessary here */
-	/* no delegation since it is just created */
-	if (inode->i_nlink)
-		err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL,
-				   /*force*/0);
-	mutex_unlock(&h_dir->i_mutex);
-	dput(h_parent);
-	if (unlikely(err)) {
-		if (!silent)
-			pr_err("unlink %s(%d)\n", fname, err);
-		goto out;
-	}
-
-	err = -EINVAL;
-	d = file->f_path.dentry;
-	if (unlikely(sb == d->d_sb)) {
-		if (!silent)
-			pr_err("%s must be outside\n", fname);
-		goto out;
-	}
-	if (unlikely(au_test_fs_bad_xino(d->d_sb))) {
-		if (!silent)
-			pr_err("xino doesn't support %s(%s)\n",
-			       fname, au_sbtype(d->d_sb));
-		goto out;
-	}
-	return file; /* success */
-
-out:
-	fput(file);
-	file = ERR_PTR(err);
-	return file;
-}
-
-/*
- * find another branch who is on the same filesystem of the specified
- * branch{@btgt}. search until @bend.
- */
-static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt,
-			aufs_bindex_t bend)
-{
-	aufs_bindex_t bindex;
-	struct super_block *tgt_sb = au_sbr_sb(sb, btgt);
-
-	for (bindex = 0; bindex < btgt; bindex++)
-		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
-			return bindex;
-	for (bindex++; bindex <= bend; bindex++)
-		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
-			return bindex;
-	return -1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * initialize the xinofile for the specified branch @br
- * at the place/path where @base_file indicates.
- * test whether another branch is on the same filesystem or not,
- * if @do_test is true.
- */
-int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino,
-	       struct file *base_file, int do_test)
-{
-	int err;
-	ino_t ino;
-	aufs_bindex_t bend, bindex;
-	struct au_branch *shared_br, *b;
-	struct file *file;
-	struct super_block *tgt_sb;
-
-	shared_br = NULL;
-	bend = au_sbend(sb);
-	if (do_test) {
-		tgt_sb = au_br_sb(br);
-		for (bindex = 0; bindex <= bend; bindex++) {
-			b = au_sbr(sb, bindex);
-			if (tgt_sb == au_br_sb(b)) {
-				shared_br = b;
-				break;
-			}
-		}
-	}
-
-	if (!shared_br || !shared_br->br_xino.xi_file) {
-		struct au_xino_lock_dir ldir;
-
-		au_xino_lock_dir(sb, base_file, &ldir);
-		/* mnt_want_write() is unnecessary here */
-		file = au_xino_create2(base_file, NULL);
-		au_xino_unlock_dir(&ldir);
-		err = PTR_ERR(file);
-		if (IS_ERR(file))
-			goto out;
-		br->br_xino.xi_file = file;
-	} else {
-		br->br_xino.xi_file = shared_br->br_xino.xi_file;
-		get_file(br->br_xino.xi_file);
-	}
-
-	ino = AUFS_ROOT_INO;
-	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
-			       h_ino, ino);
-	if (unlikely(err)) {
-		fput(br->br_xino.xi_file);
-		br->br_xino.xi_file = NULL;
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* trucate a xino bitmap file */
-
-/* todo: slow */
-static int do_xib_restore(struct super_block *sb, struct file *file, void *page)
-{
-	int err, bit;
-	ssize_t sz;
-	unsigned long pindex;
-	loff_t pos, pend;
-	struct au_sbinfo *sbinfo;
-	vfs_readf_t func;
-	ino_t *ino;
-	unsigned long *p;
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	MtxMustLock(&sbinfo->si_xib_mtx);
-	p = sbinfo->si_xib_buf;
-	func = sbinfo->si_xread;
-	pend = vfsub_f_size_read(file);
-	pos = 0;
-	while (pos < pend) {
-		sz = xino_fread(func, file, page, PAGE_SIZE, &pos);
-		err = sz;
-		if (unlikely(sz <= 0))
-			goto out;
-
-		err = 0;
-		for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) {
-			if (unlikely(*ino < AUFS_FIRST_INO))
-				continue;
-
-			xib_calc_bit(*ino, &pindex, &bit);
-			AuDebugOn(page_bits <= bit);
-			err = xib_pindex(sb, pindex);
-			if (!err)
-				set_bit(bit, p);
-			else
-				goto out;
-		}
-	}
-
-out:
-	return err;
-}
-
-static int xib_restore(struct super_block *sb)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	void *page;
-
-	err = -ENOMEM;
-	page = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!page))
-		goto out;
-
-	err = 0;
-	bend = au_sbend(sb);
-	for (bindex = 0; !err && bindex <= bend; bindex++)
-		if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0)
-			err = do_xib_restore
-				(sb, au_sbr(sb, bindex)->br_xino.xi_file, page);
-		else
-			AuDbg("b%d\n", bindex);
-	free_page((unsigned long)page);
-
-out:
-	return err;
-}
-
-int au_xib_trunc(struct super_block *sb)
-{
-	int err;
-	ssize_t sz;
-	loff_t pos;
-	struct au_xino_lock_dir ldir;
-	struct au_sbinfo *sbinfo;
-	unsigned long *p;
-	struct file *file;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	if (!au_opt_test(sbinfo->si_mntflags, XINO))
-		goto out;
-
-	file = sbinfo->si_xib;
-	if (vfsub_f_size_read(file) <= PAGE_SIZE)
-		goto out;
-
-	au_xino_lock_dir(sb, file, &ldir);
-	/* mnt_want_write() is unnecessary here */
-	file = au_xino_create2(sbinfo->si_xib, NULL);
-	au_xino_unlock_dir(&ldir);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-	fput(sbinfo->si_xib);
-	sbinfo->si_xib = file;
-
-	p = sbinfo->si_xib_buf;
-	memset(p, 0, PAGE_SIZE);
-	pos = 0;
-	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos);
-	if (unlikely(sz != PAGE_SIZE)) {
-		err = sz;
-		AuIOErr("err %d\n", err);
-		if (sz >= 0)
-			err = -EIO;
-		goto out;
-	}
-
-	mutex_lock(&sbinfo->si_xib_mtx);
-	/* mnt_want_write() is unnecessary here */
-	err = xib_restore(sb);
-	mutex_unlock(&sbinfo->si_xib_mtx);
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * xino mount option handlers
- */
-
-/* xino bitmap */
-static void xino_clear_xib(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	sbinfo->si_xread = NULL;
-	sbinfo->si_xwrite = NULL;
-	if (sbinfo->si_xib)
-		fput(sbinfo->si_xib);
-	sbinfo->si_xib = NULL;
-	free_page((unsigned long)sbinfo->si_xib_buf);
-	sbinfo->si_xib_buf = NULL;
-}
-
-static int au_xino_set_xib(struct super_block *sb, struct file *base)
-{
-	int err;
-	loff_t pos;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	file = au_xino_create2(base, sbinfo->si_xib);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-	if (sbinfo->si_xib)
-		fput(sbinfo->si_xib);
-	sbinfo->si_xib = file;
-	sbinfo->si_xread = vfs_readf(file);
-	sbinfo->si_xwrite = vfs_writef(file);
-
-	err = -ENOMEM;
-	if (!sbinfo->si_xib_buf)
-		sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS);
-	if (unlikely(!sbinfo->si_xib_buf))
-		goto out_unset;
-
-	sbinfo->si_xib_last_pindex = 0;
-	sbinfo->si_xib_next_bit = 0;
-	if (vfsub_f_size_read(file) < PAGE_SIZE) {
-		pos = 0;
-		err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf,
-				  PAGE_SIZE, &pos);
-		if (unlikely(err != PAGE_SIZE))
-			goto out_free;
-	}
-	err = 0;
-	goto out; /* success */
-
-out_free:
-	free_page((unsigned long)sbinfo->si_xib_buf);
-	sbinfo->si_xib_buf = NULL;
-	if (err >= 0)
-		err = -EIO;
-out_unset:
-	fput(sbinfo->si_xib);
-	sbinfo->si_xib = NULL;
-	sbinfo->si_xread = NULL;
-	sbinfo->si_xwrite = NULL;
-out:
-	return err;
-}
-
-/* xino for each branch */
-static void xino_clear_br(struct super_block *sb)
-{
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (!br || !br->br_xino.xi_file)
-			continue;
-
-		fput(br->br_xino.xi_file);
-		br->br_xino.xi_file = NULL;
-	}
-}
-
-static int au_xino_set_br(struct super_block *sb, struct file *base)
-{
-	int err;
-	ino_t ino;
-	aufs_bindex_t bindex, bend, bshared;
-	struct {
-		struct file *old, *new;
-	} *fpair, *p;
-	struct au_branch *br;
-	struct inode *inode;
-	vfs_writef_t writef;
-
-	SiMustWriteLock(sb);
-
-	err = -ENOMEM;
-	bend = au_sbend(sb);
-	fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS);
-	if (unlikely(!fpair))
-		goto out;
-
-	inode = d_inode(sb->s_root);
-	ino = AUFS_ROOT_INO;
-	writef = au_sbi(sb)->si_xwrite;
-	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
-		br = au_sbr(sb, bindex);
-		bshared = is_sb_shared(sb, bindex, bindex - 1);
-		if (bshared >= 0) {
-			/* shared xino */
-			*p = fpair[bshared];
-			get_file(p->new);
-		}
-
-		if (!p->new) {
-			/* new xino */
-			p->old = br->br_xino.xi_file;
-			p->new = au_xino_create2(base, br->br_xino.xi_file);
-			err = PTR_ERR(p->new);
-			if (IS_ERR(p->new)) {
-				p->new = NULL;
-				goto out_pair;
-			}
-		}
-
-		err = au_xino_do_write(writef, p->new,
-				       au_h_iptr(inode, bindex)->i_ino, ino);
-		if (unlikely(err))
-			goto out_pair;
-	}
-
-	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
-		br = au_sbr(sb, bindex);
-		if (br->br_xino.xi_file)
-			fput(br->br_xino.xi_file);
-		get_file(p->new);
-		br->br_xino.xi_file = p->new;
-	}
-
-out_pair:
-	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++)
-		if (p->new)
-			fput(p->new);
-		else
-			break;
-	kfree(fpair);
-out:
-	return err;
-}
-
-void au_xino_clr(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	au_xigen_clr(sb);
-	xino_clear_xib(sb);
-	xino_clear_br(sb);
-	sbinfo = au_sbi(sb);
-	/* lvalue, do not call au_mntflags() */
-	au_opt_clr(sbinfo->si_mntflags, XINO);
-}
-
-int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount)
-{
-	int err, skip;
-	struct dentry *parent, *cur_parent;
-	struct qstr *dname, *cur_name;
-	struct file *cur_xino;
-	struct inode *dir;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	parent = dget_parent(xino->file->f_path.dentry);
-	if (remount) {
-		skip = 0;
-		dname = &xino->file->f_path.dentry->d_name;
-		cur_xino = sbinfo->si_xib;
-		if (cur_xino) {
-			cur_parent = dget_parent(cur_xino->f_path.dentry);
-			cur_name = &cur_xino->f_path.dentry->d_name;
-			skip = (cur_parent == parent
-				&& au_qstreq(dname, cur_name));
-			dput(cur_parent);
-		}
-		if (skip)
-			goto out;
-	}
-
-	au_opt_set(sbinfo->si_mntflags, XINO);
-	dir = d_inode(parent);
-	mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT);
-	/* mnt_want_write() is unnecessary here */
-	err = au_xino_set_xib(sb, xino->file);
-	if (!err)
-		err = au_xigen_set(sb, xino->file);
-	if (!err)
-		err = au_xino_set_br(sb, xino->file);
-	mutex_unlock(&dir->i_mutex);
-	if (!err)
-		goto out; /* success */
-
-	/* reset all */
-	AuIOErr("failed creating xino(%d).\n", err);
-	au_xigen_clr(sb);
-	xino_clear_xib(sb);
-
-out:
-	dput(parent);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create a xinofile at the default place/path.
- */
-struct file *au_xino_def(struct super_block *sb)
-{
-	struct file *file;
-	char *page, *p;
-	struct au_branch *br;
-	struct super_block *h_sb;
-	struct path path;
-	aufs_bindex_t bend, bindex, bwr;
-
-	br = NULL;
-	bend = au_sbend(sb);
-	bwr = -1;
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (au_br_writable(br->br_perm)
-		    && !au_test_fs_bad_xino(au_br_sb(br))) {
-			bwr = bindex;
-			break;
-		}
-	}
-
-	if (bwr >= 0) {
-		file = ERR_PTR(-ENOMEM);
-		page = (void *)__get_free_page(GFP_NOFS);
-		if (unlikely(!page))
-			goto out;
-		path.mnt = au_br_mnt(br);
-		path.dentry = au_h_dptr(sb->s_root, bwr);
-		p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME));
-		file = (void *)p;
-		if (!IS_ERR(p)) {
-			strcat(p, "/" AUFS_XINO_FNAME);
-			AuDbg("%s\n", p);
-			file = au_xino_create(sb, p, /*silent*/0);
-			if (!IS_ERR(file))
-				au_xino_brid_set(sb, br->br_id);
-		}
-		free_page((unsigned long)page);
-	} else {
-		file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0);
-		if (IS_ERR(file))
-			goto out;
-		h_sb = file->f_path.dentry->d_sb;
-		if (unlikely(au_test_fs_bad_xino(h_sb))) {
-			pr_err("xino doesn't support %s(%s)\n",
-			       AUFS_XINO_DEFPATH, au_sbtype(h_sb));
-			fput(file);
-			file = ERR_PTR(-EINVAL);
-		}
-		if (!IS_ERR(file))
-			au_xino_brid_set(sb, -1);
-	}
-
-out:
-	return file;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_xino_path(struct seq_file *seq, struct file *file)
-{
-	int err;
-
-	err = au_seq_path(seq, &file->f_path);
-	if (unlikely(err < 0))
-		goto out;
-
-	err = 0;
-#define Deleted "\\040(deleted)"
-	seq->count -= sizeof(Deleted) - 1;
-	AuDebugOn(memcmp(seq->buf + seq->count, Deleted,
-			 sizeof(Deleted) - 1));
-#undef Deleted
-
-out:
-	return err;
-}
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5b700ef1e..c37149b92 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
 	return d_inode(sbi->sb->s_root)->i_ino;
 }
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index de58cc7b8..da0c33481 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,14 +12,13 @@
 
 #include "autofs_i.h"
 
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	if (ino && !autofs4_oz_mode(sbi))
 		ino->last_used = jiffies;
-	nd_set_link(nd, d_inode(dentry)->i_private);
-	return NULL;
+	return d_inode(dentry)->i_private;
 }
 
 const struct inode_operations autofs4_symlink_inode_operations = {
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 1fead8d56..35d19e873 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -112,7 +112,7 @@ BEFS_SB(const struct super_block *super)
 static inline struct befs_inode_info *
 BEFS_I(const struct inode *inode)
 {
-	return list_entry(inode, struct befs_inode_info, vfs_inode);
+	return container_of(inode, struct befs_inode_info, vfs_inode);
 }
 
 static inline befs_blocknr_t
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 0826e91da..22c166280 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,8 +137,8 @@ static int
 befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		   befs_btree_super * sup)
 {
-	struct buffer_head *bh = NULL;
-	befs_disk_btree_super *od_sup = NULL;
+	struct buffer_head *bh;
+	befs_disk_btree_super *od_sup;
 
 	befs_debug(sb, "---> %s", __func__);
 
@@ -250,7 +250,7 @@ int
 befs_btree_find(struct super_block *sb, befs_data_stream * ds,
 		const char *key, befs_off_t * value)
 {
-	struct befs_btree_node *this_node = NULL;
+	struct befs_btree_node *this_node;
 	befs_btree_super bt_super;
 	befs_off_t node_off;
 	int res;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 7943533c3..46aedacfa 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,8 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
 static void befs_destroy_inode(struct inode *inode);
 static void befs_destroy_inodecache(void);
-static void *befs_follow_link(struct dentry *, struct nameidata *);
-static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
+static const char *befs_follow_link(struct dentry *, void **);
 static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -80,11 +79,6 @@ static const struct address_space_operations befs_aops = {
 	.bmap		= befs_bmap,
 };
 
-static const struct inode_operations befs_fast_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= befs_fast_follow_link,
-};
-
 static const struct inode_operations befs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= befs_follow_link,
@@ -403,10 +397,12 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &befs_dir_inode_operations;
 		inode->i_fop = &befs_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (befs_ino->i_flags & BEFS_LONG_SYMLINK)
+		if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
 			inode->i_op = &befs_symlink_inode_operations;
-		else
-			inode->i_op = &befs_fast_symlink_inode_operations;
+		} else {
+			inode->i_link = befs_ino->i_data.symlink;
+			inode->i_op = &simple_symlink_inode_operations;
+		}
 	} else {
 		befs_error(sb, "Inode %lu is not a regular file, "
 			   "directory or symlink. THAT IS WRONG! BeFS has no "
@@ -467,8 +463,8 @@ befs_destroy_inodecache(void)
  * The data stream become link name. Unless the LONG_SYMLINK
  * flag is set.
  */
-static void *
-befs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *
+befs_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
@@ -478,33 +474,20 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 	if (len == 0) {
 		befs_error(sb, "Long symlink with illegal length");
-		link = ERR_PTR(-EIO);
-	} else {
-		befs_debug(sb, "Follow long symlink");
-
-		link = kmalloc(len, GFP_NOFS);
-		if (!link) {
-			link = ERR_PTR(-ENOMEM);
-		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-			kfree(link);
-			befs_error(sb, "Failed to read entire long symlink");
-			link = ERR_PTR(-EIO);
-		} else {
-			link[len - 1] = '\0';
-		}
+		return ERR_PTR(-EIO);
 	}
-	nd_set_link(nd, link);
-	return NULL;
-}
-
-
-static void *
-befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+	befs_debug(sb, "Follow long symlink");
 
-	nd_set_link(nd, befs_ino->i_data.symlink);
-	return NULL;
+	link = kmalloc(len, GFP_NOFS);
+	if (!link)
+		return ERR_PTR(-ENOMEM);
+	if (befs_read_lsymlink(sb, data, link, len) != len) {
+		kfree(link);
+		befs_error(sb, "Failed to read entire long symlink");
+		return ERR_PTR(-EIO);
+	}
+	link[len - 1] = '\0';
+	return *cookie = link;
 }
 
 /*
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index cd46e4158..6b6599678 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note)
 		file = vma->vm_file;
 		if (!file)
 			continue;
-		filename = d_path(&file->f_path, name_curpos, remaining);
+		filename = file_path(file, name_curpos, remaining);
 		if (IS_ERR(filename)) {
 			if (PTR_ERR(filename) == -ENAMETOOLONG) {
 				vfree(data);
@@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note)
 			continue;
 		}
 
-		/* d_path() fills at the end, move name down */
+		/* file_path() fills at the end, move name down */
 		/* n = strlen(filename) + 1: */
 		n = (name_curpos + remaining) - filename;
 		remaining = filename - name_curpos;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7e4163ed..198243717 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -14,6 +14,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
@@ -42,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode)
 	return container_of(inode, struct bdev_inode, vfs_inode);
 }
 
-inline struct block_device *I_BDEV(struct inode *inode)
+struct block_device *I_BDEV(struct inode *inode)
 {
 	return &BDEV_I(inode)->bdev;
 }
@@ -151,6 +152,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
+	if (IS_DAX(inode))
+		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+				NULL, DIO_SKIP_DIO_COUNT);
 	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
 				    blkdev_get_block, NULL, NULL,
 				    DIO_SKIP_DIO_COUNT);
@@ -376,7 +380,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 			struct page *page)
 {
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	if (!ops->rw_page)
+	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
 	return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
 }
@@ -407,7 +411,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 	int result;
 	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	if (!ops->rw_page)
+	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
 	set_page_writeback(page);
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
@@ -442,6 +446,12 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
 	long avail;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
 
+	/*
+	 * The device driver is allowed to sleep, in order to make the
+	 * memory directly accessible.
+	 */
+	might_sleep();
+
 	if (size < 0)
 		return size;
 	if (!ops->direct_access)
@@ -546,7 +556,8 @@ static struct file_system_type bd_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
 
 void __init bdev_cache_init(void)
 {
@@ -687,11 +698,6 @@ static struct block_device *bd_acquire(struct inode *inode)
 	return bdev;
 }
 
-int sb_is_blkdev_sb(struct super_block *sb)
-{
-	return sb == blockdev_superblock;
-}
-
 /* Call when you free inode */
 
 void bd_forget(struct inode *inode)
@@ -1173,6 +1179,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
+		bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
 		if (!partno) {
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index df9932b00..1ce06c849 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper);
 BTRFS_WORK_HELPER(scrub_helper);
 BTRFS_WORK_HELPER(scrubwrc_helper);
 BTRFS_WORK_HELPER(scrubnc_helper);
+BTRFS_WORK_HELPER(scrubparity_helper);
 
 static struct __btrfs_workqueue *
 __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ec2ee477f..b0b093b6a 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
 BTRFS_WORK_HELPER_PROTO(scrub_helper);
 BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 					      unsigned int flags,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 614aaa196..802fabb30 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	 * the first item to check. But sometimes, we may enter it with
 	 * slot==nritems. In that case, go to the next leaf before we continue.
 	 */
-	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
-		ret = btrfs_next_old_leaf(root, path, time_seq);
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+		if (time_seq == (u64)-1)
+			ret = btrfs_next_leaf(root, path);
+		else
+			ret = btrfs_next_old_leaf(root, path, time_seq);
+	}
 
 	while (!ret && count < total_refs) {
 		eb = path->nodes[0];
@@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			eie = NULL;
 		}
 next:
-		ret = btrfs_next_old_item(root, path, time_seq);
+		if (time_seq == (u64)-1)
+			ret = btrfs_next_item(root, path);
+		else
+			ret = btrfs_next_old_item(root, path, time_seq);
 	}
 
 	if (ret > 0)
@@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
 	if (path->search_commit_root)
 		root_level = btrfs_header_level(root->commit_root);
+	else if (time_seq == (u64)-1)
+		root_level = btrfs_header_level(root->node);
 	else
 		root_level = btrfs_old_root_level(root, time_seq);
 
@@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	}
 
 	path->lowest_level = level;
-	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+	if (time_seq == (u64)-1)
+		ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
+					0, 0);
+	else
+		ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
+					    time_seq);
 
 	/* root node has been locked, we can release @subvol_srcu safely here */
 	srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
 				     0);
-		if (!eb || !extent_buffer_uptodate(eb)) {
+		if (IS_ERR(eb)) {
+			return PTR_ERR(eb);
+		} else if (!extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
 		}
@@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * merge two lists of backrefs and adjust counts accordingly
+ * merge backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
  *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
@@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode)
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
+			if (!ref_for_same_block(ref1, ref2))
+				continue;
 			if (mode == 1) {
-				if (!ref_for_same_block(ref1, ref2))
-					continue;
 				if (!ref1->parent && ref2->parent) {
 					xchg = ref1;
 					ref1 = ref2;
@@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			      struct list_head *prefs, u64 *total_refs,
 			      u64 inum)
 {
+	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
-	struct rb_node *n = &head->node.rb_node;
 	struct btrfs_key key;
 	struct btrfs_key op_key = {0};
 	int sgn;
@@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
 	spin_lock(&head->lock);
-	n = rb_first(&head->ref_root);
-	while (n) {
-		struct btrfs_delayed_ref_node *node;
-		node = rb_entry(n, struct btrfs_delayed_ref_node,
-				rb_node);
-		n = rb_next(n);
+	list_for_each_entry(node, &head->ref_list, list) {
 		if (node->seq > seq)
 			continue;
 
@@ -882,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  *
  * NOTE: This can return values > 0
  *
+ * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * much like trans == NULL case, the difference only lies in it will not
+ * commit root.
+ * The special case is for qgroup to search roots in commit_transaction().
+ *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -920,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 		path->skip_locking = 1;
 	}
 
+	if (time_seq == (u64)-1)
+		path->skip_locking = 1;
+
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
 	 * We need both to get a consistent picture of how the refs look
@@ -934,9 +953,10 @@ again:
 	BUG_ON(ret == 0);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (trans && likely(trans->type != __TRANS_DUMMY)) {
+	if (trans && likely(trans->type != __TRANS_DUMMY) &&
+	    time_seq != (u64)-1) {
 #else
-	if (trans) {
+	if (trans && time_seq != (u64)-1) {
 #endif
 		/*
 		 * look if there are updates for this ref queued and lock the
@@ -1034,7 +1054,10 @@ again:
 
 				eb = read_tree_block(fs_info->extent_root,
 							   ref->parent, 0);
-				if (!eb || !extent_buffer_uptodate(eb)) {
+				if (IS_ERR(eb)) {
+					ret = PTR_ERR(eb);
+					goto out;
+				} else if (!extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
 					goto out;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13f..81220b220 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS		        11
+/* DIO is ready to submit */
+#define BTRFS_INODE_DIO_READY		        12
 /*
  * The following 3 bits are meant only for the btree inode.
  * When any of them is set, it means an error happened while writing an
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0f11ebc92..54114b488 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
 		old = read_tree_block(root, logical, 0);
-		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
-			free_extent_buffer(old);
+		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+			if (!IS_ERR(old))
+				free_extent_buffer(old);
 			btrfs_warn(root->fs_info,
 				"failed to read tree block %llu from get_old_root", logical);
 		} else {
@@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		if (!cur || !uptodate) {
 			if (!cur) {
 				cur = read_tree_block(root, blocknr, gen);
-				if (!cur || !extent_buffer_uptodate(cur)) {
+				if (IS_ERR(cur)) {
+					return PTR_ERR(cur);
+				} else if (!extent_buffer_uptodate(cur)) {
 					free_extent_buffer(cur);
 					return -EIO;
 				}
@@ -1864,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 
 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
 			     btrfs_node_ptr_generation(parent, slot));
-	if (eb && !extent_buffer_uptodate(eb)) {
-		free_extent_buffer(eb);
+	if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+		if (!IS_ERR(eb))
+			free_extent_buffer(eb);
 		eb = NULL;
 	}
 
@@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 
 	ret = -EAGAIN;
 	tmp = read_tree_block(root, blocknr, 0);
-	if (tmp) {
+	if (!IS_ERR(tmp)) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
 		 * it will never end up being up to date.  Set ret to EIO now
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8..aac314e14 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,7 +174,7 @@ struct btrfs_ordered_sum;
 /* csum types */
 #define BTRFS_CSUM_TYPE_CRC32	0
 
-static int btrfs_csum_sizes[] = { 4, 0 };
+static int btrfs_csum_sizes[] = { 4 };
 
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
@@ -1619,10 +1619,7 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
-	struct kobject super_kobj;
 	struct kobject *space_info_kobj;
-	struct kobject *device_dir_kobj;
-	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
@@ -1698,6 +1695,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *scrub_workers;
 	struct btrfs_workqueue *scrub_wr_completion_workers;
 	struct btrfs_workqueue *scrub_nocow_workers;
+	struct btrfs_workqueue *scrub_parity_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1735,7 +1733,7 @@ struct btrfs_fs_info {
 	/* list of dirty qgroups to be written at next commit */
 	struct list_head dirty_qgroups;
 
-	/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+	/* used by qgroup for an efficient tree traversal */
 	u64 qgroup_seq;
 
 	/* qgroup rescan items */
@@ -1780,6 +1778,7 @@ struct btrfs_fs_info {
 	spinlock_t unused_bgs_lock;
 	struct list_head unused_bgs;
 	struct mutex unused_bg_unpin_mutex;
+	struct mutex delete_unused_bgs_mutex;
 
 	/* For btrfs to record security options */
 	struct security_mnt_opts security_opts;
@@ -3458,6 +3457,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
@@ -3515,6 +3515,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 int __get_raid_index(u64 flags);
 int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			const u64 type);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -4050,6 +4053,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 
 #ifdef CONFIG_BTRFS_ASSERT
 
+__cold
 static inline void assfail(char *expr, char *file, int line)
 {
 	pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
@@ -4065,10 +4069,12 @@ static inline void assfail(char *expr, char *file, int line)
 
 #define btrfs_assert()
 __printf(5, 6)
+__cold
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		     unsigned int line, int errno, const char *fmt, ...);
 
 
+__cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno);
@@ -4111,11 +4117,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
  * Call btrfs_abort_transaction as early as possible when an error condition is
  * detected, that way the exact line number is reported.
  */
-
 #define btrfs_abort_transaction(trans, root, errno)		\
 do {								\
-	__btrfs_abort_transaction(trans, root, __func__,	\
-				  __LINE__, errno);		\
+	/* Report first abort since mount */			\
+	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,	\
+			&((root)->fs_info->fs_state))) {	\
+		WARN(1, KERN_DEBUG				\
+		"BTRFS: Transaction aborted (error %d)\n",	\
+		(errno));					\
+	}							\
+	__btrfs_abort_transaction((trans), (root), __func__,	\
+				  __LINE__, (errno));		\
 } while (0)
 
 #define btrfs_std_error(fs_info, errno)				\
@@ -4132,6 +4144,7 @@ do {								\
 } while (0)
 
 __printf(5, 6)
+__cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8f8ed7d20..ac3e81da6 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -22,6 +22,7 @@
 #include "ctree.h"
 #include "delayed-ref.h"
 #include "transaction.h"
+#include "qgroup.h"
 
 struct kmem_cache *btrfs_delayed_ref_head_cachep;
 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
 	return 0;
 }
 
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1,
-		      bool compare_seq)
-{
-	if (ref1->bytenr < ref2->bytenr)
-		return -1;
-	if (ref1->bytenr > ref2->bytenr)
-		return 1;
-	if (ref1->is_head && ref2->is_head)
-		return 0;
-	if (ref2->is_head)
-		return -1;
-	if (ref1->is_head)
-		return 1;
-	if (ref1->type < ref2->type)
-		return -1;
-	if (ref1->type > ref2->type)
-		return 1;
-	if (ref1->no_quota > ref2->no_quota)
-		return 1;
-	if (ref1->no_quota < ref2->no_quota)
-		return -1;
-	/* merging of sequenced refs is not allowed */
-	if (compare_seq) {
-		if (ref1->seq < ref2->seq)
-			return -1;
-		if (ref1->seq > ref2->seq)
-			return 1;
-	}
-	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
-		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
-				      btrfs_delayed_node_to_tree_ref(ref1),
-				      ref1->type);
-	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
-		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
-		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
-				      btrfs_delayed_node_to_data_ref(ref1));
-	}
-	BUG();
-	return 0;
-}
-
-/*
- * insert a new ref into the rbtree.  This returns any existing refs
- * for the same (bytenr,parent) tuple, or NULL if the new node was properly
- * inserted.
- */
-static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
-						  struct rb_node *node)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent_node = NULL;
-	struct btrfs_delayed_ref_node *entry;
-	struct btrfs_delayed_ref_node *ins;
-	int cmp;
-
-	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-	while (*p) {
-		parent_node = *p;
-		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
-				 rb_node);
-
-		cmp = comp_entry(entry, ins, 1);
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else if (cmp > 0)
-			p = &(*p)->rb_right;
-		else
-			return entry;
-	}
-
-	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
-	return NULL;
-}
-
 /* insert a new ref to head ref rbtree */
 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
 						   struct rb_node *node)
@@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 		rb_erase(&head->href_node, &delayed_refs->href_root);
 	} else {
 		assert_spin_locked(&head->lock);
-		rb_erase(&ref->rb_node, &head->ref_root);
+		list_del(&ref->list);
 	}
 	ref->in_tree = 0;
 	btrfs_put_delayed_ref(ref);
@@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 		trans->delayed_ref_updates--;
 }
 
-static int merge_ref(struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_root *delayed_refs,
-		     struct btrfs_delayed_ref_head *head,
-		     struct btrfs_delayed_ref_node *ref, u64 seq)
-{
-	struct rb_node *node;
-	int mod = 0;
-	int done = 0;
-
-	node = rb_next(&ref->rb_node);
-	while (!done && node) {
-		struct btrfs_delayed_ref_node *next;
-
-		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		node = rb_next(node);
-		if (seq && next->seq >= seq)
-			break;
-		if (comp_entry(ref, next, 0))
-			continue;
-
-		if (ref->action == next->action) {
-			mod = next->ref_mod;
-		} else {
-			if (ref->ref_mod < next->ref_mod) {
-				struct btrfs_delayed_ref_node *tmp;
-
-				tmp = ref;
-				ref = next;
-				next = tmp;
-				done = 1;
-			}
-			mod = -next->ref_mod;
-		}
-
-		drop_delayed_ref(trans, delayed_refs, head, next);
-		ref->ref_mod += mod;
-		if (ref->ref_mod == 0) {
-			drop_delayed_ref(trans, delayed_refs, head, ref);
-			done = 1;
-		} else {
-			/*
-			 * You can't have multiples of the same ref on a tree
-			 * block.
-			 */
-			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
-				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		}
-	}
-	return done;
-}
-
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
-			      struct btrfs_delayed_ref_root *delayed_refs,
-			      struct btrfs_delayed_ref_head *head)
-{
-	struct rb_node *node;
-	u64 seq = 0;
-
-	assert_spin_locked(&head->lock);
-	/*
-	 * We don't have too much refs to merge in the case of delayed data
-	 * refs.
-	 */
-	if (head->is_data)
-		return;
-
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	if (!list_empty(&fs_info->tree_mod_seq_list)) {
-		struct seq_list *elem;
-
-		elem = list_first_entry(&fs_info->tree_mod_seq_list,
-					struct seq_list, list);
-		seq = elem->seq;
-	}
-	spin_unlock(&fs_info->tree_mod_seq_lock);
-
-	node = rb_first(&head->ref_root);
-	while (node) {
-		struct btrfs_delayed_ref_node *ref;
-
-		ref = rb_entry(node, struct btrfs_delayed_ref_node,
-			       rb_node);
-		/* We can't merge refs that are outside of our seq count */
-		if (seq && ref->seq >= seq)
-			break;
-		if (merge_ref(trans, delayed_refs, head, ref, seq))
-			node = rb_first(&head->ref_root);
-		else
-			node = rb_next(&ref->rb_node);
-	}
-}
-
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
@@ -443,45 +270,71 @@ again:
 }
 
 /*
- * helper function to update an extent delayed ref in the
- * rbtree.  existing and update must both have the same
- * bytenr and parent
+ * Helper to insert the ref_node to the tail or merge with tail.
  *
- * This may free existing if the update cancels out whatever
- * operation it was doing.
+ * Return 0 for insert.
+ * Return >0 for merge.
  */
-static noinline void
-update_existing_ref(struct btrfs_trans_handle *trans,
-		    struct btrfs_delayed_ref_root *delayed_refs,
-		    struct btrfs_delayed_ref_head *head,
-		    struct btrfs_delayed_ref_node *existing,
-		    struct btrfs_delayed_ref_node *update)
+static int
+add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_root *root,
+			   struct btrfs_delayed_ref_head *href,
+			   struct btrfs_delayed_ref_node *ref)
 {
-	if (update->action != existing->action) {
-		/*
-		 * this is effectively undoing either an add or a
-		 * drop.  We decrement the ref_mod, and if it goes
-		 * down to zero we just delete the entry without
-		 * every changing the extent allocation tree.
-		 */
-		existing->ref_mod--;
-		if (existing->ref_mod == 0)
-			drop_delayed_ref(trans, delayed_refs, head, existing);
-		else
-			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+	struct btrfs_delayed_ref_node *exist;
+	int mod;
+	int ret = 0;
+
+	spin_lock(&href->lock);
+	/* Check whether we can merge the tail node with ref */
+	if (list_empty(&href->ref_list))
+		goto add_tail;
+	exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
+			   list);
+	/* No need to compare bytenr nor is_head */
+	if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
+	    exist->seq != ref->seq)
+		goto add_tail;
+
+	if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	     exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+	    comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
+			   btrfs_delayed_node_to_tree_ref(ref),
+			   ref->type))
+		goto add_tail;
+	if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
+	     exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
+	    comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
+			   btrfs_delayed_node_to_data_ref(ref)))
+		goto add_tail;
+
+	/* Now we are sure we can merge */
+	ret = 1;
+	if (exist->action == ref->action) {
+		mod = ref->ref_mod;
 	} else {
-		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		/*
-		 * the action on the existing ref matches
-		 * the action on the ref we're trying to add.
-		 * Bump the ref_mod by one so the backref that
-		 * is eventually added/removed has the correct
-		 * reference count
-		 */
-		existing->ref_mod += update->ref_mod;
+		/* Need to change action */
+		if (exist->ref_mod < ref->ref_mod) {
+			exist->action = ref->action;
+			mod = -exist->ref_mod;
+			exist->ref_mod = ref->ref_mod;
+		} else
+			mod = -ref->ref_mod;
 	}
+	exist->ref_mod += mod;
+
+	/* remove existing tail if its ref_mod is zero */
+	if (exist->ref_mod == 0)
+		drop_delayed_ref(trans, root, href, exist);
+	spin_unlock(&href->lock);
+	return ret;
+
+add_tail:
+	list_add_tail(&ref->list, &href->ref_list);
+	atomic_inc(&root->num_entries);
+	trans->delayed_ref_updates++;
+	spin_unlock(&href->lock);
+	return ret;
 }
 
 /*
@@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 static noinline struct btrfs_delayed_ref_head *
 add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		     struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
-		     u64 num_bytes, int action, int is_data)
+		     struct btrfs_delayed_ref_node *ref,
+		     struct btrfs_qgroup_extent_record *qrecord,
+		     u64 bytenr, u64 num_bytes, int action, int is_data)
 {
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_qgroup_extent_record *qexisting;
 	int count_mod = 1;
 	int must_insert_reserved = 0;
 
@@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref = btrfs_delayed_node_to_head(ref);
 	head_ref->must_insert_reserved = must_insert_reserved;
 	head_ref->is_data = is_data;
-	head_ref->ref_root = RB_ROOT;
+	INIT_LIST_HEAD(&head_ref->ref_list);
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
 
+	/* Record qgroup extent info if provided */
+	if (qrecord) {
+		qrecord->bytenr = bytenr;
+		qrecord->num_bytes = num_bytes;
+		qrecord->old_roots = NULL;
+
+		qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+							     qrecord);
+		if (qexisting)
+			kfree(qrecord);
+	}
+
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
 
@@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		     u64 num_bytes, u64 parent, u64 ref_root, int level,
 		     int action, int no_quota)
 {
-	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	u64 seq = 0;
+	int ret;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 
 	trace_add_delayed_tree_ref(ref, full_ref, action);
 
-	spin_lock(&head_ref->lock);
-	existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
-	if (existing) {
-		update_existing_ref(trans, delayed_refs, head_ref, existing,
-				    ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
+	ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+	/*
+	 * XXX: memory should be freed at the same level allocated.
+	 * But bad practice is anywhere... Follow it now. Need cleanup.
+	 */
+	if (ret > 0)
 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
-	} else {
-		atomic_inc(&delayed_refs->num_entries);
-		trans->delayed_ref_updates++;
-	}
-	spin_unlock(&head_ref->lock);
 }
 
 /*
@@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
 		     u64 offset, int action, int no_quota)
 {
-	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	u64 seq = 0;
+	int ret;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	trace_add_delayed_data_ref(ref, full_ref, action);
 
-	spin_lock(&head_ref->lock);
-	existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
-	if (existing) {
-		update_existing_ref(trans, delayed_refs, head_ref, existing,
-				    ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
+	ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+	if (ret > 0)
 		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
-	} else {
-		atomic_inc(&delayed_refs->num_entries);
-		trans->delayed_ref_updates++;
-	}
-	spin_unlock(&head_ref->lock);
 }
 
 /*
@@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_qgroup_extent_record *record = NULL;
 
 	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
 		no_quota = 0;
@@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		return -ENOMEM;
 
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
-	if (!head_ref) {
-		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
-		return -ENOMEM;
+	if (!head_ref)
+		goto free_ref;
+
+	if (fs_info->quota_enabled && is_fstree(ref_root)) {
+		record = kmalloc(sizeof(*record), GFP_NOFS);
+		if (!record)
+			goto free_head_ref;
 	}
 
 	head_ref->extent_op = extent_op;
@@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
 					bytenr, num_bytes, action, 0);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
+
+free_head_ref:
+	kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_ref:
+	kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+	return -ENOMEM;
 }
 
 /*
@@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_qgroup_extent_record *record = NULL;
 
 	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
 		no_quota = 0;
@@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		return -ENOMEM;
 	}
 
+	if (fs_info->quota_enabled && is_fstree(ref_root)) {
+		record = kmalloc(sizeof(*record), GFP_NOFS);
+		if (!record) {
+			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+			kmem_cache_free(btrfs_delayed_ref_head_cachep,
+					head_ref);
+			return -ENOMEM;
+		}
+	}
+
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
 					bytenr, num_bytes, action, 1);
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
-				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
-				   extent_op->is_data);
+	add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+			     num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+			     extent_op->is_data);
 
 	spin_unlock(&delayed_refs->lock);
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5eb089239..13fb5e609 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -24,9 +24,25 @@
 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
 #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
 
+/*
+ * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the
+ * same ref_node structure.
+ * Ref_head is in a higher logic level than tree/data ref, and duplicated
+ * bytenr/num_bytes in ref_node is really a waste or memory, they should be
+ * referred from ref_head.
+ * This gets more disgusting after we use list to store tree/data ref in
+ * ref_head. Must clean this mess up later.
+ */
 struct btrfs_delayed_ref_node {
+	/*
+	 * ref_head use rb tree, stored in ref_root->href.
+	 * indexed by bytenr
+	 */
 	struct rb_node rb_node;
 
+	/*data/tree ref use list, stored in ref_head->ref_list. */
+	struct list_head list;
+
 	/* the starting bytenr of the extent */
 	u64 bytenr;
 
@@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head {
 	struct mutex mutex;
 
 	spinlock_t lock;
-	struct rb_root ref_root;
+	struct list_head ref_list;
 
 	struct rb_node href_node;
 
@@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root {
 	/* head ref rbtree */
 	struct rb_root href_root;
 
+	/* dirty extent records */
+	struct rb_root dirty_extent_root;
+
 	/* this spin lock protects the rbtree and the entries inside */
 	spinlock_t lock;
 
@@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root {
 	int flushing;
 
 	u64 run_delayed_start;
+
+	/*
+	 * To make qgroup to skip given root.
+	 * This is for snapshot, as btrfs_qgroup_inherit() will manully
+	 * modify counters for snapshot and its source, so we should skip
+	 * the snapshot in new_root/old_roots or it will get calculated twice
+	 */
+	u64 qgroup_to_skip;
 };
 
 extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0573848c7..564a7de17 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	WARN_ON(!tgt_device);
 	dev_replace->tgtdev = tgt_device;
 
+	ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+	if (ret)
+		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
 	printk_in_rcu(KERN_INFO
 		      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
 		      src_device->missing ? "<missing disk>" :
@@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&uuid_mutex);
 
 	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info, src_device);
-	btrfs_kobj_add_device(fs_info, tgt_device);
+	btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
 	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
 	/* write back the superblocks */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b72..f556c3732 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	buf = btrfs_find_create_tree_block(root, bytenr);
 	if (!buf)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret) {
 		free_extent_buffer(buf);
-		return NULL;
+		return ERR_PTR(ret);
 	}
 	return buf;
 
@@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	generation = btrfs_root_generation(&root->root_item);
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     generation);
-	if (!root->node) {
-		ret = -ENOMEM;
+	if (IS_ERR(root->node)) {
+		ret = PTR_ERR(root->node);
 		goto find_fail;
 	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
 		ret = -EIO;
-		goto read_fail;
+		free_extent_buffer(root->node);
+		goto find_fail;
 	}
 	root->commit_root = btrfs_root_node(root);
 out:
 	btrfs_free_path(path);
 	return root;
 
-read_fail:
-	free_extent_buffer(root->node);
 find_fail:
 	kfree(root);
 alloc_fail:
@@ -1745,13 +1744,14 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static int cleaner_kthread(void *arg)
 {
 	struct btrfs_root *root = arg;
 	int again;
+	struct btrfs_trans_handle *trans;
 
 	do {
 		again = 0;
@@ -1773,7 +1773,6 @@ static int cleaner_kthread(void *arg)
 		}
 
 		btrfs_run_delayed_iputs(root);
-		btrfs_delete_unused_bgs(root->fs_info);
 		again = btrfs_clean_one_deleted_snapshot(root);
 		mutex_unlock(&root->fs_info->cleaner_mutex);
 
@@ -1782,6 +1781,16 @@ static int cleaner_kthread(void *arg)
 		 * needn't do anything special here.
 		 */
 		btrfs_run_defrag_inodes(root->fs_info);
+
+		/*
+		 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+		 * with relocation (btrfs_relocate_chunk) and relocation
+		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
+		 * after acquiring fs_info->delete_unused_bgs_mutex. So we
+		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
+		 * unused block groups.
+		 */
+		btrfs_delete_unused_bgs(root->fs_info);
 sleep:
 		if (!try_to_freeze() && !again) {
 			set_current_state(TASK_INTERRUPTIBLE);
@@ -1790,6 +1799,34 @@ sleep:
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
+
+	/*
+	 * Transaction kthread is stopped before us and wakes us up.
+	 * However we might have started a new transaction and COWed some
+	 * tree blocks when deleting unused block groups for example. So
+	 * make sure we commit the transaction we started to have a clean
+	 * shutdown when evicting the btree inode - if it has dirty pages
+	 * when we do the final iput() on it, eviction will trigger a
+	 * writeback for it which will fail with null pointer dereferences
+	 * since work queues and other resources were already released and
+	 * destroyed by the time the iput/eviction/writeback is made.
+	 */
+	trans = btrfs_attach_transaction(root);
+	if (IS_ERR(trans)) {
+		if (PTR_ERR(trans) != -ENOENT)
+			btrfs_err(root->fs_info,
+				  "cleaner transaction attach returned %ld",
+				  PTR_ERR(trans));
+	} else {
+		int ret;
+
+		ret = btrfs_commit_transaction(trans, root);
+		if (ret)
+			btrfs_err(root->fs_info,
+				  "cleaner open transaction commit returned %d",
+				  ret);
+	}
+
 	return 0;
 }
 
@@ -2320,8 +2357,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 
 	log_tree_root->node = read_tree_block(tree_root, bytenr,
 			fs_info->generation + 1);
-	if (!log_tree_root->node ||
-	    !extent_buffer_uptodate(log_tree_root->node)) {
+	if (IS_ERR(log_tree_root->node)) {
+		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		ret = PTR_ERR(log_tree_root->node);
+		kfree(log_tree_root);
+		return ret;
+	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read log tree\n");
 		free_extent_buffer(log_tree_root->node);
 		kfree(log_tree_root);
@@ -2489,12 +2530,12 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
+	mutex_init(&fs_info->delete_unused_bgs_mutex);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
 	seqlock_init(&fs_info->profiles_lock);
 	init_rwsem(&fs_info->delayed_iput_sem);
 
-	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -2797,10 +2838,11 @@ int open_ctree(struct super_block *sb,
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
 					   generation);
-	if (!chunk_root->node ||
-	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+	if (IS_ERR(chunk_root->node) ||
+	    !extent_buffer_uptodate(chunk_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
 		       sb->s_id);
+		chunk_root->node = NULL;
 		goto fail_tree_roots;
 	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
@@ -2834,11 +2876,11 @@ retry_root_backup:
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  generation);
-	if (!tree_root->node ||
-	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+	if (IS_ERR(tree_root->node) ||
+	    !extent_buffer_uptodate(tree_root->node)) {
 		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
 		       sb->s_id);
-
+		tree_root->node = NULL;
 		goto recovery_tree_root;
 	}
 
@@ -2874,10 +2916,22 @@ retry_root_backup:
 
 	btrfs_close_extra_devices(fs_devices, 1);
 
+	ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+	if (ret) {
+		pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_sysfs_add_device(fs_devices);
+	if (ret) {
+		pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+		goto fail_fsdev_sysfs;
+	}
+
 	ret = btrfs_sysfs_add_one(fs_info);
 	if (ret) {
 		pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
-		goto fail_block_groups;
+		goto fail_fsdev_sysfs;
 	}
 
 	ret = btrfs_init_space_info(fs_info);
@@ -3055,6 +3109,9 @@ fail_cleaner:
 fail_sysfs:
 	btrfs_sysfs_remove_one(fs_info);
 
+fail_fsdev_sysfs:
+	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
 fail_block_groups:
 	btrfs_put_block_group_cache(fs_info);
 	btrfs_free_block_groups(fs_info);
@@ -3269,11 +3326,8 @@ static int write_dev_supers(struct btrfs_device *device,
  */
 static void btrfs_end_empty_barrier(struct bio *bio, int err)
 {
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+	if (err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
 	if (bio->bi_private)
 		complete(bio->bi_private);
 	bio_put(bio);
@@ -3301,11 +3355,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 		wait_for_completion(&device->flush_wait);
 
-		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-			printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
-				      rcu_str_deref(device->name));
-			device->nobarriers = 1;
-		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
 			btrfs_dev_stat_inc_and_print(device,
 				BTRFS_DEV_STAT_FLUSH_ERRS);
@@ -3732,6 +3782,7 @@ void close_ctree(struct btrfs_root *root)
 	}
 
 	btrfs_sysfs_remove_one(fs_info);
+	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 
 	btrfs_free_fs_roots(fs_info);
 
@@ -4060,6 +4111,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 
 	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
 		struct btrfs_delayed_ref_head *head;
+		struct btrfs_delayed_ref_node *tmp;
 		bool pin_bytes = false;
 
 		head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -4075,11 +4127,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 			continue;
 		}
 		spin_lock(&head->lock);
-		while ((node = rb_first(&head->ref_root)) != NULL) {
-			ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				       rb_node);
+		list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
+						 list) {
 			ref->in_tree = 0;
-			rb_erase(&ref->rb_node, &head->ref_root);
+			list_del(&ref->list);
 			atomic_dec(&delayed_refs->num_entries);
 			btrfs_put_delayed_ref(ref);
 		}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec3acd14..07204bf60 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 parent,
+				struct btrfs_delayed_ref_node *node, u64 parent,
 				u64 root_objectid, u64 owner_objectid,
 				u64 owner_offset, int refs_to_drop,
-				struct btrfs_delayed_extent_op *extra_op,
-				int no_quota);
+				struct btrfs_delayed_extent_op *extra_op);
 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 				    struct extent_buffer *leaf,
 				    struct btrfs_extent_item *ei);
@@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
-				  u64 bytenr, u64 num_bytes,
+				  struct btrfs_delayed_ref_node *node,
 				  u64 parent, u64 root_objectid,
 				  u64 owner, u64 offset, int refs_to_add,
-				  int no_quota,
 				  struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_extent_item *item;
 	struct btrfs_key key;
+	u64 bytenr = node->bytenr;
+	u64 num_bytes = node->num_bytes;
 	u64 refs;
 	int ret;
-	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+	int no_quota = node->no_quota;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 					   bytenr, num_bytes, parent,
 					   root_objectid, owner, offset,
 					   refs_to_add, extent_op);
-	if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+	if ((ret < 0 && ret != -EAGAIN) || !ret)
 		goto out;
-	/*
-	 * Ok we were able to insert an inline extent and it appears to be a new
-	 * reference, deal with the qgroup accounting.
-	 */
-	if (!ret && !no_quota) {
-		ASSERT(root->fs_info->quota_enabled);
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		item = btrfs_item_ptr(leaf, path->slots[0],
-				      struct btrfs_extent_item);
-		if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
-			type = BTRFS_QGROUP_OPER_ADD_SHARED;
-		btrfs_release_path(path);
-
-		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-					      bytenr, num_bytes, type, 0);
-		goto out;
-	}
 
 	/*
 	 * Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, item);
-	if (refs)
-		type = BTRFS_QGROUP_OPER_ADD_SHARED;
 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
 	if (extent_op)
 		__run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
-	if (!no_quota) {
-		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-					      bytenr, num_bytes, type, 0);
-		if (ret)
-			goto out;
-	}
-
 	path->reada = 1;
 	path->leave_spinning = 1;
 	/* now insert the actual backref */
@@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 						 ref->objectid, ref->offset,
 						 &ins, node->ref_mod);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-					     node->num_bytes, parent,
+		ret = __btrfs_inc_extent_ref(trans, root, node, parent,
 					     ref_root, ref->objectid,
 					     ref->offset, node->ref_mod,
-					     node->no_quota, extent_op);
+					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, root, node->bytenr,
-					  node->num_bytes, parent,
+		ret = __btrfs_free_extent(trans, root, node, parent,
 					  ref_root, ref->objectid,
 					  ref->offset, node->ref_mod,
-					  extent_op, node->no_quota);
+					  extent_op);
 	} else {
 		BUG();
 	}
@@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 						ref->level, &ins,
 						node->no_quota);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-					     node->num_bytes, parent, ref_root,
-					     ref->level, 0, 1, node->no_quota,
+		ret = __btrfs_inc_extent_ref(trans, root, node,
+					     parent, ref_root,
+					     ref->level, 0, 1,
 					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, root, node->bytenr,
-					  node->num_bytes, parent, ref_root,
-					  ref->level, 0, 1, extent_op,
-					  node->no_quota);
+		ret = __btrfs_free_extent(trans, root, node,
+					  parent, ref_root,
+					  ref->level, 0, 1, extent_op);
 	} else {
 		BUG();
 	}
@@ -2323,28 +2293,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-	struct rb_node *node;
-	struct btrfs_delayed_ref_node *ref, *last = NULL;;
+	struct btrfs_delayed_ref_node *ref;
+
+	if (list_empty(&head->ref_list))
+		return NULL;
 
 	/*
-	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
-	 * this prevents ref count from going down to zero when
-	 * there still are pending delayed ref.
+	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+	 * This is to prevent a ref count from going down to zero, which deletes
+	 * the extent item from the extent tree, when there still are references
+	 * to add, which would fail because they would not find the extent item.
 	 */
-	node = rb_first(&head->ref_root);
-	while (node) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				rb_node);
+	list_for_each_entry(ref, &head->ref_list, list) {
 		if (ref->action == BTRFS_ADD_DELAYED_REF)
 			return ref;
-		else if (last == NULL)
-			last = ref;
-		node = rb_next(node);
 	}
-	return last;
+
+	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+			  list);
 }
 
 /*
@@ -2396,16 +2365,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			}
 		}
 
-		/*
-		 * We need to try and merge add/drops of the same ref since we
-		 * can run into issues with relocate dropping the implicit ref
-		 * and then it being added back again before the drop can
-		 * finish.  If we merged anything we need to re-loop so we can
-		 * get a good ref.
-		 */
 		spin_lock(&locked_ref->lock);
-		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
-					 locked_ref);
 
 		/*
 		 * locked_ref is the head node, so we have to go one
@@ -2482,7 +2442,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			spin_unlock(&locked_ref->lock);
 			spin_lock(&delayed_refs->lock);
 			spin_lock(&locked_ref->lock);
-			if (rb_first(&locked_ref->ref_root) ||
+			if (!list_empty(&locked_ref->ref_list) ||
 			    locked_ref->extent_op) {
 				spin_unlock(&locked_ref->lock);
 				spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2456,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		} else {
 			actual_count++;
 			ref->in_tree = 0;
-			rb_erase(&ref->rb_node, &locked_ref->ref_root);
+			list_del(&ref->list);
 		}
 		atomic_dec(&delayed_refs->num_entries);
 
@@ -2864,9 +2824,6 @@ again:
 		goto again;
 	}
 out:
-	ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
-	if (ret)
-		return ret;
 	assert_qgroups_uptodate(trans);
 	return 0;
 }
@@ -2905,7 +2862,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_data_ref *data_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct rb_node *node;
 	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -2934,11 +2890,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 
 	spin_lock(&head->lock);
-	node = rb_first(&head->ref_root);
-	while (node) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		node = rb_next(node);
-
+	list_for_each_entry(ref, &head->ref_list, list) {
 		/* If it's a shared ref we know a cross reference exists */
 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
 			ret = 1;
@@ -3693,7 +3645,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->disk_total += total_bytes * factor;
 		found->bytes_used += bytes_used;
 		found->disk_used += bytes_used * factor;
-		found->full = 0;
+		if (total_bytes > 0)
+			found->full = 0;
 		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
@@ -3721,7 +3674,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
 	found->bytes_may_use = 0;
-	found->full = 0;
+	if (total_bytes > 0)
+		found->full = 0;
+	else
+		found->full = 1;
 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	found->chunk_alloc = 0;
 	found->flush = 0;
@@ -3975,6 +3931,9 @@ commit_trans:
 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
 			need_commit--;
 
+			if (need_commit > 0)
+				btrfs_wait_ordered_roots(fs_info, -1);
+
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
@@ -4088,7 +4047,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	return 1;
 }
 
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
 {
 	u64 num_dev;
 
@@ -4102,24 +4061,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 	else
 		num_dev = 1;	/* DUP or single */
 
-	/* metadata for updaing devices and chunk tree */
-	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+	return num_dev;
 }
 
-static void check_system_chunk(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			u64 type)
 {
 	struct btrfs_space_info *info;
 	u64 left;
 	u64 thresh;
+	int ret = 0;
+	u64 num_devs;
+
+	/*
+	 * Needed because we can end up allocating a system chunk and for an
+	 * atomic and race free space reservation in the chunk block reserve.
+	 */
+	ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
 
 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	spin_lock(&info->lock);
 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
-		info->bytes_reserved - info->bytes_readonly;
+		info->bytes_reserved - info->bytes_readonly -
+		info->bytes_may_use;
 	spin_unlock(&info->lock);
 
-	thresh = get_system_chunk_thresh(root, type);
+	num_devs = get_profile_num_devs(root, type);
+
+	/* num_devs device items to update and 1 chunk item to add or remove */
+	thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+		btrfs_calc_trans_metadata_size(root, 1);
+
 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
 		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
 			left, thresh, type);
@@ -4130,7 +4108,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 		u64 flags;
 
 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
-		btrfs_alloc_chunk(trans, root, flags);
+		/*
+		 * Ignore failure to create system chunk. We might end up not
+		 * needing it, as we might not need to COW all nodes/leafs from
+		 * the paths we visit in the chunk tree (they were already COWed
+		 * or created in the current transaction for example).
+		 */
+		ret = btrfs_alloc_chunk(trans, root, flags);
+	}
+
+	if (!ret) {
+		ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+					  &root->fs_info->chunk_block_rsv,
+					  thresh, BTRFS_RESERVE_NO_FLUSH);
+		if (!ret)
+			trans->chunk_bytes_reserved += thresh;
 	}
 }
 
@@ -4235,6 +4227,24 @@ out:
 	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
 	mutex_unlock(&fs_info->chunk_mutex);
+	/*
+	 * When we allocate a new chunk we reserve space in the chunk block
+	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
+	 * add new nodes/leafs to it if we end up needing to do it when
+	 * inserting the chunk item and updating device items as part of the
+	 * second phase of chunk allocation, performed by
+	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+	 * large number of new block groups to create in our transaction
+	 * handle's new_bgs list to avoid exhausting the chunk block reserve
+	 * in extreme cases - like having a single transaction create many new
+	 * block groups when starting to write out the free space caches of all
+	 * the block groups that were made dirty during the lifetime of the
+	 * transaction.
+	 */
+	if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+		btrfs_create_pending_block_groups(trans, trans->root);
+		btrfs_trans_release_chunk_metadata(trans);
+	}
 	return ret;
 }
 
@@ -5188,6 +5198,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 	trans->bytes_reserved = 0;
 }
 
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+	if (!trans->chunk_bytes_reserved)
+		return;
+
+	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+				trans->chunk_bytes_reserved);
+	trans->chunk_bytes_reserved = 0;
+}
+
 /* Can only return 0 or -ENOSPC */
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode)
@@ -6092,11 +6120,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 parent,
+				struct btrfs_delayed_ref_node *node, u64 parent,
 				u64 root_objectid, u64 owner_objectid,
 				u64 owner_offset, int refs_to_drop,
-				struct btrfs_delayed_extent_op *extent_op,
-				int no_quota)
+				struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -6110,10 +6137,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int extent_slot = 0;
 	int found_extent = 0;
 	int num_to_del = 1;
+	int no_quota = node->no_quota;
 	u32 item_size;
 	u64 refs;
+	u64 bytenr = node->bytenr;
+	u64 num_bytes = node->num_bytes;
 	int last_ref = 0;
-	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
 						 SKINNY_METADATA);
 
@@ -6294,7 +6323,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	refs -= refs_to_drop;
 
 	if (refs > 0) {
-		type = BTRFS_QGROUP_OPER_SUB_SHARED;
 		if (extent_op)
 			__run_delayed_extent_op(extent_op, leaf, ei);
 		/*
@@ -6356,18 +6384,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	/* Deal with the quota accounting */
-	if (!ret && last_ref && !no_quota) {
-		int mod_seq = 0;
-
-		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-		    type == BTRFS_QGROUP_OPER_SUB_SHARED)
-			mod_seq = 1;
-
-		ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
-					      bytenr, num_bytes, type,
-					      mod_seq);
-	}
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -6393,7 +6409,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 		goto out_delayed_unlock;
 
 	spin_lock(&head->lock);
-	if (rb_first(&head->ref_root))
+	if (!list_empty(&head->ref_list))
 		goto out;
 
 	if (head->extent_op) {
@@ -7303,13 +7319,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	/* Always set parent to 0 here since its exclusive anyway. */
-	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-				      ins->objectid, ins->offset,
-				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-	if (ret)
-		return ret;
-
 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7391,14 +7400,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
-	if (!no_quota) {
-		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-					      ins->objectid, num_bytes,
-					      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-		if (ret)
-			return ret;
-	}
-
 	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
 				 1);
 	if (ret) { /* -ENOENT, logic error */
@@ -7755,12 +7756,18 @@ reada:
 	wc->reada_slot = slot;
 }
 
+/*
+ * TODO: Modify related function to add related node/leaf to dirty_extent_root,
+ * for later qgroup accounting.
+ *
+ * Current, this function does nothing.
+ */
 static int account_leaf_items(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct extent_buffer *eb)
 {
 	int nr = btrfs_header_nritems(eb);
-	int i, extent_type, ret;
+	int i, extent_type;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	u64 bytenr, num_bytes;
@@ -7783,13 +7790,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
 			continue;
 
 		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
-		ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-					      root->objectid,
-					      bytenr, num_bytes,
-					      BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
-		if (ret)
-			return ret;
 	}
 	return 0;
 }
@@ -7858,6 +7858,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to mark all (including complete shared node)
+ * to dirty_extent_root to allow it get accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
@@ -7920,7 +7922,11 @@ walk_down:
 			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
 
 			eb = read_tree_block(root, child_bytenr, child_gen);
-			if (!eb || !extent_buffer_uptodate(eb)) {
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+				goto out;
+			} else if (!extent_buffer_uptodate(eb)) {
+				free_extent_buffer(eb);
 				ret = -EIO;
 				goto out;
 			}
@@ -7931,16 +7937,6 @@ walk_down:
 			btrfs_tree_read_lock(eb);
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
-			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-						root->objectid,
-						child_bytenr,
-						root->nodesize,
-						BTRFS_QGROUP_OPER_SUB_SUBTREE,
-						0);
-			if (ret)
-				goto out;
-
 		}
 
 		if (level == 0) {
@@ -8151,7 +8147,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
 		next = read_tree_block(root, bytenr, generation);
-		if (!next || !extent_buffer_uptodate(next)) {
+		if (IS_ERR(next)) {
+			return PTR_ERR(next);
+		} else if (!extent_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			return -EIO;
 		}
@@ -8533,24 +8531,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 				goto out_end_trans;
 			}
 
-			/*
-			 * Qgroup update accounting is run from
-			 * delayed ref handling. This usually works
-			 * out because delayed refs are normally the
-			 * only way qgroup updates are added. However,
-			 * we may have added updates during our tree
-			 * walk so run qgroups here to make sure we
-			 * don't lose any updates.
-			 */
-			ret = btrfs_delayed_qgroup_accounting(trans,
-							      root->fs_info);
-			if (ret)
-				printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-						   "running qgroup updates "
-						   "during snapshot delete. "
-						   "Quota is out of sync, "
-						   "rescan required.\n", ret);
-
 			btrfs_end_transaction_throttle(trans, tree_root);
 			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
 				pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8604,14 +8584,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	}
 	root_dropped = true;
 out_end_trans:
-	ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
-	if (ret)
-		printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-				   "running qgroup updates "
-				   "during snapshot delete. "
-				   "Quota is out of sync, "
-				   "rescan required.\n", ret);
-
 	btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
 	kfree(wc);
@@ -9562,6 +9534,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	free_excluded_extents(root, cache);
 
+	/*
+	 * Call to ensure the corresponding space_info object is created and
+	 * assigned to our block group, but don't update its counters just yet.
+	 * We want our bg to be added to the rbtree with its ->space_info set.
+	 */
+	ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+				&cache->space_info);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	if (ret) {
 		btrfs_remove_free_space_cache(cache);
@@ -9569,6 +9554,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
+	/*
+	 * Now that our block group has its ->space_info set and is inserted in
+	 * the rbtree, update the space info's counters.
+	 */
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	if (ret) {
@@ -9931,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		}
 		spin_unlock(&fs_info->unused_bgs_lock);
 
+		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+
 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
 		spin_lock(&block_group->lock);
@@ -10025,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 end_trans:
 		btrfs_end_transaction(trans, root);
 next:
+		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 		btrfs_put_block_group(block_group);
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c32d226bf..02d05817c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      unsigned bits, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+	int wake = 0;
+
+	if (bits & EXTENT_LOCKED)
+		wake = 1;
+
+	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
@@ -2767,8 +2772,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	else
 		btrfsic_submit_bio(rw, bio);
 
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
 	bio_put(bio);
 	return ret;
 }
@@ -4492,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
 
 		free_extent_map(em);
 		em = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e1747..b823fac91 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		goto out;
@@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct btrfs_log_ctx ctx;
 	int ret = 0;
 	bool full_sync = 0;
+	const u64 len = end - start + 1;
 
 	trace_btrfs_sync_file(file, datasync);
 
@@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		 * all extents are persisted and the respective file extent
 		 * items are in the fs/subvol btree.
 		 */
-		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+		ret = btrfs_wait_ordered_range(inode, start, len);
 	} else {
 		/*
 		 * Start any new ordered operations before starting to log the
@@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	smp_mb();
 	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-	    (full_sync && BTRFS_I(inode)->last_trans <=
-	     root->fs_info->last_trans_committed)) {
+	    (BTRFS_I(inode)->last_trans <=
+	     root->fs_info->last_trans_committed &&
+	     (full_sync ||
+	      !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
 		/*
 		 * We'v had everything committed since the last time we were
 		 * modified so clear this flag in case it was set for whatever
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9dbe5b548..fb5a6b1c6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 {
 	int ret = 0;
 	struct btrfs_path *path = btrfs_alloc_path();
+	bool locked = false;
 
 	if (!path) {
 		ret = -ENOMEM;
@@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	}
 
 	if (block_group) {
+		locked = true;
 		mutex_lock(&trans->transaction->cache_write_mutex);
 		if (!list_empty(&block_group->io_list)) {
 			list_del_init(&block_group->io_list);
@@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
-	if (ret) {
-		mutex_unlock(&trans->transaction->cache_write_mutex);
-		btrfs_abort_transaction(trans, root, ret);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = btrfs_update_inode(trans, root, inode);
 
-	if (block_group)
-		mutex_unlock(&trans->transaction->cache_write_mutex);
-
 fail:
+	if (locked)
+		mutex_unlock(&trans->transaction->cache_write_mutex);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bb013672..e33dff356 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4209,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 last_size = (u64)-1;
+	u64 last_size = new_size;
 	u32 found_type = (u8)-1;
 	int found_extent;
 	int del_item;
@@ -4493,8 +4493,7 @@ out:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 error:
-	if (last_size != (u64)-1 &&
-	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		btrfs_ordered_update_i_size(inode, last_size, NULL);
 
 	btrfs_free_path(path);
@@ -4986,24 +4985,41 @@ static void evict_inode_truncate_pages(struct inode *inode)
 	}
 	write_unlock(&map_tree->lock);
 
+	/*
+	 * Keep looping until we have no more ranges in the io tree.
+	 * We can have ongoing bios started by readpages (called from readahead)
+	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+	 * still in progress (unlocked the pages in the bio but did not yet
+	 * unlocked the ranges in the io tree). Therefore this means some
+	 * ranges can still be locked and eviction started because before
+	 * submitting those bios, which are executed by a separate task (work
+	 * queue kthread), inode references (inode->i_count) were not taken
+	 * (which would be dropped in the end io callback of each bio).
+	 * Therefore here we effectively end up waiting for those bios and
+	 * anyone else holding locked ranges without having bumped the inode's
+	 * reference count - if we don't do it, when they access the inode's
+	 * io_tree to unlock a range it may be too late, leading to an
+	 * use-after-free issue.
+	 */
 	spin_lock(&io_tree->lock);
 	while (!RB_EMPTY_ROOT(&io_tree->state)) {
 		struct extent_state *state;
 		struct extent_state *cached_state = NULL;
+		u64 start;
+		u64 end;
 
 		node = rb_first(&io_tree->state);
 		state = rb_entry(node, struct extent_state, rb_node);
-		atomic_inc(&state->refs);
+		start = state->start;
+		end = state->end;
 		spin_unlock(&io_tree->lock);
 
-		lock_extent_bits(io_tree, state->start, state->end,
-				 0, &cached_state);
-		clear_extent_bit(io_tree, state->start, state->end,
+		lock_extent_bits(io_tree, start, end, 0, &cached_state);
+		clear_extent_bit(io_tree, start, end,
 				 EXTENT_LOCKED | EXTENT_DIRTY |
 				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
 				 EXTENT_DEFRAG, 1, 1,
 				 &cached_state, GFP_NOFS);
-		free_extent_state(state);
 
 		cond_resched();
 		spin_lock(&io_tree->lock);
@@ -7530,6 +7546,7 @@ unlock:
 
 		current->journal_info = outstanding_extents;
 		btrfs_free_reserved_data_space(inode, len);
+		set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
 	}
 
 	/*
@@ -7855,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
 	struct bio *dio_bio;
 	int ret;
 
-	if (err)
-		goto out_done;
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
@@ -7879,7 +7894,6 @@ out_test:
 		ordered = NULL;
 		goto again;
 	}
-out_done:
 	dio_bio = dip->dio_bio;
 
 	kfree(dip);
@@ -8147,9 +8161,8 @@ out_err:
 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 				struct inode *inode, loff_t file_offset)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_dio_private *dip;
-	struct bio *io_bio;
+	struct btrfs_dio_private *dip = NULL;
+	struct bio *io_bio = NULL;
 	struct btrfs_io_bio *btrfs_bio;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
@@ -8166,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 	dip = kzalloc(sizeof(*dip), GFP_NOFS);
 	if (!dip) {
 		ret = -ENOMEM;
-		goto free_io_bio;
+		goto free_ordered;
 	}
 
 	dip->private = dio_bio->bi_private;
@@ -8194,25 +8207,55 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 
 	if (btrfs_bio->end_io)
 		btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
-	bio_put(io_bio);
 
 free_ordered:
 	/*
-	 * If this is a write, we need to clean up the reserved space and kill
-	 * the ordered extent.
+	 * If we arrived here it means either we failed to submit the dip
+	 * or we either failed to clone the dio_bio or failed to allocate the
+	 * dip. If we cloned the dio_bio and allocated the dip, we can just
+	 * call bio_endio against our io_bio so that we get proper resource
+	 * cleanup if we fail to submit the dip, otherwise, we must do the
+	 * same as btrfs_endio_direct_[write|read] because we can't call these
+	 * callbacks - they require an allocated dip and a clone of dio_bio.
 	 */
-	if (write) {
-		struct btrfs_ordered_extent *ordered;
-		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
-			btrfs_free_reserved_extent(root, ordered->start,
-						   ordered->disk_len, 1);
-		btrfs_put_ordered_extent(ordered);
-		btrfs_put_ordered_extent(ordered);
+	if (io_bio && dip) {
+		bio_endio(io_bio, ret);
+		/*
+		 * The end io callbacks free our dip, do the final put on io_bio
+		 * and all the cleanup and final put for dio_bio (through
+		 * dio_end_io()).
+		 */
+		dip = NULL;
+		io_bio = NULL;
+	} else {
+		if (write) {
+			struct btrfs_ordered_extent *ordered;
+
+			ordered = btrfs_lookup_ordered_extent(inode,
+							      file_offset);
+			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+			/*
+			 * Decrements our ref on the ordered extent and removes
+			 * the ordered extent from the inode's ordered tree,
+			 * doing all the proper resource cleanup such as for the
+			 * reserved space and waking up any waiters for this
+			 * ordered extent (through btrfs_remove_ordered_extent).
+			 */
+			btrfs_finish_ordered_io(ordered);
+		} else {
+			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+			      file_offset + dio_bio->bi_iter.bi_size - 1);
+		}
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+		/*
+		 * Releases and cleans up our dio_bio, no need to bio_put()
+		 * nor bio_endio()/bio_io_error() against dio_bio.
+		 */
+		dio_end_io(dio_bio, ret);
 	}
-	bio_endio(dio_bio, ret);
+	if (io_bio)
+		bio_put(io_bio);
+	kfree(dip);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
@@ -8314,9 +8357,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 				   btrfs_submit_direct, flags);
 	if (iov_iter_rw(iter) == WRITE) {
 		current->journal_info = NULL;
-		if (ret < 0 && ret != -EIOCBQUEUED)
-			btrfs_delalloc_release_space(inode, count);
-		else if (ret >= 0 && (size_t)ret < count)
+		if (ret < 0 && ret != -EIOCBQUEUED) {
+			/*
+			 * If the error comes from submitting stage,
+			 * btrfs_get_blocsk_direct() has free'd data space,
+			 * and metadata space will be handled by
+			 * finish_ordered_fn, don't do that again to make
+			 * sure bytes_may_use is correct.
+			 */
+			if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
+				     &BTRFS_I(inode)->runtime_flags))
+				btrfs_delalloc_release_space(inode, count);
+		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode,
 						     count - (size_t)ret);
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 37d456a9a..0770c9158 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
 
 
 static int btrfs_clone(struct inode *src, struct inode *inode,
-		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+		       int no_time_update);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -553,8 +554,8 @@ static noinline int create_subvol(struct inode *dir,
 	key.offset = (u64)-1;
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 	if (IS_ERR(new_root)) {
-		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
 		ret = PTR_ERR(new_root);
+		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
 	}
 
@@ -1318,7 +1319,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index + 1;
+		max_to_defrag = last_index - i + 1;
 
 	/*
 	 * make writeback starts from i, so the defrag range can be
@@ -1368,7 +1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			ra_index = max(i, ra_index);
 			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
 				       cluster);
-			ra_index += max_cluster;
+			ra_index += cluster;
 		}
 
 		mutex_lock(&inode->i_mutex);
@@ -2271,10 +2272,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 {
 	 struct btrfs_ioctl_ino_lookup_args *args;
 	 struct inode *inode;
-	 int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	int ret = 0;
 
 	args = memdup_user(argp, sizeof(*args));
 	if (IS_ERR(args))
@@ -2282,13 +2280,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 
 	inode = file_inode(file);
 
+	/*
+	 * Unprivileged query to obtain the containing subvolume root id. The
+	 * path is reset so it's consistent with btrfs_search_path_in_tree.
+	 */
 	if (args->treeid == 0)
 		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
 
+	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+		args->name[0] = 0;
+		goto out;
+	}
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
 					args->treeid, args->objectid,
 					args->name);
 
+out:
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
 		ret = -EFAULT;
 
@@ -2753,14 +2766,11 @@ out:
 	return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
 	struct page *page;
-	pgoff_t index;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
-	index = off >> PAGE_CACHE_SHIFT;
-
 	page = grab_cache_page(inode->i_mapping, index);
 	if (!page)
 		return NULL;
@@ -2781,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off)
 	return page;
 }
 
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+			       int num_pages, u64 off)
+{
+	int i;
+	pgoff_t index = off >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = extent_same_get_page(inode, index + i);
+		if (!pages[i])
+			return -ENOMEM;
+	}
+	return 0;
+}
+
 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 {
 	/* do any pending delalloc/csum calc on src, one way or
@@ -2806,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 	}
 }
 
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
-				struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
 {
-	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
-	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-
 	mutex_unlock(&inode1->i_mutex);
 	mutex_unlock(&inode2->i_mutex);
 }
 
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
-			      struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1 < inode2)
+		swap(inode1, inode2);
+
+	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+	if (inode1 != inode2)
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+				      struct inode *inode2, u64 loff2, u64 len)
+{
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+				     struct inode *inode2, u64 loff2, u64 len)
 {
 	if (inode1 < inode2) {
 		swap(inode1, inode2);
 		swap(loff1, loff2);
 	}
-
-	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
 	lock_extent_range(inode1, loff1, len);
-	if (inode1 != inode2) {
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+	if (inode1 != inode2)
 		lock_extent_range(inode2, loff2, len);
+}
+
+struct cmp_pages {
+	int		num_pages;
+	struct page	**src_pages;
+	struct page	**dst_pages;
+};
+
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+	int i;
+	struct page *pg;
+
+	for (i = 0; i < cmp->num_pages; i++) {
+		pg = cmp->src_pages[i];
+		if (pg)
+			page_cache_release(pg);
+		pg = cmp->dst_pages[i];
+		if (pg)
+			page_cache_release(pg);
+	}
+	kfree(cmp->src_pages);
+	kfree(cmp->dst_pages);
+}
+
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+				  struct inode *dst, u64 dst_loff,
+				  u64 len, struct cmp_pages *cmp)
+{
+	int ret;
+	int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+	struct page **src_pgarr, **dst_pgarr;
+
+	/*
+	 * We must gather up all the pages before we initiate our
+	 * extent locking. We use an array for the page pointers. Size
+	 * of the array is bounded by len, which is in turn bounded by
+	 * BTRFS_MAX_DEDUPE_LEN.
+	 */
+	src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+	dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+	if (!src_pgarr || !dst_pgarr) {
+		kfree(src_pgarr);
+		kfree(dst_pgarr);
+		return -ENOMEM;
 	}
+	cmp->num_pages = num_pages;
+	cmp->src_pages = src_pgarr;
+	cmp->dst_pages = dst_pgarr;
+
+	ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
+	if (ret)
+		goto out;
+
+	ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
+
+out:
+	if (ret)
+		btrfs_cmp_data_free(cmp);
+	return 0;
 }
 
 static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
-			  u64 dst_loff, u64 len)
+			  u64 dst_loff, u64 len, struct cmp_pages *cmp)
 {
 	int ret = 0;
+	int i;
 	struct page *src_page, *dst_page;
 	unsigned int cmp_len = PAGE_CACHE_SIZE;
 	void *addr, *dst_addr;
 
+	i = 0;
 	while (len) {
 		if (len < PAGE_CACHE_SIZE)
 			cmp_len = len;
 
-		src_page = extent_same_get_page(src, loff);
-		if (!src_page)
-			return -EINVAL;
-		dst_page = extent_same_get_page(dst, dst_loff);
-		if (!dst_page) {
-			page_cache_release(src_page);
-			return -EINVAL;
-		}
+		BUG_ON(i >= cmp->num_pages);
+
+		src_page = cmp->src_pages[i];
+		dst_page = cmp->dst_pages[i];
+
 		addr = kmap_atomic(src_page);
 		dst_addr = kmap_atomic(dst_page);
 
@@ -2863,26 +2955,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 
 		kunmap_atomic(addr);
 		kunmap_atomic(dst_addr);
-		page_cache_release(src_page);
-		page_cache_release(dst_page);
 
 		if (ret)
 			break;
 
-		loff += cmp_len;
-		dst_loff += cmp_len;
 		len -= cmp_len;
+		i++;
 	}
 
 	return ret;
 }
 
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+				     u64 olen)
 {
+	u64 len = *plen;
 	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
 
-	if (off + len > inode->i_size || off + len < off)
+	if (off + olen > inode->i_size || off + olen < off)
 		return -EINVAL;
+
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == inode->i_size)
+		*plen = len = ALIGN(inode->i_size, bs) - off;
+
 	/* Check that we are block aligned - btrfs_clone() requires this */
 	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
 		return -EINVAL;
@@ -2890,31 +2986,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
 	return 0;
 }
 
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 			     struct inode *dst, u64 dst_loff)
 {
 	int ret;
+	u64 len = olen;
+	struct cmp_pages cmp;
+	int same_inode = 0;
+	u64 same_lock_start = 0;
+	u64 same_lock_len = 0;
 
-	/*
-	 * btrfs_clone() can't handle extents in the same file
-	 * yet. Once that works, we can drop this check and replace it
-	 * with a check for the same inode, but overlapping extents.
-	 */
 	if (src == dst)
-		return -EINVAL;
+		same_inode = 1;
 
 	if (len == 0)
 		return 0;
 
-	btrfs_double_lock(src, loff, dst, dst_loff, len);
+	if (same_inode) {
+		mutex_lock(&src->i_mutex);
 
-	ret = extent_same_check_offsets(src, loff, len);
-	if (ret)
-		goto out_unlock;
+		ret = extent_same_check_offsets(src, loff, &len, olen);
+		if (ret)
+			goto out_unlock;
 
-	ret = extent_same_check_offsets(dst, dst_loff, len);
-	if (ret)
-		goto out_unlock;
+		/*
+		 * Single inode case wants the same checks, except we
+		 * don't want our length pushed out past i_size as
+		 * comparing that data range makes no sense.
+		 *
+		 * extent_same_check_offsets() will do this for an
+		 * unaligned length at i_size, so catch it here and
+		 * reject the request.
+		 *
+		 * This effectively means we require aligned extents
+		 * for the single-inode case, whereas the other cases
+		 * allow an unaligned length so long as it ends at
+		 * i_size.
+		 */
+		if (len != olen) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* Check for overlapping ranges */
+		if (dst_loff + len > loff && dst_loff < loff + len) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		same_lock_start = min_t(u64, loff, dst_loff);
+		same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+	} else {
+		btrfs_double_inode_lock(src, dst);
+
+		ret = extent_same_check_offsets(src, loff, &len, olen);
+		if (ret)
+			goto out_unlock;
+
+		ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
+		if (ret)
+			goto out_unlock;
+	}
 
 	/* don't make the dst file partly checksummed */
 	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
@@ -2923,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
 		goto out_unlock;
 	}
 
-	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+	if (ret)
+		goto out_unlock;
+
+	if (same_inode)
+		lock_extent_range(src, same_lock_start, same_lock_len);
+	else
+		btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+
+	/* pass original length for comparison so we stay within i_size */
+	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
 	if (ret == 0)
-		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
 
+	if (same_inode)
+		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
+			      same_lock_start + same_lock_len - 1);
+	else
+		btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+	btrfs_cmp_data_free(&cmp);
 out_unlock:
-	btrfs_double_unlock(src, loff, dst, dst_loff, len);
+	if (same_inode)
+		mutex_unlock(&src->i_mutex);
+	else
+		btrfs_double_inode_unlock(src, dst);
 
 	return ret;
 }
@@ -3082,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     u64 endoff,
 				     const u64 destoff,
-				     const u64 olen)
+				     const u64 olen,
+				     int no_time_update)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	if (!no_time_update)
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	/*
 	 * We round up to the block size at eof when determining which
 	 * extents to clone above, but shouldn't round up the file size.
@@ -3173,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode,
  * @inode: Inode to clone to
  * @off: Offset within source to start clone from
  * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- *               identical values here
+ * @olen_aligned: Block-aligned value of olen
  * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
  */
 static int btrfs_clone(struct inode *src, struct inode *inode,
 		       const u64 off, const u64 olen, const u64 olen_aligned,
-		       const u64 destoff)
+		       const u64 destoff, int no_time_update)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path = NULL;
@@ -3517,7 +3671,8 @@ process_slot:
 					      root->sectorsize);
 			ret = clone_finish_inode_update(trans, inode,
 							last_dest_end,
-							destoff, olen);
+							destoff, olen,
+							no_time_update);
 			if (ret)
 				goto out;
 			if (new_key.offset + datal >= destoff + len)
@@ -3555,7 +3710,7 @@ process_slot:
 		clone_update_extent_map(inode, trans, NULL, last_dest_end,
 					destoff + len - last_dest_end);
 		ret = clone_finish_inode_update(trans, inode, destoff + len,
-						destoff, olen);
+						destoff, olen, no_time_update);
 	}
 
 out:
@@ -3692,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		lock_extent_range(inode, destoff, len);
 	}
 
-	ret = btrfs_clone(src, inode, off, olen, len, destoff);
+	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 
 	if (same_inode) {
 		u64 lock_start = min_t(u64, off, destoff);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 760c4a5e0..52170cf17 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
-	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
-	    !(type == BTRFS_ORDERED_NOCOW))
-		entry->csum_bytes_left = disk_len;
 	entry->disk_len = disk_len;
 	entry->bytes_left = len;
 	entry->inode = igrab(inode);
@@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode,
 	tree = &BTRFS_I(inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
-	WARN_ON(entry->csum_bytes_left < sum->len);
-	entry->csum_bytes_left -= sum->len;
-	if (entry->csum_bytes_left == 0)
-		wake_up(&entry->wait);
 	spin_unlock_irq(&tree->lock);
 }
 
@@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
 
-		list_add_tail(&ordered->trans_list, &trans->ordered);
+		/*
+		 * If our ordered extent completed it means it updated the
+		 * fs/subvol and csum trees already, so no need to make the
+		 * current transaction's commit wait for it, as we end up
+		 * holding memory unnecessarily and delaying the inode's iput
+		 * until the transaction commit (we schedule an iput for the
+		 * inode when the ordered extent's refcount drops to 0), which
+		 * prevents it from being evictable until the transaction
+		 * commits.
+		 */
+		if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
+			btrfs_put_ordered_extent(ordered);
+		else
+			list_add_tail(&ordered->trans_list, &trans->ordered);
+
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
@@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 	trace_btrfs_ordered_extent_put(entry->inode, entry);
 
 	if (atomic_dec_and_test(&entry->refs)) {
+		ASSERT(list_empty(&entry->log_list));
+		ASSERT(list_empty(&entry->trans_list));
+		ASSERT(list_empty(&entry->root_extent_list));
+		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
 		if (entry->inode)
 			btrfs_add_delayed_iput(entry->inode);
 		while (!list_empty(&entry->list)) {
@@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	spin_lock_irq(&tree->lock);
 	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
+	RB_CLEAR_NODE(node);
 	if (tree->last == node)
 		tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
@@ -844,6 +856,20 @@ out:
 	return entry;
 }
 
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+					 u64 file_offset,
+					 u64 len)
+{
+	struct btrfs_ordered_extent *oe;
+
+	oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+	if (oe) {
+		btrfs_put_ordered_extent(oe);
+		return true;
+	}
+	return false;
+}
+
 /*
  * lookup and return any extent before 'file_offset'.  NULL is returned
  * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e96cd4ccd..7176cc0fe 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,9 +89,6 @@ struct btrfs_ordered_extent {
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
-	/* number of bytes that still need csumming */
-	u64 csum_bytes_left;
-
 	/*
 	 * the end of the ordered extent which is behind it but
 	 * didn't update disk_i_size. Please see the comment of
@@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 							u64 file_offset,
 							u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+					 u64 file_offset,
+					 u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3d6546581..8a8202956 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -34,6 +34,7 @@
 #include "extent_io.h"
 #include "qgroup.h"
 
+
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
  *  - reorganize keys
@@ -84,11 +85,42 @@ struct btrfs_qgroup {
 
 	/*
 	 * temp variables for accounting operations
+	 * Refer to qgroup_shared_accouting() for details.
 	 */
 	u64 old_refcnt;
 	u64 new_refcnt;
 };
 
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+					   int mod)
+{
+	if (qg->old_refcnt < seq)
+		qg->old_refcnt = seq;
+	qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+					   int mod)
+{
+	if (qg->new_refcnt < seq)
+		qg->new_refcnt = seq;
+	qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+	if (qg->old_refcnt < seq)
+		return 0;
+	return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+	if (qg->new_refcnt < seq)
+		return 0;
+	return qg->new_refcnt - seq;
+}
+
 /*
  * glue structure to represent the relations between qgroups.
  */
@@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 	struct ulist *tmp;
 	int ret = 0;
 
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
-
 	/* Check the level of src and dst first */
 	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 		return -EINVAL;
 
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
 	if (!quota_root) {
@@ -1317,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
 	int ret = 0;
+	/* Sometimes we would want to clear the limit on this qgroup.
+	 * To meet this requirement, we treat the -1 as a special value
+	 * which tell kernel to clear the limit on this qgroup.
+	 */
+	const u64 CLEAR_VALUE = -1;
 
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
@@ -1332,14 +1369,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
-	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
-		qgroup->max_rfer = limit->max_rfer;
-	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
-		qgroup->max_excl = limit->max_excl;
-	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
-		qgroup->rsv_rfer = limit->rsv_rfer;
-	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
-		qgroup->rsv_excl = limit->rsv_excl;
+	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
+		if (limit->max_rfer == CLEAR_VALUE) {
+			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+			qgroup->max_rfer = 0;
+		} else {
+			qgroup->max_rfer = limit->max_rfer;
+		}
+	}
+	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+		if (limit->max_excl == CLEAR_VALUE) {
+			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+			qgroup->max_excl = 0;
+		} else {
+			qgroup->max_excl = limit->max_excl;
+		}
+	}
+	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
+		if (limit->rsv_rfer == CLEAR_VALUE) {
+			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+			qgroup->rsv_rfer = 0;
+		} else {
+			qgroup->rsv_rfer = limit->rsv_rfer;
+		}
+	}
+	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
+		if (limit->rsv_excl == CLEAR_VALUE) {
+			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+			qgroup->rsv_excl = 0;
+		} else {
+			qgroup->rsv_excl = limit->rsv_excl;
+		}
+	}
 	qgroup->lim_flags |= limit->flags;
 
 	spin_unlock(&fs_info->qgroup_lock);
@@ -1356,239 +1421,86 @@ out:
 	return ret;
 }
 
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
-			   struct btrfs_qgroup_operation *oper2)
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
 {
-	/*
-	 * Ignore seq and type here, we're looking for any operation
-	 * at all related to this extent on that root.
-	 */
-	if (oper1->bytenr < oper2->bytenr)
-		return -1;
-	if (oper1->bytenr > oper2->bytenr)
-		return 1;
-	if (oper1->ref_root < oper2->ref_root)
-		return -1;
-	if (oper1->ref_root > oper2->ref_root)
-		return 1;
-	return 0;
-}
+	struct btrfs_qgroup_extent_record *record;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *node;
+	u64 qgroup_to_skip;
+	int ret = 0;
 
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
-			      struct btrfs_qgroup_operation *oper)
-{
-	struct rb_node *n;
-	struct btrfs_qgroup_operation *cur;
-	int cmp;
+	delayed_refs = &trans->transaction->delayed_refs;
+	qgroup_to_skip = delayed_refs->qgroup_to_skip;
 
-	spin_lock(&fs_info->qgroup_op_lock);
-	n = fs_info->qgroup_op_tree.rb_node;
-	while (n) {
-		cur = rb_entry(n, struct btrfs_qgroup_operation, n);
-		cmp = comp_oper_exist(cur, oper);
-		if (cmp < 0) {
-			n = n->rb_right;
-		} else if (cmp) {
-			n = n->rb_left;
-		} else {
-			spin_unlock(&fs_info->qgroup_op_lock);
-			return -EEXIST;
-		}
+	/*
+	 * No need to do lock, since this function will only be called in
+	 * btrfs_commmit_transaction().
+	 */
+	node = rb_first(&delayed_refs->dirty_extent_root);
+	while (node) {
+		record = rb_entry(node, struct btrfs_qgroup_extent_record,
+				  node);
+		ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+					   &record->old_roots);
+		if (ret < 0)
+			break;
+		if (qgroup_to_skip)
+			ulist_del(record->old_roots, qgroup_to_skip, 0);
+		node = rb_next(node);
 	}
-	spin_unlock(&fs_info->qgroup_op_lock);
-	return 0;
-}
-
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
-		     struct btrfs_qgroup_operation *oper2)
-{
-	if (oper1->bytenr < oper2->bytenr)
-		return -1;
-	if (oper1->bytenr > oper2->bytenr)
-		return 1;
-	if (oper1->ref_root < oper2->ref_root)
-		return -1;
-	if (oper1->ref_root > oper2->ref_root)
-		return 1;
-	if (oper1->seq < oper2->seq)
-		return -1;
-	if (oper1->seq > oper2->seq)
-		return 1;
-	if (oper1->type < oper2->type)
-		return -1;
-	if (oper1->type > oper2->type)
-		return 1;
-	return 0;
+	return ret;
 }
 
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
-			      struct btrfs_qgroup_operation *oper)
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+				  struct btrfs_qgroup_extent_record *record)
 {
-	struct rb_node **p;
-	struct rb_node *parent = NULL;
-	struct btrfs_qgroup_operation *cur;
-	int cmp;
+	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+	struct rb_node *parent_node = NULL;
+	struct btrfs_qgroup_extent_record *entry;
+	u64 bytenr = record->bytenr;
 
-	spin_lock(&fs_info->qgroup_op_lock);
-	p = &fs_info->qgroup_op_tree.rb_node;
 	while (*p) {
-		parent = *p;
-		cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
-		cmp = comp_oper(cur, oper);
-		if (cmp < 0) {
-			p = &(*p)->rb_right;
-		} else if (cmp) {
+		parent_node = *p;
+		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+				 node);
+		if (bytenr < entry->bytenr)
 			p = &(*p)->rb_left;
-		} else {
-			spin_unlock(&fs_info->qgroup_op_lock);
-			return -EEXIST;
-		}
-	}
-	rb_link_node(&oper->n, parent, p);
-	rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
-	spin_unlock(&fs_info->qgroup_op_lock);
-	return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point.  If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info, u64 ref_root,
-			    u64 bytenr, u64 num_bytes,
-			    enum btrfs_qgroup_operation_type type, int mod_seq)
-{
-	struct btrfs_qgroup_operation *oper;
-	int ret;
-
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		return 0;
-
-	oper = kmalloc(sizeof(*oper), GFP_NOFS);
-	if (!oper)
-		return -ENOMEM;
-
-	oper->ref_root = ref_root;
-	oper->bytenr = bytenr;
-	oper->num_bytes = num_bytes;
-	oper->type = type;
-	oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
-	INIT_LIST_HEAD(&oper->elem.list);
-	oper->elem.seq = 0;
-
-	trace_btrfs_qgroup_record_ref(oper);
-
-	if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
-		/*
-		 * If any operation for this bytenr/ref_root combo
-		 * exists, then we know it's not exclusively owned and
-		 * shouldn't be queued up.
-		 *
-		 * This also catches the case where we have a cloned
-		 * extent that gets queued up multiple times during
-		 * drop snapshot.
-		 */
-		if (qgroup_oper_exists(fs_info, oper)) {
-			kfree(oper);
-			return 0;
-		}
-	}
-
-	ret = insert_qgroup_oper(fs_info, oper);
-	if (ret) {
-		/* Shouldn't happen so have an assert for developers */
-		ASSERT(0);
-		kfree(oper);
-		return ret;
+		else if (bytenr > entry->bytenr)
+			p = &(*p)->rb_right;
+		else
+			return entry;
 	}
-	list_add_tail(&oper->list, &trans->qgroup_ref_list);
-
-	if (mod_seq)
-		btrfs_get_tree_mod_seq(fs_info, &oper->elem);
-
-	return 0;
-}
-
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
-				  struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *tmp;
-	int sign = 0;
-	int ret = 0;
 
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
-
-	spin_lock(&fs_info->qgroup_lock);
-	if (!fs_info->quota_root)
-		goto out;
-
-	switch (oper->type) {
-	case BTRFS_QGROUP_OPER_ADD_EXCL:
-		sign = 1;
-		break;
-	case BTRFS_QGROUP_OPER_SUB_EXCL:
-		sign = -1;
-		break;
-	default:
-		ASSERT(0);
-	}
-	ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
-				       oper->num_bytes, sign);
-out:
-	spin_unlock(&fs_info->qgroup_lock);
-	ulist_free(tmp);
-	return ret;
+	rb_link_node(&record->node, parent_node, p);
+	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+	return NULL;
 }
 
+#define UPDATE_NEW	0
+#define UPDATE_OLD	1
 /*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
+ * Walk all of the roots that points to the bytenr and adjust their refcnts.
  */
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
-				  u64 root_to_skip, struct ulist *tmp,
-				  struct ulist *roots, struct ulist *qgroups,
-				  u64 seq, int *old_roots, int rescan)
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+				struct ulist *roots, struct ulist *tmp,
+				struct ulist *qgroups, u64 seq, int update_old)
 {
 	struct ulist_node *unode;
 	struct ulist_iterator uiter;
 	struct ulist_node *tmp_unode;
 	struct ulist_iterator tmp_uiter;
 	struct btrfs_qgroup *qg;
-	int ret;
+	int ret = 0;
 
+	if (!roots)
+		return 0;
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(roots, &uiter))) {
-		/* We don't count our current root here */
-		if (unode->val == root_to_skip)
-			continue;
 		qg = find_qgroup_rb(fs_info, unode->val);
 		if (!qg)
 			continue;
-		/*
-		 * We could have a pending removal of this same ref so we may
-		 * not have actually found our ref root when doing
-		 * btrfs_find_all_roots, so we need to keep track of how many
-		 * old roots we find in case we removed ours and added a
-		 * different one at the same time.  I don't think this could
-		 * happen in practice but that sort of thinking leads to pain
-		 * and suffering and to the dark side.
-		 */
-		(*old_roots)++;
 
 		ulist_reinit(tmp);
 		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
@@ -1603,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
 			struct btrfs_qgroup_list *glist;
 
 			qg = u64_to_ptr(tmp_unode->aux);
-			/*
-			 * We use this sequence number to keep from having to
-			 * run the whole list and 0 out the refcnt every time.
-			 * We basically use sequnce as the known 0 count and
-			 * then add 1 everytime we see a qgroup.  This is how we
-			 * get how many of the roots actually point up to the
-			 * upper level qgroups in order to determine exclusive
-			 * counts.
-			 *
-			 * For rescan we want to set old_refcnt to seq so our
-			 * exclusive calculations end up correct.
-			 */
-			if (rescan)
-				qg->old_refcnt = seq;
-			else if (qg->old_refcnt < seq)
-				qg->old_refcnt = seq + 1;
+			if (update_old)
+				btrfs_qgroup_update_old_refcnt(qg, seq, 1);
 			else
-				qg->old_refcnt++;
-
-			if (qg->new_refcnt < seq)
-				qg->new_refcnt = seq + 1;
-			else
-				qg->new_refcnt++;
+				btrfs_qgroup_update_new_refcnt(qg, seq, 1);
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ret = ulist_add(qgroups, glist->group->qgroupid,
 						ptr_to_u64(glist->group),
@@ -1644,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
-				       struct btrfs_qgroup_operation *oper,
-				       struct ulist *tmp,
-				       struct ulist *qgroups, u64 seq,
-				       int *old_roots)
-{
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup *qg;
-	struct btrfs_qgroup_operation *tmp_oper;
-	struct rb_node *n;
-	int ret;
-
-	ulist_reinit(tmp);
-
-	/*
-	 * We only walk forward in the tree since we're only interested in
-	 * removals that happened _after_  our operation.
-	 */
-	spin_lock(&fs_info->qgroup_op_lock);
-	n = rb_next(&oper->n);
-	spin_unlock(&fs_info->qgroup_op_lock);
-	if (!n)
-		return 0;
-	tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
-	while (tmp_oper->bytenr == oper->bytenr) {
-		/*
-		 * If it's not a removal we don't care, additions work out
-		 * properly with our refcnt tracking.
-		 */
-		if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
-		    tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
-			goto next;
-		qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
-		if (!qg)
-			goto next;
-		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
-				GFP_ATOMIC);
-		if (ret) {
-			if (ret < 0)
-				return ret;
-			/*
-			 * We only want to increase old_roots if this qgroup is
-			 * not already in the list of qgroups.  If it is already
-			 * there then that means it must have been re-added or
-			 * the delete will be discarded because we had an
-			 * existing ref that we haven't looked up yet.  In this
-			 * case we don't want to increase old_roots.  So if ret
-			 * == 1 then we know that this is the first time we've
-			 * seen this qgroup and we can bump the old_roots.
-			 */
-			(*old_roots)++;
-			ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
-					GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-		}
-next:
-		spin_lock(&fs_info->qgroup_op_lock);
-		n = rb_next(&tmp_oper->n);
-		spin_unlock(&fs_info->qgroup_op_lock);
-		if (!n)
-			break;
-		tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
-	}
-
-	/* Ok now process the qgroups we found */
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		struct btrfs_qgroup_list *glist;
-
-		qg = u64_to_ptr(unode->aux);
-		if (qg->old_refcnt < seq)
-			qg->old_refcnt = seq + 1;
-		else
-			qg->old_refcnt++;
-		if (qg->new_refcnt < seq)
-			qg->new_refcnt = seq + 1;
-		else
-			qg->new_refcnt++;
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(qgroups, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-			ret = ulist_add(tmp, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
-				  struct btrfs_qgroup_operation *oper,
-				  struct btrfs_qgroup *qgroup,
-				  struct ulist *tmp, struct ulist *qgroups,
-				  u64 seq)
-{
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup *qg;
-	int ret;
-
-	ulist_reinit(tmp);
-	ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
-			GFP_ATOMIC);
-	if (ret < 0)
-		return ret;
-	ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
-			GFP_ATOMIC);
-	if (ret < 0)
-		return ret;
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		struct btrfs_qgroup_list *glist;
-
-		qg = u64_to_ptr(unode->aux);
-		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
-			if (qg->new_refcnt < seq)
-				qg->new_refcnt = seq + 1;
-			else
-				qg->new_refcnt++;
-		} else {
-			if (qg->old_refcnt < seq)
-				qg->old_refcnt = seq + 1;
-			else
-				qg->old_refcnt++;
-		}
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(tmp, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-			ret = ulist_add(qgroups, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/*
- * This adjusts the counters for all referenced qgroups if need be.
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy, codes can explain themselves.
+ *
+ * Excl update is tricky, the update is split into 2 part.
+ * Part 1: Possible exclusive <-> sharing detect:
+ *	|	A	|	!A	|
+ *  -------------------------------------
+ *  B	|	*	|	-	|
+ *  -------------------------------------
+ *  !B	|	+	|	**	|
+ *  -------------------------------------
+ *
+ * Conditions:
+ * A:	cur_old_roots < nr_old_roots	(not exclusive before)
+ * !A:	cur_old_roots == nr_old_roots	(possible exclusive before)
+ * B:	cur_new_roots < nr_new_roots	(not exclusive now)
+ * !B:	cur_new_roots == nr_new_roots	(possible exclsuive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
+ * *: Definitely not changed.		**: Possible unchanged.
+ *
+ * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
+ *
+ * To make the logic clear, we first use condition A and B to split
+ * combination into 4 results.
+ *
+ * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
+ * only on variant maybe 0.
+ *
+ * Lastly, check result **, since there are 2 variants maybe 0, split them
+ * again(2x2).
+ * But this time we don't need to consider other things, the codes and logic
+ * is easy to understand now.
  */
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
-				  u64 root_to_skip, u64 num_bytes,
-				  struct ulist *qgroups, u64 seq,
-				  int old_roots, int new_roots, int rescan)
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+				  struct ulist *qgroups,
+				  u64 nr_old_roots,
+				  u64 nr_new_roots,
+				  u64 num_bytes, u64 seq)
 {
 	struct ulist_node *unode;
 	struct ulist_iterator uiter;
@@ -1810,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
 		bool dirty = false;
 
 		qg = u64_to_ptr(unode->aux);
-		/*
-		 * Wasn't referenced before but is now, add to the reference
-		 * counters.
-		 */
-		if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+		/* Rfer update part */
+		if (cur_old_count == 0 && cur_new_count > 0) {
 			qg->rfer += num_bytes;
 			qg->rfer_cmpr += num_bytes;
 			dirty = true;
 		}
-
-		/*
-		 * Was referenced before but isn't now, subtract from the
-		 * reference counters.
-		 */
-		if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+		if (cur_old_count > 0 && cur_new_count == 0) {
 			qg->rfer -= num_bytes;
 			qg->rfer_cmpr -= num_bytes;
 			dirty = true;
 		}
 
-		if (qg->old_refcnt < seq)
-			cur_old_count = 0;
-		else
-			cur_old_count = qg->old_refcnt - seq;
-		if (qg->new_refcnt < seq)
-			cur_new_count = 0;
-		else
-			cur_new_count = qg->new_refcnt - seq;
+		/* Excl update part */
+		/* Exclusive/none -> shared case */
+		if (cur_old_count == nr_old_roots &&
+		    cur_new_count < nr_new_roots) {
+			/* Exclusive -> shared */
+			if (cur_old_count != 0) {
+				qg->excl -= num_bytes;
+				qg->excl_cmpr -= num_bytes;
+				dirty = true;
+			}
+		}
 
-		/*
-		 * If our refcount was the same as the roots previously but our
-		 * new count isn't the same as the number of roots now then we
-		 * went from having a exclusive reference on this range to not.
-		 */
-		if (old_roots && cur_old_count == old_roots &&
-		    (cur_new_count != new_roots || new_roots == 0)) {
-			WARN_ON(cur_new_count != new_roots && new_roots == 0);
-			qg->excl -= num_bytes;
-			qg->excl_cmpr -= num_bytes;
-			dirty = true;
+		/* Shared -> exclusive/none case */
+		if (cur_old_count < nr_old_roots &&
+		    cur_new_count == nr_new_roots) {
+			/* Shared->exclusive */
+			if (cur_new_count != 0) {
+				qg->excl += num_bytes;
+				qg->excl_cmpr += num_bytes;
+				dirty = true;
+			}
 		}
 
-		/*
-		 * If we didn't reference all the roots before but now we do we
-		 * have an exclusive reference to this range.
-		 */
-		if ((!old_roots || (old_roots && cur_old_count != old_roots))
-		    && cur_new_count == new_roots) {
-			qg->excl += num_bytes;
-			qg->excl_cmpr += num_bytes;
-			dirty = true;
+		/* Exclusive/none -> exclusive/none case */
+		if (cur_old_count == nr_old_roots &&
+		    cur_new_count == nr_new_roots) {
+			if (cur_old_count == 0) {
+				/* None -> exclusive/none */
+
+				if (cur_new_count != 0) {
+					/* None -> exclusive */
+					qg->excl += num_bytes;
+					qg->excl_cmpr += num_bytes;
+					dirty = true;
+				}
+				/* None -> none, nothing changed */
+			} else {
+				/* Exclusive -> exclusive/none */
+
+				if (cur_new_count == 0) {
+					/* Exclusive -> none */
+					qg->excl -= num_bytes;
+					qg->excl_cmpr -= num_bytes;
+					dirty = true;
+				}
+				/* Exclusive -> exclusive, nothing changed */
+			}
 		}
 
+		/* For exclusive extent, free its reserved bytes too */
+		if (nr_old_roots == 0 && nr_new_roots == 1 &&
+		    cur_new_count == nr_new_roots)
+			qg->reserved -= num_bytes;
 		if (dirty)
 			qgroup_dirty(fs_info, qg);
 	}
 	return 0;
 }
 
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr.  If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info,
-			       struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *roots = NULL;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	int ret = 0;
-
-	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
-				   oper->elem.seq, &roots);
-	if (ret < 0)
-		return ret;
-	ret = 0;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(roots, &uiter))) {
-		if (unode->val == oper->ref_root) {
-			ret = 1;
-			break;
-		}
-	}
-	ulist_free(roots);
-	btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
-	return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters.  The basic premise is this
- *
- * 1) We have seq to represent a 0 count.  Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count.  This means that if
- * anybody is equal to or below this sequence they were never referenced.  We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently.  This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently.  We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- *			[qgroup 1/0]
- *		     /         |          \
- *		[qg 0/0]   [qg 0/1]	[qg 0/2]
- *		   \          |            /
- *		  [	   extent	    ]
- *
- * Say we are adding a reference that is covered by qg 0/0.  The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes.  Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info,
-				    struct btrfs_qgroup_operation *oper)
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
+			    u64 bytenr, u64 num_bytes,
+			    struct ulist *old_roots, struct ulist *new_roots)
 {
-	struct ulist *roots = NULL;
-	struct ulist *qgroups, *tmp;
-	struct btrfs_qgroup *qgroup;
-	struct seq_list elem = SEQ_LIST_INIT(elem);
+	struct ulist *qgroups = NULL;
+	struct ulist *tmp = NULL;
 	u64 seq;
-	int old_roots = 0;
-	int new_roots = 0;
+	u64 nr_new_roots = 0;
+	u64 nr_old_roots = 0;
 	int ret = 0;
 
-	if (oper->elem.seq) {
-		ret = check_existing_refs(trans, fs_info, oper);
-		if (ret < 0)
-			return ret;
-		if (ret)
-			return 0;
-	}
+	if (new_roots)
+		nr_new_roots = new_roots->nnodes;
+	if (old_roots)
+		nr_old_roots = old_roots->nnodes;
 
-	qgroups = ulist_alloc(GFP_NOFS);
-	if (!qgroups)
-		return -ENOMEM;
+	if (!fs_info->quota_enabled)
+		goto out_free;
+	BUG_ON(!fs_info->quota_root);
 
+	qgroups = ulist_alloc(GFP_NOFS);
+	if (!qgroups) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
 	tmp = ulist_alloc(GFP_NOFS);
 	if (!tmp) {
-		ulist_free(qgroups);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_free;
 	}
 
-	btrfs_get_tree_mod_seq(fs_info, &elem);
-	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
-				   &roots);
-	btrfs_put_tree_mod_seq(fs_info, &elem);
-	if (ret < 0) {
-		ulist_free(qgroups);
-		ulist_free(tmp);
-		return ret;
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+			mutex_unlock(&fs_info->qgroup_rescan_lock);
+			ret = 0;
+			goto out_free;
+		}
 	}
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
 	spin_lock(&fs_info->qgroup_lock);
-	qgroup = find_qgroup_rb(fs_info, oper->ref_root);
-	if (!qgroup)
-		goto out;
 	seq = fs_info->qgroup_seq;
 
-	/*
-	 * So roots is the list of all the roots currently pointing at the
-	 * bytenr, including the ref we are adding if we are adding, or not if
-	 * we are removing a ref.  So we pass in the ref_root to skip that root
-	 * in our calculations.  We set old_refnct and new_refcnt cause who the
-	 * hell knows what everything looked like before, and it doesn't matter
-	 * except...
-	 */
-	ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
-				     seq, &old_roots, 0);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * Now adjust the refcounts of the qgroups that care about this
-	 * reference, either the old_count in the case of removal or new_count
-	 * in the case of an addition.
-	 */
-	ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
-				     seq);
+	/* Update old refcnts using old_roots */
+	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+				   UPDATE_OLD);
 	if (ret < 0)
 		goto out;
 
-	/*
-	 * ...in the case of removals.  If we had a removal before we got around
-	 * to processing this operation then we need to find that guy and count
-	 * his references as if they really existed so we don't end up screwing
-	 * up the exclusive counts.  Then whenever we go to process the delete
-	 * everything will be grand and we can account for whatever exclusive
-	 * changes need to be made there.  We also have to pass in old_roots so
-	 * we have an accurate count of the roots as it pertains to this
-	 * operations view of the world.
-	 */
-	ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
-					  &old_roots);
+	/* Update new refcnts using new_roots */
+	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+				   UPDATE_NEW);
 	if (ret < 0)
 		goto out;
 
-	/*
-	 * We are adding our root, need to adjust up the number of roots,
-	 * otherwise old_roots is the number of roots we want.
-	 */
-	if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
-		new_roots = old_roots + 1;
-	} else {
-		new_roots = old_roots;
-		old_roots++;
-	}
-	fs_info->qgroup_seq += old_roots + 1;
-
+	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+			       num_bytes, seq);
 
 	/*
-	 * And now the magic happens, bless Arne for having a pretty elegant
-	 * solution for this.
+	 * Bump qgroup_seq to avoid seq overlap
 	 */
-	qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
-			       qgroups, seq, old_roots, new_roots, 0);
+	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
 out:
 	spin_unlock(&fs_info->qgroup_lock);
-	ulist_free(qgroups);
-	ulist_free(roots);
+out_free:
 	ulist_free(tmp);
+	ulist_free(qgroups);
+	ulist_free(old_roots);
+	ulist_free(new_roots);
 	return ret;
 }
 
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
-				     struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *roots = NULL;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup_list *glist;
-	struct ulist *parents;
-	int ret = 0;
-	int err;
-	struct btrfs_qgroup *qg;
-	u64 root_obj = 0;
-	struct seq_list elem = SEQ_LIST_INIT(elem);
-
-	parents = ulist_alloc(GFP_NOFS);
-	if (!parents)
-		return -ENOMEM;
-
-	btrfs_get_tree_mod_seq(fs_info, &elem);
-	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
-				   elem.seq, &roots);
-	btrfs_put_tree_mod_seq(fs_info, &elem);
-	if (ret < 0)
-		goto out;
-
-	if (roots->nnodes != 1)
-		goto out;
-
-	ULIST_ITER_INIT(&uiter);
-	unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
-	/*
-	 * If we find our ref root then that means all refs
-	 * this extent has to the root have not yet been
-	 * deleted. In that case, we do nothing and let the
-	 * last ref for this bytenr drive our update.
-	 *
-	 * This can happen for example if an extent is
-	 * referenced multiple times in a snapshot (clone,
-	 * etc). If we are in the middle of snapshot removal,
-	 * queued updates for such an extent will find the
-	 * root if we have not yet finished removing the
-	 * snapshot.
-	 */
-	if (unode->val == oper->ref_root)
-		goto out;
-
-	root_obj = unode->val;
-	BUG_ON(!root_obj);
-
-	spin_lock(&fs_info->qgroup_lock);
-	qg = find_qgroup_rb(fs_info, root_obj);
-	if (!qg)
-		goto out_unlock;
-
-	qg->excl += oper->num_bytes;
-	qg->excl_cmpr += oper->num_bytes;
-	qgroup_dirty(fs_info, qg);
-
-	/*
-	 * Adjust counts for parent groups. First we find all
-	 * parents, then in the 2nd loop we do the adjustment
-	 * while adding parents of the parents to our ulist.
-	 */
-	list_for_each_entry(glist, &qg->groups, next_group) {
-		err = ulist_add(parents, glist->group->qgroupid,
-				ptr_to_u64(glist->group), GFP_ATOMIC);
-		if (err < 0) {
-			ret = err;
-			goto out_unlock;
-		}
-	}
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(parents, &uiter))) {
-		qg = u64_to_ptr(unode->aux);
-		qg->excl += oper->num_bytes;
-		qg->excl_cmpr += oper->num_bytes;
-		qgroup_dirty(fs_info, qg);
-
-		/* Add any parents of the parents */
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			err = ulist_add(parents, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (err < 0) {
-				ret = err;
-				goto out_unlock;
-			}
-		}
-	}
-
-out_unlock:
-	spin_unlock(&fs_info->qgroup_lock);
-
-out:
-	ulist_free(roots);
-	ulist_free(parents);
-	return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info,
-				struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_qgroup_extent_record *record;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct ulist *new_roots = NULL;
+	struct rb_node *node;
+	u64 qgroup_to_skip;
 	int ret = 0;
 
-	if (!fs_info->quota_enabled)
-		return 0;
-
-	BUG_ON(!fs_info->quota_root);
+	delayed_refs = &trans->transaction->delayed_refs;
+	qgroup_to_skip = delayed_refs->qgroup_to_skip;
+	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+		record = rb_entry(node, struct btrfs_qgroup_extent_record,
+				  node);
 
-	mutex_lock(&fs_info->qgroup_rescan_lock);
-	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-		if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
-			mutex_unlock(&fs_info->qgroup_rescan_lock);
-			return 0;
+		if (!ret) {
+			/*
+			 * Use (u64)-1 as time_seq to do special search, which
+			 * doesn't lock tree or delayed_refs and search current
+			 * root. It's safe inside commit_transaction().
+			 */
+			ret = btrfs_find_all_roots(trans, fs_info,
+					record->bytenr, (u64)-1, &new_roots);
+			if (ret < 0)
+				goto cleanup;
+			if (qgroup_to_skip)
+				ulist_del(new_roots, qgroup_to_skip, 0);
+			ret = btrfs_qgroup_account_extent(trans, fs_info,
+					record->bytenr, record->num_bytes,
+					record->old_roots, new_roots);
+			record->old_roots = NULL;
+			new_roots = NULL;
 		}
-	}
-	mutex_unlock(&fs_info->qgroup_rescan_lock);
-
-	ASSERT(is_fstree(oper->ref_root));
-
-	trace_btrfs_qgroup_account(oper);
-
-	switch (oper->type) {
-	case BTRFS_QGROUP_OPER_ADD_EXCL:
-	case BTRFS_QGROUP_OPER_SUB_EXCL:
-		ret = qgroup_excl_accounting(fs_info, oper);
-		break;
-	case BTRFS_QGROUP_OPER_ADD_SHARED:
-	case BTRFS_QGROUP_OPER_SUB_SHARED:
-		ret = qgroup_shared_accounting(trans, fs_info, oper);
-		break;
-	case BTRFS_QGROUP_OPER_SUB_SUBTREE:
-		ret = qgroup_subtree_accounting(trans, fs_info, oper);
-		break;
-	default:
-		ASSERT(0);
-	}
-	return ret;
-}
-
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_qgroup_operation *oper;
-	int ret = 0;
+cleanup:
+		ulist_free(record->old_roots);
+		ulist_free(new_roots);
+		new_roots = NULL;
+		rb_erase(node, &delayed_refs->dirty_extent_root);
+		kfree(record);
 
-	while (!list_empty(&trans->qgroup_ref_list)) {
-		oper = list_first_entry(&trans->qgroup_ref_list,
-					struct btrfs_qgroup_operation, list);
-		list_del_init(&oper->list);
-		if (!ret || !trans->aborted)
-			ret = btrfs_qgroup_account(trans, fs_info, oper);
-		spin_lock(&fs_info->qgroup_op_lock);
-		rb_erase(&oper->n, &fs_info->qgroup_op_tree);
-		spin_unlock(&fs_info->qgroup_op_lock);
-		btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-		kfree(oper);
 	}
 	return ret;
 }
@@ -2637,15 +2188,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-		   struct btrfs_trans_handle *trans, struct ulist *qgroups,
-		   struct ulist *tmp, struct extent_buffer *scratch_leaf)
+		   struct btrfs_trans_handle *trans,
+		   struct extent_buffer *scratch_leaf)
 {
 	struct btrfs_key found;
 	struct ulist *roots = NULL;
 	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	u64 num_bytes;
-	u64 seq;
-	int new_roots;
 	int slot;
 	int ret;
 
@@ -2695,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 		else
 			num_bytes = found.offset;
 
-		ulist_reinit(qgroups);
 		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
 					   &roots);
 		if (ret < 0)
 			goto out;
-		spin_lock(&fs_info->qgroup_lock);
-		seq = fs_info->qgroup_seq;
-		fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
-		new_roots = 0;
-		ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
-					     seq, &new_roots, 1);
-		if (ret < 0) {
-			spin_unlock(&fs_info->qgroup_lock);
-			ulist_free(roots);
-			goto out;
-		}
-
-		ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
-					     seq, 0, new_roots, 1);
-		if (ret < 0) {
-			spin_unlock(&fs_info->qgroup_lock);
-			ulist_free(roots);
+		/* For rescan, just pass old_roots as NULL */
+		ret = btrfs_qgroup_account_extent(trans, fs_info,
+				found.objectid, num_bytes, NULL, roots);
+		if (ret < 0)
 			goto out;
-		}
-		spin_unlock(&fs_info->qgroup_lock);
-		ulist_free(roots);
 	}
 out:
 	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -2735,7 +2266,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 						     qgroup_rescan_work);
 	struct btrfs_path *path;
 	struct btrfs_trans_handle *trans = NULL;
-	struct ulist *tmp = NULL, *qgroups = NULL;
 	struct extent_buffer *scratch_leaf = NULL;
 	int err = -ENOMEM;
 	int ret = 0;
@@ -2743,12 +2273,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
-	qgroups = ulist_alloc(GFP_NOFS);
-	if (!qgroups)
-		goto out;
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		goto out;
 	scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
 	if (!scratch_leaf)
 		goto out;
@@ -2764,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 			err = -EINTR;
 		} else {
 			err = qgroup_rescan_leaf(fs_info, path, trans,
-						 qgroups, tmp, scratch_leaf);
+						 scratch_leaf);
 		}
 		if (err > 0)
 			btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2774,8 +2298,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 
 out:
 	kfree(scratch_leaf);
-	ulist_free(qgroups);
-	ulist_free(tmp);
 	btrfs_free_path(path);
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c5242aa9a..6387dcfa3 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,43 +19,18 @@
 #ifndef __BTRFS_QGROUP__
 #define __BTRFS_QGROUP__
 
+#include "ulist.h"
+#include "delayed-ref.h"
+
 /*
- * A description of the operations, all of these operations only happen when we
- * are adding the 1st reference for that subvolume in the case of adding space
- * or on the last reference delete in the case of subtraction.  The only
- * exception is the last one, which is added for confusion.
- *
- * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
- * one pointing at the bytes we are adding.  This is called on the first
- * allocation.
- *
- * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
- * shared between subvols.  This is called on the creation of a ref that already
- * has refs from a different subvolume, so basically reflink.
- *
- * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
- * one referencing the range.
- *
- * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
- * refs with other subvolumes.
+ * Record a dirty extent, and info qgroup to update quota on it
+ * TODO: Use kmem cache to alloc it.
  */
-enum btrfs_qgroup_operation_type {
-	BTRFS_QGROUP_OPER_ADD_EXCL,
-	BTRFS_QGROUP_OPER_ADD_SHARED,
-	BTRFS_QGROUP_OPER_SUB_EXCL,
-	BTRFS_QGROUP_OPER_SUB_SHARED,
-	BTRFS_QGROUP_OPER_SUB_SUBTREE,
-};
-
-struct btrfs_qgroup_operation {
-	u64 ref_root;
+struct btrfs_qgroup_extent_record {
+	struct rb_node node;
 	u64 bytenr;
 	u64 num_bytes;
-	u64 seq;
-	enum btrfs_qgroup_operation_type type;
-	struct seq_list elem;
-	struct rb_node n;
-	struct list_head list;
+	struct ulist *old_roots;
 };
 
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
@@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
 struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info, u64 ref_root,
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+				  struct btrfs_qgroup_extent_record *record);
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
 			    u64 bytenr, u64 num_bytes,
-			    enum btrfs_qgroup_operation_type type,
-			    int mod_seq);
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info);
-void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
-				   struct btrfs_fs_info *fs_info,
-				   struct btrfs_qgroup_operation *oper);
+			    struct ulist *old_roots, struct ulist *new_roots);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info);
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 		      struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74b24b01d..88cbb5995 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1847,8 +1847,10 @@ again:
 			}
 
 			eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
-			if (!eb || !extent_buffer_uptodate(eb)) {
-				ret = (!eb) ? -ENOMEM : -EIO;
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+			} else if (!extent_buffer_uptodate(eb)) {
+				ret = -EIO;
 				free_extent_buffer(eb);
 				break;
 			}
@@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
 
 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
 		eb = read_tree_block(root, bytenr, ptr_gen);
-		if (!eb || !extent_buffer_uptodate(eb)) {
+		if (IS_ERR(eb)) {
+			return PTR_ERR(eb);
+		} else if (!extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
 		}
@@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		blocksize = root->nodesize;
 		generation = btrfs_node_ptr_generation(upper->eb, slot);
 		eb = read_tree_block(root, bytenr, generation);
-		if (!eb || !extent_buffer_uptodate(eb)) {
+		if (IS_ERR(eb)) {
+			err = PTR_ERR(eb);
+			goto next;
+		} else if (!extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			err = -EIO;
 			goto next;
@@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc,
 	BUG_ON(block->key_ready);
 	eb = read_tree_block(rc->extent_root, block->bytenr,
 			     block->key.offset);
-	if (!eb || !extent_buffer_uptodate(eb)) {
+	if (IS_ERR(eb)) {
+		return PTR_ERR(eb);
+	} else if (!extent_buffer_uptodate(eb)) {
 		free_extent_buffer(eb);
 		return -EIO;
 	}
@@ -4040,7 +4049,7 @@ restart:
 	if (trans && progress && err == -ENOSPC) {
 		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
 					      rc->block_group->flags);
-		if (ret == 0) {
+		if (ret == 1) {
 			err = 0;
 			progress = 0;
 			goto restart;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ab5811545..94db0fa52 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity)
 	kfree(sparity);
 }
 
+static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+{
+	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
+						    work);
+	struct scrub_ctx *sctx = sparity->sctx;
+
+	scrub_free_parity(sparity);
+	scrub_pending_bio_dec(sctx);
+}
+
 static void scrub_parity_bio_endio(struct bio *bio, int error)
 {
 	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
-	struct scrub_ctx *sctx = sparity->sctx;
 
 	if (error)
 		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
 			  sparity->nsectors);
 
-	scrub_free_parity(sparity);
-	scrub_pending_bio_dec(sctx);
 	bio_put(bio);
+
+	btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
+			scrub_parity_bio_endio_worker, NULL, NULL);
+	btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
+			 &sparity->work);
 }
 
 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -3559,7 +3571,6 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
-	int ret = 0;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
 
@@ -3572,27 +3583,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 			fs_info->scrub_workers =
 				btrfs_alloc_workqueue("btrfs-scrub", flags,
 						      max_active, 4);
-		if (!fs_info->scrub_workers) {
-			ret = -ENOMEM;
-			goto out;
-		}
+		if (!fs_info->scrub_workers)
+			goto fail_scrub_workers;
+
 		fs_info->scrub_wr_completion_workers =
 			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
 					      max_active, 2);
-		if (!fs_info->scrub_wr_completion_workers) {
-			ret = -ENOMEM;
-			goto out;
-		}
+		if (!fs_info->scrub_wr_completion_workers)
+			goto fail_scrub_wr_completion_workers;
+
 		fs_info->scrub_nocow_workers =
 			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
-		if (!fs_info->scrub_nocow_workers) {
-			ret = -ENOMEM;
-			goto out;
-		}
+		if (!fs_info->scrub_nocow_workers)
+			goto fail_scrub_nocow_workers;
+		fs_info->scrub_parity_workers =
+			btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+					      max_active, 2);
+		if (!fs_info->scrub_parity_workers)
+			goto fail_scrub_parity_workers;
 	}
 	++fs_info->scrub_workers_refcnt;
-out:
-	return ret;
+	return 0;
+
+fail_scrub_parity_workers:
+	btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+fail_scrub_nocow_workers:
+	btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+fail_scrub_wr_completion_workers:
+	btrfs_destroy_workqueue(fs_info->scrub_workers);
+fail_scrub_workers:
+	return -ENOMEM;
 }
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
@@ -3601,6 +3621,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 		btrfs_destroy_workqueue(fs_info->scrub_workers);
 		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
 		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
 	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5cf7838fb..aa72bfd28 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
 	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
 	 */
 	u64 rmdir_ino;
+	bool orphanized;
 };
 
 struct orphan_dir_info {
@@ -1916,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 		goto out;
 	}
 
-	/* we know that it is or will be overwritten. check this now */
-	if (ow_inode < sctx->send_progress)
+	/*
+	 * We know that it is or will be overwritten. Check this now.
+	 * The current inode being processed might have been the one that caused
+	 * inode 'ino' to be orphanized, therefore ow_inode can actually be the
+	 * same as sctx->send_progress.
+	 */
+	if (ow_inode <= sctx->send_progress)
 		ret = 1;
 	else
 		ret = 0;
@@ -2239,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	fs_path_reset(dest);
 
 	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+		struct waiting_dir_move *wdm;
+
 		fs_path_reset(name);
 
 		if (is_waiting_for_rm(sctx, ino)) {
@@ -2249,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 			break;
 		}
 
-		if (is_waiting_for_move(sctx, ino)) {
+		wdm = get_waiting_dir_move(sctx, ino);
+		if (wdm && wdm->orphanized) {
+			ret = gen_unique_name(sctx, ino, gen, name);
+			stop = 1;
+		} else if (wdm) {
 			ret = get_first_ref(sctx->parent_root, ino,
 					    &parent_inode, &parent_gen, name);
 		} else {
@@ -2344,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
 		    le64_to_cpu(sctx->send_root->root_item.ctransid));
 	if (parent_root) {
-		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-				sctx->parent_root->root_item.uuid);
+		if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+				     parent_root->root_item.received_uuid);
+		else
+			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+				     parent_root->root_item.uuid);
 		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
 			    le64_to_cpu(sctx->parent_root->root_item.ctransid));
 	}
@@ -2939,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
 	return entry != NULL;
 }
 
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
 {
 	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
 	struct rb_node *parent = NULL;
@@ -2950,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
 		return -ENOMEM;
 	dm->ino = ino;
 	dm->rmdir_ino = 0;
+	dm->orphanized = orphanized;
 
 	while (*p) {
 		parent = *p;
@@ -3046,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 			goto out;
 	}
 
-	ret = add_waiting_dir_move(sctx, pm->ino);
+	ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
 	if (ret)
 		goto out;
 
@@ -3369,8 +3386,40 @@ out:
 	return ret;
 }
 
+/*
+ * Check if ino ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+		       const u64 ino1,
+		       const u64 ino1_gen,
+		       const u64 ino2,
+		       struct fs_path *fs_path)
+{
+	u64 ino = ino2;
+
+	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+		int ret;
+		u64 parent;
+		u64 parent_gen;
+
+		fs_path_reset(fs_path);
+		ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+		if (ret < 0) {
+			if (ret == -ENOENT && ino == ino2)
+				ret = 0;
+			return ret;
+		}
+		if (parent == ino1)
+			return parent_gen == ino1_gen ? 1 : 0;
+		ino = parent;
+	}
+	return 0;
+}
+
 static int wait_for_parent_move(struct send_ctx *sctx,
-				struct recorded_ref *parent_ref)
+				struct recorded_ref *parent_ref,
+				const bool is_orphan)
 {
 	int ret = 0;
 	u64 ino = parent_ref->dir;
@@ -3390,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
 	 * Our current directory inode may not yet be renamed/moved because some
 	 * ancestor (immediate or not) has to be renamed/moved first. So find if
 	 * such ancestor exists and make sure our own rename/move happens after
-	 * that ancestor is processed.
+	 * that ancestor is processed to avoid path build infinite loops (done
+	 * at get_cur_path()).
 	 */
 	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
 		if (is_waiting_for_move(sctx, ino)) {
-			ret = 1;
+			/*
+			 * If the current inode is an ancestor of ino in the
+			 * parent root, we need to delay the rename of the
+			 * current inode, otherwise don't delayed the rename
+			 * because we can end up with a circular dependency
+			 * of renames, resulting in some directories never
+			 * getting the respective rename operations issued in
+			 * the send stream or getting into infinite path build
+			 * loops.
+			 */
+			ret = is_ancestor(sctx->parent_root,
+					  sctx->cur_ino, sctx->cur_inode_gen,
+					  ino, path_before);
 			break;
 		}
 
@@ -3436,7 +3498,7 @@ out:
 					   ino,
 					   &sctx->new_refs,
 					   &sctx->deleted_refs,
-					   false);
+					   is_orphan);
 		if (!ret)
 			ret = 1;
 	}
@@ -3605,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			}
 		}
 
+		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+		    can_rename) {
+			ret = wait_for_parent_move(sctx, cur, is_orphan);
+			if (ret < 0)
+				goto out;
+			if (ret == 1) {
+				can_rename = false;
+				*pending_move = 1;
+			}
+		}
+
 		/*
 		 * link/move the ref to the new place. If we have an orphan
 		 * inode, move it and update valid_path. If not, link or move
@@ -3625,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 				 * dirs, we always have one new and one deleted
 				 * ref. The deleted ref is ignored later.
 				 */
-				ret = wait_for_parent_move(sctx, cur);
-				if (ret < 0)
-					goto out;
-				if (ret) {
-					*pending_move = 1;
-				} else {
-					ret = send_rename(sctx, valid_path,
-							  cur->full_path);
-					if (!ret)
-						ret = fs_path_copy(valid_path,
-							       cur->full_path);
-				}
+				ret = send_rename(sctx, valid_path,
+						  cur->full_path);
+				if (!ret)
+					ret = fs_path_copy(valid_path,
+							   cur->full_path);
 				if (ret < 0)
 					goto out;
 			} else {
@@ -4524,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 	if (ret < 0)
 		goto out;
 
-	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-			clone_root->root->root_item.uuid);
+	/*
+	 * If the parent we're using has a received_uuid set then use that as
+	 * our clone source as that is what we will look for when doing a
+	 * receive.
+	 *
+	 * This covers the case that we create a snapshot off of a received
+	 * subvolume and then use that as the parent and try to receive on a
+	 * different host.
+	 */
+	if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+			     clone_root->root->root_item.received_uuid);
+	else
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+			     clone_root->root->root_item.uuid);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
 		    le64_to_cpu(clone_root->root->root_item.ctransid));
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9e66f5e72..cd7ef34d2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
  * __btrfs_std_error decodes expected errors from the caller and
  * invokes the approciate error response.
  */
+__cold
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		       unsigned int line, int errno, const char *fmt, ...)
 {
@@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
  * We'll complete the cleanup in btrfs_end_transaction and
  * btrfs_commit_transaction.
  */
+__cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno)
 {
-	/*
-	 * Report first abort since mount
-	 */
-	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
-				&root->fs_info->fs_state)) {
-		WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
-				errno);
-	}
 	trans->aborted = errno;
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
@@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
  * __btrfs_panic decodes unexpected, fatal errors from the caller,
  * issues an alert, and either panics or BUGs, depending on mount options.
  */
+__cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...)
 {
@@ -841,33 +836,153 @@ out:
 	return error;
 }
 
-static struct dentry *get_default_root(struct super_block *sb,
-				       u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+					   u64 subvol_objectid)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *root = fs_info->tree_root;
-	struct btrfs_root *new_root;
-	struct btrfs_dir_item *di;
-	struct btrfs_path *path;
-	struct btrfs_key location;
-	struct inode *inode;
-	u64 dir_id;
-	int new = 0;
+	struct btrfs_root *fs_root;
+	struct btrfs_root_ref *root_ref;
+	struct btrfs_inode_ref *inode_ref;
+	struct btrfs_key key;
+	struct btrfs_path *path = NULL;
+	char *name = NULL, *ptr;
+	u64 dirid;
+	int len;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	path->leave_spinning = 1;
+
+	name = kmalloc(PATH_MAX, GFP_NOFS);
+	if (!name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	ptr = name + PATH_MAX - 1;
+	ptr[0] = '\0';
 
 	/*
-	 * We have a specific subvol we want to mount, just setup location and
-	 * go look up the root.
+	 * Walk up the subvolume trees in the tree of tree roots by root
+	 * backrefs until we hit the top-level subvolume.
 	 */
-	if (subvol_objectid) {
-		location.objectid = subvol_objectid;
-		location.type = BTRFS_ROOT_ITEM_KEY;
-		location.offset = (u64)-1;
-		goto find_root;
+	while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+		key.objectid = subvol_objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			goto err;
+		} else if (ret > 0) {
+			ret = btrfs_previous_item(root, path, subvol_objectid,
+						  BTRFS_ROOT_BACKREF_KEY);
+			if (ret < 0) {
+				goto err;
+			} else if (ret > 0) {
+				ret = -ENOENT;
+				goto err;
+			}
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		subvol_objectid = key.offset;
+
+		root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					  struct btrfs_root_ref);
+		len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+		ptr -= len + 1;
+		if (ptr < name) {
+			ret = -ENAMETOOLONG;
+			goto err;
+		}
+		read_extent_buffer(path->nodes[0], ptr + 1,
+				   (unsigned long)(root_ref + 1), len);
+		ptr[0] = '/';
+		dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+		btrfs_release_path(path);
+
+		key.objectid = subvol_objectid;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(fs_root)) {
+			ret = PTR_ERR(fs_root);
+			goto err;
+		}
+
+		/*
+		 * Walk up the filesystem tree by inode refs until we hit the
+		 * root directory.
+		 */
+		while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+			key.objectid = dirid;
+			key.type = BTRFS_INODE_REF_KEY;
+			key.offset = (u64)-1;
+
+			ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+			if (ret < 0) {
+				goto err;
+			} else if (ret > 0) {
+				ret = btrfs_previous_item(fs_root, path, dirid,
+							  BTRFS_INODE_REF_KEY);
+				if (ret < 0) {
+					goto err;
+				} else if (ret > 0) {
+					ret = -ENOENT;
+					goto err;
+				}
+			}
+
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			dirid = key.offset;
+
+			inode_ref = btrfs_item_ptr(path->nodes[0],
+						   path->slots[0],
+						   struct btrfs_inode_ref);
+			len = btrfs_inode_ref_name_len(path->nodes[0],
+						       inode_ref);
+			ptr -= len + 1;
+			if (ptr < name) {
+				ret = -ENAMETOOLONG;
+				goto err;
+			}
+			read_extent_buffer(path->nodes[0], ptr + 1,
+					   (unsigned long)(inode_ref + 1), len);
+			ptr[0] = '/';
+			btrfs_release_path(path);
+		}
 	}
 
+	btrfs_free_path(path);
+	if (ptr == name + PATH_MAX - 1) {
+		name[0] = '/';
+		name[1] = '\0';
+	} else {
+		memmove(name, ptr, name + PATH_MAX - ptr);
+	}
+	return name;
+
+err:
+	btrfs_free_path(path);
+	kfree(name);
+	return ERR_PTR(ret);
+}
+
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	u64 dir_id;
+
 	path = btrfs_alloc_path();
 	if (!path)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	/*
@@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb,
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
 	if (IS_ERR(di)) {
 		btrfs_free_path(path);
-		return ERR_CAST(di);
+		return PTR_ERR(di);
 	}
 	if (!di) {
 		/*
 		 * Ok the default dir item isn't there.  This is weird since
 		 * it's always been there, but don't freak out, just try and
-		 * mount to root most subvolume.
+		 * mount the top-level subvolume.
 		 */
 		btrfs_free_path(path);
-		dir_id = BTRFS_FIRST_FREE_OBJECTID;
-		new_root = fs_info->fs_root;
-		goto setup_root;
+		*objectid = BTRFS_FS_TREE_OBJECTID;
+		return 0;
 	}
 
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 	btrfs_free_path(path);
-
-find_root:
-	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
-	if (IS_ERR(new_root))
-		return ERR_CAST(new_root);
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		int ret;
-		down_read(&fs_info->cleanup_work_sem);
-		ret = btrfs_orphan_cleanup(new_root);
-		up_read(&fs_info->cleanup_work_sem);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	dir_id = btrfs_root_dirid(&new_root->root_item);
-setup_root:
-	location.objectid = dir_id;
-	location.type = BTRFS_INODE_ITEM_KEY;
-	location.offset = 0;
-
-	inode = btrfs_iget(sb, &location, new_root, &new);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	/*
-	 * If we're just mounting the root most subvol put the inode and return
-	 * a reference to the dentry.  We will have already gotten a reference
-	 * to the inode in btrfs_fill_super so we're good to go.
-	 */
-	if (!new && d_inode(sb->s_root) == inode) {
-		iput(inode);
-		return dget(sb->s_root);
-	}
-
-	return d_obtain_root(inode);
+	*objectid = location.objectid;
+	return 0;
 }
 
 static int btrfs_fill_super(struct super_block *sb,
@@ -1108,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",fatal_errors=panic");
 	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
 		seq_printf(seq, ",commit=%d", info->commit_interval);
+	seq_printf(seq, ",subvolid=%llu",
+		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+	seq_puts(seq, ",subvol=");
+	seq_dentry(seq, dentry, " \t\n\\");
 	return 0;
 }
 
@@ -1138,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode)
 }
 
 /*
- * This will strip out the subvol=%s argument for an argument string and add
- * subvolid=0 to make sure we get the actual tree root for path walking to the
- * subvol we want.
+ * This will add subvolid=0 to the argument string while removing any subvol=
+ * and subvolid= arguments to make sure we get the top-level root for path
+ * walking to the subvol we want.
  */
 static char *setup_root_args(char *args)
 {
-	unsigned len = strlen(args) + 2 + 1;
-	char *src, *dst, *buf;
+	char *buf, *dst, *sep;
 
-	/*
-	 * We need the same args as before, but with this substitution:
-	 * s!subvol=[^,]+!subvolid=0!
-	 *
-	 * Since the replacement string is up to 2 bytes longer than the
-	 * original, allocate strlen(args) + 2 + 1 bytes.
-	 */
+	if (!args)
+		return kstrdup("subvolid=0", GFP_NOFS);
 
-	src = strstr(args, "subvol=");
-	/* This shouldn't happen, but just in case.. */
-	if (!src)
-		return NULL;
-
-	buf = dst = kmalloc(len, GFP_NOFS);
+	/* The worst case is that we add ",subvolid=0" to the end. */
+	buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
 	if (!buf)
 		return NULL;
 
-	/*
-	 * If the subvol= arg is not at the start of the string,
-	 * copy whatever precedes it into buf.
-	 */
-	if (src != args) {
-		*src++ = '\0';
-		strcpy(buf, args);
-		dst += strlen(args);
+	while (1) {
+		sep = strchrnul(args, ',');
+		if (!strstarts(args, "subvol=") &&
+		    !strstarts(args, "subvolid=")) {
+			memcpy(dst, args, sep - args);
+			dst += sep - args;
+			*dst++ = ',';
+		}
+		if (*sep)
+			args = sep + 1;
+		else
+			break;
 	}
-
 	strcpy(dst, "subvolid=0");
-	dst += strlen("subvolid=0");
-
-	/*
-	 * If there is a "," after the original subvol=... string,
-	 * copy that suffix into our buffer.  Otherwise, we're done.
-	 */
-	src = strchr(src, ',');
-	if (src)
-		strcpy(dst, src);
 
 	return buf;
 }
 
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
-				   const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+				   int flags, const char *device_name,
+				   char *data)
 {
 	struct dentry *root;
-	struct vfsmount *mnt;
+	struct vfsmount *mnt = NULL;
 	char *newargs;
+	int ret;
 
 	newargs = setup_root_args(data);
-	if (!newargs)
-		return ERR_PTR(-ENOMEM);
-	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
-			     newargs);
+	if (!newargs) {
+		root = ERR_PTR(-ENOMEM);
+		goto out;
+	}
 
-	if (PTR_RET(mnt) == -EBUSY) {
+	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+	if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
 		if (flags & MS_RDONLY) {
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
-					     newargs);
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+					     device_name, newargs);
 		} else {
-			int r;
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
-					     newargs);
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+					     device_name, newargs);
 			if (IS_ERR(mnt)) {
-				kfree(newargs);
-				return ERR_CAST(mnt);
+				root = ERR_CAST(mnt);
+				mnt = NULL;
+				goto out;
 			}
 
-			r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
-			if (r < 0) {
-				/* FIXME: release vfsmount mnt ??*/
-				kfree(newargs);
-				return ERR_PTR(r);
+			down_write(&mnt->mnt_sb->s_umount);
+			ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+			up_write(&mnt->mnt_sb->s_umount);
+			if (ret < 0) {
+				root = ERR_PTR(ret);
+				goto out;
 			}
 		}
 	}
+	if (IS_ERR(mnt)) {
+		root = ERR_CAST(mnt);
+		mnt = NULL;
+		goto out;
+	}
 
-	kfree(newargs);
+	if (!subvol_name) {
+		if (!subvol_objectid) {
+			ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+							  &subvol_objectid);
+			if (ret) {
+				root = ERR_PTR(ret);
+				goto out;
+			}
+		}
+		subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+							    subvol_objectid);
+		if (IS_ERR(subvol_name)) {
+			root = ERR_CAST(subvol_name);
+			subvol_name = NULL;
+			goto out;
+		}
 
-	if (IS_ERR(mnt))
-		return ERR_CAST(mnt);
+	}
 
 	root = mount_subtree(mnt, subvol_name);
+	/* mount_subtree() drops our reference on the vfsmount. */
+	mnt = NULL;
 
-	if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+	if (!IS_ERR(root)) {
 		struct super_block *s = root->d_sb;
-		dput(root);
-		root = ERR_PTR(-EINVAL);
-		deactivate_locked_super(s);
-		printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
-				subvol_name);
+		struct inode *root_inode = d_inode(root);
+		u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+		ret = 0;
+		if (!is_subvolume_inode(root_inode)) {
+			pr_err("BTRFS: '%s' is not a valid subvolume\n",
+			       subvol_name);
+			ret = -EINVAL;
+		}
+		if (subvol_objectid && root_objectid != subvol_objectid) {
+			/*
+			 * This will also catch a race condition where a
+			 * subvolume which was passed by ID is renamed and
+			 * another subvolume is renamed over the old location.
+			 */
+			pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
+			       subvol_name, subvol_objectid);
+			ret = -EINVAL;
+		}
+		if (ret) {
+			dput(root);
+			root = ERR_PTR(ret);
+			deactivate_locked_super(s);
+		}
 	}
 
+out:
+	mntput(mnt);
+	kfree(newargs);
+	kfree(subvol_name);
 	return root;
 }
 
@@ -1303,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 {
 	struct block_device *bdev = NULL;
 	struct super_block *s;
-	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
 	struct btrfs_fs_info *fs_info = NULL;
 	struct security_mnt_opts new_sec_opts;
@@ -1323,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		return ERR_PTR(error);
 	}
 
-	if (subvol_name) {
-		root = mount_subvol(subvol_name, flags, device_name, data);
-		kfree(subvol_name);
-		return root;
+	if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+		/* mount_subvol() will free subvol_name. */
+		return mount_subvol(subvol_name, subvol_objectid, flags,
+				    device_name, data);
 	}
 
 	security_init_mnt_opts(&new_sec_opts);
@@ -1392,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 	}
-
-	root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
-	if (IS_ERR(root)) {
+	if (error) {
 		deactivate_locked_super(s);
-		error = PTR_ERR(root);
 		goto error_sec_opts;
 	}
 
 	fs_info = btrfs_sb(s);
 	error = setup_security_options(fs_info, s, &new_sec_opts);
 	if (error) {
-		dput(root);
 		deactivate_locked_super(s);
 		goto error_sec_opts;
 	}
 
-	return root;
+	return dget(s->s_root);
 
 error_close_devices:
 	btrfs_close_devices(fs_devices);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e8a4c86d2..603b0cc2b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -33,6 +33,7 @@
 #include "volumes.h"
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
 
 static u64 get_features(struct btrfs_fs_info *fs_info,
 			enum btrfs_feature_set set)
@@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
 
 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
 
-static struct attribute *btrfs_attrs[] = {
+static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(label),
 	BTRFS_ATTR_PTR(nodesize),
 	BTRFS_ATTR_PTR(sectorsize),
@@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = {
 
 static void btrfs_release_super_kobj(struct kobject *kobj)
 {
-	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
-	complete(&fs_info->kobj_unregister);
+	struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
+
+	memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+	complete(&fs_devs->kobj_unregister);
 }
 
 static struct kobj_type btrfs_ktype = {
 	.sysfs_ops	= &kobj_sysfs_ops,
 	.release	= btrfs_release_super_kobj,
-	.default_attrs	= btrfs_attrs,
 };
 
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+	if (kobj->ktype != &btrfs_ktype)
+		return NULL;
+	return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+}
+
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 {
 	if (kobj->ktype != &btrfs_ktype)
 		return NULL;
-	return container_of(kobj, struct btrfs_fs_info, super_kobj);
+	return to_fs_devs(kobj)->fs_info;
 }
 
 #define NUM_FEATURE_BITS 64
@@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 			attrs[0] = &fa->kobj_attr.attr;
 			if (add) {
 				int ret;
-				ret = sysfs_merge_group(&fs_info->super_kobj,
+				ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
 							&agroup);
 				if (ret)
 					return ret;
 			} else
-				sysfs_unmerge_group(&fs_info->super_kobj,
+				sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
 						    &agroup);
 		}
 
@@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 	return 0;
 }
 
-static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+	if (fs_devs->device_dir_kobj) {
+		kobject_del(fs_devs->device_dir_kobj);
+		kobject_put(fs_devs->device_dir_kobj);
+		fs_devs->device_dir_kobj = NULL;
+	}
+
+	if (fs_devs->super_kobj.state_initialized) {
+		kobject_del(&fs_devs->super_kobj);
+		kobject_put(&fs_devs->super_kobj);
+		wait_for_completion(&fs_devs->kobj_unregister);
+	}
+}
+
+/* when fs_devs is NULL it will remove all fsid kobject */
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 {
-	kobject_del(&fs_info->super_kobj);
-	kobject_put(&fs_info->super_kobj);
-	wait_for_completion(&fs_info->kobj_unregister);
+	struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+	if (fs_devs) {
+		__btrfs_sysfs_remove_fsid(fs_devs);
+		return;
+	}
+
+	list_for_each_entry(fs_devs, fs_uuids, list) {
+		__btrfs_sysfs_remove_fsid(fs_devs);
+	}
 }
 
 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 {
+	btrfs_reset_fs_info_ptr(fs_info);
+
 	if (fs_info->space_info_kobj) {
 		sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
 		kobject_del(fs_info->space_info_kobj);
 		kobject_put(fs_info->space_info_kobj);
 	}
-	kobject_del(fs_info->device_dir_kobj);
-	kobject_put(fs_info->device_dir_kobj);
 	addrm_unknown_feature_attrs(fs_info, false);
-	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
-	__btrfs_sysfs_remove_one(fs_info);
+	sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
+	sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+	btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
 }
 
 const char * const btrfs_feature_set_names[3] = {
@@ -602,40 +635,60 @@ static void init_feature_attrs(void)
 	}
 }
 
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+/* when one_device is NULL, it removes all device links */
+
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device)
 {
 	struct hd_struct *disk;
 	struct kobject *disk_kobj;
 
-	if (!fs_info->device_dir_kobj)
+	if (!fs_devices->device_dir_kobj)
 		return -EINVAL;
 
 	if (one_device && one_device->bdev) {
 		disk = one_device->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		sysfs_remove_link(fs_info->device_dir_kobj,
+		sysfs_remove_link(fs_devices->device_dir_kobj,
+						disk_kobj->name);
+	}
+
+	if (one_device)
+		return 0;
+
+	list_for_each_entry(one_device,
+			&fs_devices->devices, dev_list) {
+		if (!one_device->bdev)
+			continue;
+		disk = one_device->bdev->bd_part;
+		disk_kobj = &part_to_dev(disk)->kobj;
+
+		sysfs_remove_link(fs_devices->device_dir_kobj,
 						disk_kobj->name);
 	}
 
 	return 0;
 }
 
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
-		struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
 {
-	int error = 0;
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	struct btrfs_device *dev;
-
-	if (!fs_info->device_dir_kobj)
-		fs_info->device_dir_kobj = kobject_create_and_add("devices",
-						&fs_info->super_kobj);
+	if (!fs_devs->device_dir_kobj)
+		fs_devs->device_dir_kobj = kobject_create_and_add("devices",
+						&fs_devs->super_kobj);
 
-	if (!fs_info->device_dir_kobj)
+	if (!fs_devs->device_dir_kobj)
 		return -ENOMEM;
 
+	return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+				struct btrfs_device *one_device)
+{
+	int error = 0;
+	struct btrfs_device *dev;
+
 	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
 		struct hd_struct *disk;
 		struct kobject *disk_kobj;
@@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
 		disk = dev->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		error = sysfs_create_link(fs_info->device_dir_kobj,
+		error = sysfs_create_link(fs_devices->device_dir_kobj,
 					  disk_kobj, disk_kobj->name);
 		if (error)
 			break;
@@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry;
 /* Debugging tunables and exported data */
 u64 btrfs_debugfs_test;
 
+/*
+ * Can be called by the device discovery thread.
+ * And parent can be specified for seed device
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+				struct kobject *parent)
+{
+	int error;
+
+	init_completion(&fs_devs->kobj_unregister);
+	fs_devs->super_kobj.kset = btrfs_kset;
+	error = kobject_init_and_add(&fs_devs->super_kobj,
+				&btrfs_ktype, parent, "%pU", fs_devs->fsid);
+	return error;
+}
+
 int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 {
 	int error;
+	struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+	struct kobject *super_kobj = &fs_devs->super_kobj;
+
+	btrfs_set_fs_info_ptr(fs_info);
 
-	init_completion(&fs_info->kobj_unregister);
-	fs_info->super_kobj.kset = btrfs_kset;
-	error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
-				     "%pU", fs_info->fsid);
+	error = btrfs_kobj_add_device(fs_devs, NULL);
 	if (error)
 		return error;
 
-	error = sysfs_create_group(&fs_info->super_kobj,
-				   &btrfs_feature_attr_group);
+	error = sysfs_create_files(super_kobj, btrfs_attrs);
 	if (error) {
-		__btrfs_sysfs_remove_one(fs_info);
+		btrfs_kobj_rm_device(fs_devs, NULL);
 		return error;
 	}
 
-	error = addrm_unknown_feature_attrs(fs_info, true);
+	error = sysfs_create_group(super_kobj,
+				   &btrfs_feature_attr_group);
 	if (error)
 		goto failure;
 
-	error = btrfs_kobj_add_device(fs_info, NULL);
+	error = addrm_unknown_feature_attrs(fs_info, true);
 	if (error)
 		goto failure;
 
 	fs_info->space_info_kobj = kobject_create_and_add("allocation",
-						  &fs_info->super_kobj);
+						  super_kobj);
 	if (!fs_info->space_info_kobj) {
 		error = -ENOMEM;
 		goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3a4bbed72..6392527bc 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+				struct kobject *parent);
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
 #endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index c32a7ba76..846d277b1 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -21,6 +21,7 @@
 #include "../transaction.h"
 #include "../disk-io.h"
 #include "../qgroup.h"
+#include "../backref.h"
 
 static void init_dummy_trans(struct btrfs_trans_handle *trans)
 {
@@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct ulist *old_roots = NULL;
+	struct ulist *new_roots = NULL;
 	int ret;
 
 	init_dummy_trans(&trans);
@@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 		return ret;
 	}
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
-				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	/*
+	 * Since the test trans doesn't havee the complicated delayed refs,
+	 * we can only call btrfs_qgroup_account_extent() directly to test
+	 * quota.
+	 */
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
 	if (ret) {
-		test_msg("Couldn't add space to a qgroup %d\n", ret);
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
@@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 	if (ret)
 		return ret;
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Delayed qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
@@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 		test_msg("Qgroup counts didn't match expected values\n");
 		return -EINVAL;
 	}
+	old_roots = NULL;
+	new_roots = NULL;
+
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
 
 	ret = remove_extent_item(root, 4096, 4096);
 	if (ret)
 		return -EINVAL;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
-				      BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Couldn't remove space from the qgroup %d\n", ret);
-		return -EINVAL;
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return -EINVAL;
 	}
 
@@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct ulist *old_roots = NULL;
+	struct ulist *new_roots = NULL;
 	int ret;
 
 	init_dummy_trans(&trans);
@@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root)
 		return ret;
 	}
 
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
 	ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
 	if (ret)
 		return ret;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
-				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Couldn't add space to a qgroup %d\n", ret);
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Delayed qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
@@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root)
 		return -EINVAL;
 	}
 
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
 	ret = add_tree_ref(root, 4096, 4096, 0, 256);
 	if (ret)
 		return ret;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
-				      BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Qgroup record ref failed %d\n", ret);
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
@@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root)
 		return -EINVAL;
 	}
 
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
 	ret = remove_extent_ref(root, 4096, 4096, 0, 256);
 	if (ret)
 		return ret;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
-				      BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Qgroup record ref failed %d\n", ret);
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 94e909c5a..f5021fcb1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -225,12 +225,14 @@ loop:
 	cur_trans->dirty_bg_run = 0;
 
 	cur_trans->delayed_refs.href_root = RB_ROOT;
+	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 	cur_trans->delayed_refs.num_heads_ready = 0;
 	cur_trans->delayed_refs.pending_csums = 0;
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
+	cur_trans->delayed_refs.qgroup_to_skip = 0;
 
 	/*
 	 * although the tree mod log is per file system and not per transaction,
@@ -509,6 +511,7 @@ again:
 	h->transaction = cur_trans;
 	h->blocks_used = 0;
 	h->bytes_reserved = 0;
+	h->chunk_bytes_reserved = 0;
 	h->root = root;
 	h->delayed_ref_updates = 0;
 	h->use_count = 1;
@@ -792,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
+	btrfs_trans_release_chunk_metadata(trans);
+
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root) &&
 	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -1290,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (pending->error)
 		goto no_free_objectid;
 
+	/*
+	 * Make qgroup to skip current new snapshot's qgroupid, as it is
+	 * accounted by later btrfs_qgroup_inherit().
+	 */
+	btrfs_set_skip_qgroup(trans, objectid);
+
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
@@ -1298,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 						     to_reserve,
 						     BTRFS_RESERVE_NO_FLUSH);
 		if (pending->error)
-			goto no_free_objectid;
+			goto clear_skip_qgroup;
 	}
 
 	key.objectid = objectid;
@@ -1396,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
 	}
-
-	/*
-	 * We need to flush delayed refs in order to make sure all of our quota
-	 * operations have been done before we call btrfs_qgroup_inherit.
-	 */
-	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto fail;
-	}
-
-	ret = btrfs_qgroup_inherit(trans, fs_info,
-				   root->root_key.objectid,
-				   objectid, pending->inherit);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto fail;
-	}
-
 	/* see comments in should_cow_block() */
 	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
 	smp_wmb();
@@ -1497,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 			goto fail;
 		}
 	}
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	/*
+	 * account qgroup counters before qgroup_inherit()
+	 */
+	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+	if (ret)
+		goto fail;
+	ret = btrfs_qgroup_account_extents(trans, fs_info);
+	if (ret)
+		goto fail;
+	ret = btrfs_qgroup_inherit(trans, fs_info,
+				   root->root_key.objectid,
+				   objectid, pending->inherit);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
 fail:
 	pending->error = ret;
 dir_item_existed:
 	trans->block_rsv = rsv;
 	trans->bytes_reserved = 0;
+clear_skip_qgroup:
+	btrfs_clear_skip_qgroup(trans);
 no_free_objectid:
 	kfree(new_root_item);
 root_item_alloc_fail:
@@ -1963,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		goto scrub_continue;
 	}
 
+	/* Reocrd old roots for later qgroup accounting */
+	ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
 	/*
 	 * make sure none of the code above managed to slip in a
 	 * delayed item
@@ -2004,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
+	/*
+	 * Since fs roots are all committed, we can get a quite accurate
+	 * new_roots. So let's do quota accounting.
+	 */
+	ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+	if (ret < 0) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
 	ret = commit_cowonly_roots(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2054,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
 	clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
 
+	btrfs_trans_release_chunk_metadata(trans);
+
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	root->fs_info->running_transaction = NULL;
@@ -2114,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	if (current != root->fs_info->transaction_kthread)
+	if (current != root->fs_info->transaction_kthread &&
+	    current != root->fs_info->cleaner_kthread)
 		btrfs_run_delayed_iputs(root);
 
 	return ret;
@@ -2123,6 +2162,7 @@ scrub_continue:
 	btrfs_scrub_continue(root);
 cleanup_transaction:
 	btrfs_trans_release_metadata(trans, root);
+	btrfs_trans_release_chunk_metadata(trans);
 	trans->block_rsv = NULL;
 	if (trans->qgroup_reserved) {
 		btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b2475559..eb09c2067 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
+	u64 chunk_bytes_reserved;
 	u64 qgroup_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
@@ -153,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 	spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+/*
+ * Make qgroup codes to skip given qgroupid, means the old/new_roots for
+ * qgroup won't contain the qgroupid in it.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+					 u64 qgroupid)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	WARN_ON(delayed_refs->qgroup_to_skip);
+	delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	WARN_ON(!delayed_refs->qgroup_to_skip);
+	delayed_refs->qgroup_to_skip = 0;
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a63719cc9..a4b9c8b2d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 		goto out;
 
-	if (btrfs_test_opt(root, SSD))
-		goto out;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4920fceff..9c45431e6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
 				     &ordered->flags))
 			continue;
 
-		if (ordered->csum_bytes_left) {
-			btrfs_start_ordered_extent(inode, ordered, 0);
-			wait_event(ordered->wait,
-				   ordered->csum_bytes_left == 0);
-		}
-
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
 			if (ret)
@@ -4123,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
 	return 0;
 }
 
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode,
+				struct btrfs_path *path,
+				struct btrfs_path *dst_path)
+{
+	int ret;
+	struct btrfs_key key;
+	const u64 ino = btrfs_ino(inode);
+	int ins_nr = 0;
+	int start_slot = 0;
+
+	key.objectid = ino;
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (true) {
+		int slot = path->slots[0];
+		struct extent_buffer *leaf = path->nodes[0];
+		int nritems = btrfs_header_nritems(leaf);
+
+		if (slot >= nritems) {
+			if (ins_nr > 0) {
+				u64 last_extent = 0;
+
+				ret = copy_items(trans, inode, dst_path, path,
+						 &last_extent, start_slot,
+						 ins_nr, 1, 0);
+				/* can't be 1, extent items aren't processed */
+				ASSERT(ret <= 0);
+				if (ret < 0)
+					return ret;
+				ins_nr = 0;
+			}
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ret;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		if (ins_nr == 0)
+			start_slot = slot;
+		ins_nr++;
+		path->slots[0]++;
+		cond_resched();
+	}
+	if (ins_nr > 0) {
+		u64 last_extent = 0;
+
+		ret = copy_items(trans, inode, dst_path, path,
+				 &last_extent, start_slot,
+				 ins_nr, 1, 0);
+		/* can't be 1, extent items aren't processed */
+		ASSERT(ret <= 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ *      1) create file with 128Kb of data
+ *      2) truncate file to 64Kb
+ *      3) truncate file to 256Kb
+ *      4) fsync file
+ *      5) <crash/power failure>
+ *      6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, that is already done
+ * by copy_items(). We also only need to do this in the full sync path, where we
+ * lookup for extents from the fs/subvol tree only. In the fast path case, we
+ * lookup the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct inode *inode,
+				   struct btrfs_path *path)
+{
+	int ret;
+	struct btrfs_key key;
+	u64 hole_start;
+	u64 hole_size;
+	struct extent_buffer *leaf;
+	struct btrfs_root *log = root->log_root;
+	const u64 ino = btrfs_ino(inode);
+	const u64 i_size = i_size_read(inode);
+
+	if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+		return 0;
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ASSERT(ret != 0);
+	if (ret < 0)
+		return ret;
+
+	ASSERT(path->slots[0] > 0);
+	path->slots[0]--;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* inode does not have any extents */
+		hole_start = 0;
+		hole_size = i_size;
+	} else {
+		struct btrfs_file_extent_item *extent;
+		u64 len;
+
+		/*
+		 * If there's an extent beyond i_size, an explicit hole was
+		 * already inserted by copy_items().
+		 */
+		if (key.offset >= i_size)
+			return 0;
+
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, extent) ==
+		    BTRFS_FILE_EXTENT_INLINE) {
+			len = btrfs_file_extent_inline_len(leaf,
+							   path->slots[0],
+							   extent);
+			ASSERT(len == i_size);
+			return 0;
+		}
+
+		len = btrfs_file_extent_num_bytes(leaf, extent);
+		/* Last extent goes beyond i_size, no need to log a hole. */
+		if (key.offset + len > i_size)
+			return 0;
+		hole_start = key.offset + len;
+		hole_size = i_size - hole_start;
+	}
+	btrfs_release_path(path);
+
+	/* Last extent ends at i_size. */
+	if (hole_size == 0)
+		return 0;
+
+	hole_size = ALIGN(hole_size, root->sectorsize);
+	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+				       hole_size, 0, hole_size, 0, 0, 0);
+	return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -4295,6 +4470,25 @@ again:
 		if (min_key.type == BTRFS_INODE_ITEM_KEY)
 			need_log_inode_item = false;
 
+		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (ins_nr == 0)
+				goto next_slot;
+			ret = copy_items(trans, inode, dst_path, path,
+					 &last_extent, ins_start_slot,
+					 ins_nr, inode_only, logged_isize);
+			if (ret < 0) {
+				err = ret;
+				goto out_unlock;
+			}
+			ins_nr = 0;
+			if (ret) {
+				btrfs_release_path(path);
+				continue;
+			}
+			goto next_slot;
+		}
+
 		src = path->nodes[0];
 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
 			ins_nr++;
@@ -4362,6 +4556,18 @@ next_slot:
 		ins_nr = 0;
 	}
 
+	btrfs_release_path(path);
+	btrfs_release_path(dst_path);
+	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+	if (err)
+		goto out_unlock;
+	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+		btrfs_release_path(path);
+		btrfs_release_path(dst_path);
+		err = btrfs_log_trailing_hole(trans, root, inode, path);
+		if (err)
+			goto out_unlock;
+	}
 log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 840a38b27..91feb2bde 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
 	return NULL;
 }
 
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+	rb_erase(&node->rb_node, &ulist->root);
+	list_del(&node->list);
+	kfree(node);
+	BUG_ON(ulist->nnodes == 0);
+	ulist->nnodes--;
+}
+
 static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
 {
 	struct rb_node **p = &ulist->root.rb_node;
@@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 
 	node->val = val;
 	node->aux = aux;
-#ifdef CONFIG_BTRFS_DEBUG
-	node->seqnum = ulist->nnodes;
-#endif
 
 	ret = ulist_rbtree_insert(ulist, node);
 	ASSERT(!ret);
@@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 	return 1;
 }
 
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist:	ulist to remove node from
+ * @val:	value to delete
+ * @aux:	aux to delete
+ *
+ * The deletion will only be done when *BOTH* val and aux matches.
+ * Return 0 for successful delete.
+ * Return > 0 for not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+	struct ulist_node *node;
+
+	node = ulist_rbtree_search(ulist, val);
+	/* Not found */
+	if (!node)
+		return 1;
+
+	if (node->aux != aux)
+		return 1;
+
+	/* Found and delete */
+	ulist_rbtree_erase(ulist, node);
+	return 0;
+}
+
 /**
  * ulist_next - iterate ulist
  * @ulist:	ulist to iterate
@@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
 		uiter->cur_list = uiter->cur_list->next;
 	} else {
 		uiter->cur_list = ulist->nodes.next;
-#ifdef CONFIG_BTRFS_DEBUG
-		uiter->i = 0;
-#endif
 	}
 	node = list_entry(uiter->cur_list, struct ulist_node, list);
-#ifdef CONFIG_BTRFS_DEBUG
-	ASSERT(node->seqnum == uiter->i);
-	ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
-	uiter->i++;
-#endif
 	return node;
 }
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 4c29db604..a01a2c458 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist);
 int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
 int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 		    u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
 
 /* just like ulist_add_merge() but take a pointer for the aux data */
 static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 174f5e1e0..fbe7c1045 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
+struct list_head *btrfs_get_fs_uuids(void)
+{
+	return &fs_uuids;
+}
 
 static struct btrfs_fs_devices *__alloc_fs_devices(void)
 {
@@ -345,7 +349,7 @@ loop_lock:
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
 
-		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
 
 		/*
 		 * if we're doing the sync list, record that our
@@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work)
 	run_scheduled_bios(device);
 }
 
+
+void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+{
+	struct btrfs_fs_devices *fs_devs;
+	struct btrfs_device *dev;
+
+	if (!cur_dev->name)
+		return;
+
+	list_for_each_entry(fs_devs, &fs_uuids, list) {
+		int del = 1;
+
+		if (fs_devs->opened)
+			continue;
+		if (fs_devs->seeding)
+			continue;
+
+		list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+
+			if (dev == cur_dev)
+				continue;
+			if (!dev->name)
+				continue;
+
+			/*
+			 * Todo: This won't be enough. What if the same device
+			 * comes back (with new uuid and) with its mapper path?
+			 * But for now, this does help as mostly an admin will
+			 * either use mapper or non mapper path throughout.
+			 */
+			rcu_read_lock();
+			del = strcmp(rcu_str_deref(dev->name),
+						rcu_str_deref(cur_dev->name));
+			rcu_read_unlock();
+			if (!del)
+				break;
+		}
+
+		if (!del) {
+			/* delete the stale device */
+			if (fs_devs->num_devices == 1) {
+				btrfs_sysfs_remove_fsid(fs_devs);
+				list_del(&fs_devs->list);
+				free_fs_devices(fs_devs);
+			} else {
+				fs_devs->num_devices--;
+				list_del(&dev->dev_list);
+				rcu_string_free(dev->name);
+				kfree(dev);
+			}
+			break;
+		}
+	}
+}
+
 /*
  * Add new device to list of registered devices
  *
@@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path,
 	if (!fs_devices->opened)
 		device->generation = found_transid;
 
+	/*
+	 * if there is new btrfs on an already registered device,
+	 * then remove the stale device entry.
+	 */
+	btrfs_free_stale_device(device);
+
 	*fs_devices_ret = fs_devices;
 
 	return ret;
@@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head)
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_device *device;
+	struct btrfs_device *device, *tmp;
 
 	if (--fs_devices->opened > 0)
 		return 0;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
 		struct btrfs_device *new_device;
 		struct rcu_string *name;
 
@@ -1067,15 +1132,31 @@ again:
 
 		map = (struct map_lookup *)em->bdev;
 		for (i = 0; i < map->num_stripes; i++) {
+			u64 end;
+
 			if (map->stripes[i].dev != device)
 				continue;
 			if (map->stripes[i].physical >= physical_start + len ||
 			    map->stripes[i].physical + em->orig_block_len <=
 			    physical_start)
 				continue;
-			*start = map->stripes[i].physical +
-				em->orig_block_len;
-			ret = 1;
+			/*
+			 * Make sure that while processing the pinned list we do
+			 * not override our *start with a lower value, because
+			 * we can have pinned chunks that fall within this
+			 * device hole and that have lower physical addresses
+			 * than the pending chunks we processed before. If we
+			 * do not take this special care we can end up getting
+			 * 2 pending chunks that start at the same physical
+			 * device offsets because the end offset of a pinned
+			 * chunk can be equal to the start offset of some
+			 * pending chunk.
+			 */
+			end = map->stripes[i].physical + em->orig_block_len;
+			if (end > *start) {
+				*start = end;
+				ret = 1;
+			}
 		}
 	}
 	if (search_list == &trans->transaction->pending_chunks) {
@@ -1706,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		device->fs_devices->open_devices--;
 		/* remove sysfs entry */
-		btrfs_kobj_rm_device(root->fs_info, device);
+		btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
 	}
 
 	call_rcu(&device->rcu, free_device);
@@ -1875,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	mutex_lock(&uuid_mutex);
 	WARN_ON(!tgtdev);
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+
+	btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+
 	if (tgtdev->bdev) {
 		btrfs_scratch_superblock(tgtdev);
 		fs_info->fs_devices->open_devices--;
@@ -2211,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 				    tmp + 1);
 
 	/* add sysfs device entry */
-	btrfs_kobj_add_device(root->fs_info, device);
+	btrfs_kobj_add_device(root->fs_info->fs_devices, device);
 
 	/*
 	 * we've got more storage, clear any full flags on the space
@@ -2252,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		 */
 		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
 						root->fs_info->fsid);
-		if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
-			goto error_trans;
+		if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+								fsid_buf))
+			pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
 	}
 
 	root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2289,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 error_trans:
 	btrfs_end_transaction(trans, root);
 	rcu_string_free(device->name);
-	btrfs_kobj_rm_device(root->fs_info, device);
+	btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
 	kfree(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
@@ -2609,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		return -EINVAL;
 	}
 	map = (struct map_lookup *)em->bdev;
+	lock_chunks(root->fs_info->chunk_root);
+	check_system_chunk(trans, extent_root, map->type);
+	unlock_chunks(root->fs_info->chunk_root);
 
 	for (i = 0; i < map->num_stripes; i++) {
 		struct btrfs_device *device = map->stripes[i].dev;
@@ -2678,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
 
+	/*
+	 * Prevent races with automatic removal of unused block groups.
+	 * After we relocate and before we remove the chunk with offset
+	 * chunk_offset, automatic removal of the block group can kick in,
+	 * resulting in a failure when calling btrfs_remove_chunk() below.
+	 *
+	 * Make sure to acquire this mutex before doing a tree search (dev
+	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
+	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
+	 * we release the path used to search the chunk/dev tree and before
+	 * the current task acquires this mutex and calls us.
+	 */
+	ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+
 	ret = btrfs_can_relocate(extent_root, chunk_offset);
 	if (ret)
 		return -ENOSPC;
@@ -2726,13 +2828,18 @@ again:
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
+		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
-		if (ret < 0)
+		if (ret < 0) {
+			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 			goto error;
+		}
 		BUG_ON(ret == 0); /* Corruption */
 
 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
 					  key.type);
+		if (ret)
+			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 		if (ret < 0)
 			goto error;
 		if (ret > 0)
@@ -2755,6 +2862,7 @@ again:
 			else
 				BUG_ON(ret);
 		}
+		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 
 		if (found_key.offset == 0)
 			break;
@@ -3211,9 +3319,12 @@ again:
 			goto error;
 		}
 
+		mutex_lock(&fs_info->delete_unused_bgs_mutex);
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
-		if (ret < 0)
+		if (ret < 0) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			goto error;
+		}
 
 		/*
 		 * this shouldn't happen, it means the last relocate
@@ -3225,6 +3336,7 @@ again:
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
 		if (ret) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			ret = 0;
 			break;
 		}
@@ -3233,8 +3345,10 @@ again:
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
-		if (found_key.objectid != key.objectid)
+		if (found_key.objectid != key.objectid) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			break;
+		}
 
 		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 
@@ -3247,10 +3361,13 @@ again:
 		ret = should_balance_chunk(chunk_root, leaf, chunk,
 					   found_key.offset);
 		btrfs_release_path(path);
-		if (!ret)
+		if (!ret) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			goto loop;
+		}
 
 		if (counting) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			spin_lock(&fs_info->balance_lock);
 			bctl->stat.expected++;
 			spin_unlock(&fs_info->balance_lock);
@@ -3260,6 +3377,7 @@ again:
 		ret = btrfs_relocate_chunk(chunk_root,
 					   found_key.objectid,
 					   found_key.offset);
+		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
 			goto error;
 		if (ret == -ENOSPC) {
@@ -3908,9 +4026,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
 	uuid_root = btrfs_create_tree(trans, fs_info,
 				      BTRFS_UUID_TREE_OBJECTID);
 	if (IS_ERR(uuid_root)) {
-		btrfs_abort_transaction(trans, tree_root,
-					PTR_ERR(uuid_root));
-		return PTR_ERR(uuid_root);
+		ret = PTR_ERR(uuid_root);
+		btrfs_abort_transaction(trans, tree_root, ret);
+		return ret;
 	}
 
 	fs_info->uuid_root = uuid_root;
@@ -3965,6 +4083,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	int slot;
 	int failed = 0;
 	bool retried = false;
+	bool checked_pending_chunks = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
@@ -3998,11 +4117,16 @@ again:
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	do {
+		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0)
+		if (ret < 0) {
+			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 			goto done;
+		}
 
 		ret = btrfs_previous_item(root, path, 0, key.type);
+		if (ret)
+			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 		if (ret < 0)
 			goto done;
 		if (ret) {
@@ -4016,6 +4140,7 @@ again:
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
 		if (key.objectid != device->devid) {
+			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4024,6 +4149,7 @@ again:
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size) {
+			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4033,6 +4159,7 @@ again:
 		btrfs_release_path(path);
 
 		ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
 			goto done;
 		if (ret == -ENOSPC)
@@ -4045,15 +4172,6 @@ again:
 		goto again;
 	} else if (failed && retried) {
 		ret = -ENOSPC;
-		lock_chunks(root);
-
-		btrfs_device_set_total_bytes(device, old_size);
-		if (device->writeable)
-			device->fs_devices->total_rw_bytes += diff;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space += diff;
-		spin_unlock(&root->fs_info->free_chunk_lock);
-		unlock_chunks(root);
 		goto done;
 	}
 
@@ -4065,6 +4183,35 @@ again:
 	}
 
 	lock_chunks(root);
+
+	/*
+	 * We checked in the above loop all device extents that were already in
+	 * the device tree. However before we have updated the device's
+	 * total_bytes to the new size, we might have had chunk allocations that
+	 * have not complete yet (new block groups attached to transaction
+	 * handles), and therefore their device extents were not yet in the
+	 * device tree and we missed them in the loop above. So if we have any
+	 * pending chunk using a device extent that overlaps the device range
+	 * that we can not use anymore, commit the current transaction and
+	 * repeat the search on the device tree - this way we guarantee we will
+	 * not have chunks using device extents that end beyond 'new_size'.
+	 */
+	if (!checked_pending_chunks) {
+		u64 start = new_size;
+		u64 len = old_size - new_size;
+
+		if (contains_pending_extent(trans, device, &start, len)) {
+			unlock_chunks(root);
+			checked_pending_chunks = true;
+			failed = 0;
+			retried = false;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				goto done;
+			goto again;
+		}
+	}
+
 	btrfs_device_set_disk_total_bytes(device, new_size);
 	if (list_empty(&device->resized_list))
 		list_add_tail(&device->resized_list,
@@ -4079,6 +4226,16 @@ again:
 	btrfs_end_transaction(trans, root);
 done:
 	btrfs_free_path(path);
+	if (ret) {
+		lock_chunks(root);
+		btrfs_device_set_total_bytes(device, old_size);
+		if (device->writeable)
+			device->fs_devices->total_rw_bytes += diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+		unlock_chunks(root);
+	}
 	return ret;
 }
 
@@ -5586,17 +5743,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
 {
-	if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
-		bio_endio_nodec(bio, err);
-	else
-		bio_endio(bio, err);
+	bio->bi_private = bbio->private;
+	bio->bi_end_io = bbio->end_io;
+	bio_endio(bio, err);
+
 	btrfs_put_bbio(bbio);
 }
 
 static void btrfs_end_bio(struct bio *bio, int err)
 {
 	struct btrfs_bio *bbio = bio->bi_private;
-	struct btrfs_device *dev = bbio->stripes[0].dev;
 	int is_orig_bio = 0;
 
 	if (err) {
@@ -5604,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		if (err == -EIO || err == -EREMOTEIO) {
 			unsigned int stripe_index =
 				btrfs_io_bio(bio)->stripe_index;
+			struct btrfs_device *dev;
 
 			BUG_ON(stripe_index >= bbio->num_stripes);
 			dev = bbio->stripes[stripe_index].dev;
@@ -5633,8 +5790,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio = bbio->orig_bio;
 		}
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the btrfs bio
@@ -5816,8 +5971,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 		/* Shoud be the original bio. */
 		WARN_ON(bio != bbio->orig_bio);
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_iter.bi_sector = logical >> 9;
 
@@ -5898,10 +6051,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		if (dev_nr < total_devs - 1) {
 			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
-		} else {
+		} else
 			bio = first_bio;
-			bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
-		}
 
 		submit_stripe_bio(root, bbio, bio,
 				  bbio->stripes[dev_nr].physical, dev_nr, rw,
@@ -6078,6 +6229,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 				free_extent_map(em);
 				return -EIO;
 			}
+			btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
+						devid, uuid);
 		}
 		map->stripes[i].dev->in_fs_metadata = 1;
 	}
@@ -6197,10 +6350,11 @@ static int read_one_dev(struct btrfs_root *root,
 		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
 
-		btrfs_warn(root->fs_info, "devid %llu missing", devid);
 		device = add_missing_dev(root, fs_devices, devid, dev_uuid);
 		if (!device)
 			return -ENOMEM;
+		btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+				devid, dev_uuid);
 	} else {
 		if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
 			return -EIO;
@@ -6728,3 +6882,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
 	}
 	unlock_chunks(root);
 }
+
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	while (fs_devices) {
+		fs_devices->fs_info = fs_info;
+		fs_devices = fs_devices->seed;
+	}
+}
+
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	while (fs_devices) {
+		fs_devices->fs_info = NULL;
+		fs_devices = fs_devices->seed;
+	}
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc31331a..95842a909 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@ struct btrfs_fs_devices {
 	 * nonrot flag set
 	 */
 	int rotating;
+
+	struct btrfs_fs_info *fs_info;
+	/* sysfs kobjects */
+	struct kobject super_kobj;
+	struct kobject *device_dir_kobj;
+	struct completion kobj_unregister;
 };
 
 #define BTRFS_BIO_INLINE_CSUM_SIZE	64
@@ -292,8 +298,6 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED	(1 << 0)
-
 struct btrfs_bio {
 	atomic_t refs;
 	atomic_t stripes_pending;
@@ -537,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+struct list_head *btrfs_get_fs_uuids(void);
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
 
 #endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 8c50a2250..1cf7a53a0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -30,6 +30,7 @@
 #include <linux/quotaops.h>
 #include <linux/highmem.h>
 #include <linux/export.h>
+#include <linux/backing-dev.h>
 #include <linux/writeback.h>
 #include <linux/hash.h>
 #include <linux/suspend.h>
@@ -44,6 +45,9 @@
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags,
+			 struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
  */
-static void __set_page_dirty(struct page *page,
-		struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+			     struct mem_cgroup *memcg, int warn)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
 
 /*
@@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page,
 int __set_page_dirty_buffers(struct page *page)
 {
 	int newly_dirty;
+	struct mem_cgroup *memcg;
 	struct address_space *mapping = page_mapping(page);
 
 	if (unlikely(!mapping))
@@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page)
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
 	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
+		__set_page_dirty(page, mapping, memcg, 1);
+
+	mem_cgroup_end_page_stat(memcg);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
 	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
 	if (!test_set_buffer_dirty(bh)) {
 		struct page *page = bh->b_page;
+		struct address_space *mapping = NULL;
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_begin_page_stat(page);
 		if (!TestSetPageDirty(page)) {
-			struct address_space *mapping = page_mapping(page);
+			mapping = page_mapping(page);
 			if (mapping)
-				__set_page_dirty(page, mapping, 0);
+				__set_page_dirty(page, mapping, memcg, 0);
 		}
+		mem_cgroup_end_page_stat(memcg);
+		if (mapping)
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 }
 EXPORT_SYMBOL(mark_buffer_dirty);
@@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC : WRITE);
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(write_op, bh);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1828,7 +1851,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(write_op, bh);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -2450,7 +2473,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 * Update file times before taking page lock. We may end up failing the
 	 * fault so this update may be superfluous but who really cares...
 	 */
-	vma_file_update_time(vma);
+	file_update_time(vma->vm_file);
 
 	ret = __block_page_mkwrite(vma, vmf, get_block);
 	sb_end_pagefault(sb);
@@ -2938,10 +2961,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 {
 	struct buffer_head *bh = bio->bi_private;
 
-	if (err == -EOPNOTSUPP) {
-		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-	}
-
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
 		set_bit(BH_Quiet, &bh->b_state);
 
@@ -2997,10 +3016,10 @@ void guard_bio_eod(int rw, struct bio *bio)
 	}
 }
 
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags, struct writeback_control *wbc)
 {
 	struct bio *bio;
-	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
@@ -3020,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, bh->b_page, bh->b_size);
+	}
+
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3041,20 +3065,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	if (buffer_prio(bh))
 		rw |= REQ_PRIO;
 
-	bio_get(bio);
 	submit_bio(rw, bio);
+	return 0;
+}
 
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-
-	bio_put(bio);
-	return ret;
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+	return submit_bh_wbc(rw, bh, bio_flags, NULL);
 }
 EXPORT_SYMBOL_GPL(_submit_bh);
 
 int submit_bh(int rw, struct buffer_head *bh)
 {
-	return _submit_bh(rw, bh, 0);
+	return submit_bh_wbc(rw, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
@@ -3243,8 +3266,8 @@ int try_to_free_buffers(struct page *page)
 	 * to synchronise against __set_page_dirty_buffers and prevent the
 	 * dirty bit from being lost.
 	 */
-	if (ret && TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
+	if (ret)
+		cancel_dirty_page(page);
 	spin_unlock(&mapping->private_lock);
 out:
 	if (buffers_to_free) {
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 8c52472d2..aecd0859e 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -43,7 +43,6 @@ struct cachefiles_object {
 	loff_t				i_size;		/* object size */
 	unsigned long			flags;
 #define CACHEFILES_OBJECT_ACTIVE	0		/* T if marked active */
-#define CACHEFILES_OBJECT_BURIED	1		/* T if preemptively buried */
 	atomic_t			usage;		/* object usage count */
 	uint8_t				type;		/* object type */
 	uint8_t				new;		/* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ab857ab9f..fc1056f5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
  *   call vfs_unlink(), vfs_rmdir() or vfs_rename()
  */
 static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
-					  struct dentry *dentry)
+					  struct dentry *dentry,
+					  enum fscache_why_object_killed why)
 {
 	struct cachefiles_object *object;
 	struct rb_node *p;
@@ -132,8 +133,9 @@ found_dentry:
 		pr_err("\n");
 		pr_err("Error: Can't preemptively bury live object\n");
 		cachefiles_printk_object(object, NULL);
-	} else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
-		pr_err("Error: Object already preemptively buried\n");
+	} else {
+		if (why != FSCACHE_OBJECT_IS_STALE)
+			fscache_object_mark_killed(&object->fscache, why);
 	}
 
 	write_unlock(&cache->active_lock);
@@ -265,7 +267,8 @@ requeue:
 static int cachefiles_bury_object(struct cachefiles_cache *cache,
 				  struct dentry *dir,
 				  struct dentry *rep,
-				  bool preemptive)
+				  bool preemptive,
+				  enum fscache_why_object_killed why)
 {
 	struct dentry *grave, *trap;
 	struct path path, path_to_graveyard;
@@ -289,7 +292,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 			ret = vfs_unlink(d_inode(dir), rep, NULL);
 
 			if (preemptive)
-				cachefiles_mark_object_buried(cache, rep);
+				cachefiles_mark_object_buried(cache, rep, why);
 		}
 
 		mutex_unlock(&d_inode(dir)->i_mutex);
@@ -394,7 +397,7 @@ try_again:
 					    "Rename failed with error %d", ret);
 
 		if (preemptive)
-			cachefiles_mark_object_buried(cache, rep);
+			cachefiles_mark_object_buried(cache, rep, why);
 	}
 
 	unlock_rename(cache->graveyard, dir);
@@ -422,7 +425,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 
 	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
 
-	if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+	if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
 		/* object allocation for the same key preemptively deleted this
 		 * object's file so that it could create its own file */
 		_debug("object preemptively buried");
@@ -433,7 +436,8 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 		 * may have been renamed */
 		if (dir == object->dentry->d_parent) {
 			ret = cachefiles_bury_object(cache, dir,
-						     object->dentry, false);
+						     object->dentry, false,
+						     FSCACHE_OBJECT_WAS_RETIRED);
 		} else {
 			/* it got moved, presumably by cachefilesd culling it,
 			 * so it's no longer in the key path and we can ignore
@@ -522,7 +526,7 @@ lookup_again:
 		if (d_is_negative(next)) {
 			ret = cachefiles_has_space(cache, 1, 0);
 			if (ret < 0)
-				goto create_error;
+				goto no_space_error;
 
 			path.dentry = dir;
 			ret = security_path_mkdir(&path, next, 0);
@@ -551,7 +555,7 @@ lookup_again:
 		if (d_is_negative(next)) {
 			ret = cachefiles_has_space(cache, 1, 0);
 			if (ret < 0)
-				goto create_error;
+				goto no_space_error;
 
 			path.dentry = dir;
 			ret = security_path_mknod(&path, next, S_IFREG, 0);
@@ -602,7 +606,8 @@ lookup_again:
 			 * mutex) */
 			object->dentry = NULL;
 
-			ret = cachefiles_bury_object(cache, dir, next, true);
+			ret = cachefiles_bury_object(cache, dir, next, true,
+						     FSCACHE_OBJECT_IS_STALE);
 			dput(next);
 			next = NULL;
 
@@ -610,6 +615,7 @@ lookup_again:
 				goto delete_error;
 
 			_debug("redo lookup");
+			fscache_object_retrying_stale(&object->fscache);
 			goto lookup_again;
 		}
 	}
@@ -662,6 +668,8 @@ lookup_again:
 	_leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino);
 	return 0;
 
+no_space_error:
+	fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE);
 create_error:
 	_debug("create error %d", ret);
 	if (ret == -EIO)
@@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
 	/*  actually remove the victim (drops the dir mutex) */
 	_debug("bury");
 
-	ret = cachefiles_bury_object(cache, dir, victim, false);
+	ret = cachefiles_bury_object(cache, dir, victim, false,
+				     FSCACHE_OBJECT_WAS_CULLED);
 	if (ret < 0)
 		goto error;
 
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 64fa24834..8f84646f1 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
 		val_size2 = posix_acl_xattr_size(default_acl->a_count);
 
 	err = -ENOMEM;
-	tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
+	tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
 	if (!tmp_buf)
 		goto out_err;
-	pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
+	pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
 	if (!pagelist)
 		goto out_err;
 	ceph_pagelist_init(pagelist);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e162bcd10..890c50971 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page)
 	inode = mapping->host;
 	ci = ceph_inode(inode);
 
-	/*
-	 * Note that we're grabbing a snapc ref here without holding
-	 * any locks!
-	 */
-	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-
 	/* dirty the head */
 	spin_lock(&ci->i_ceph_lock);
-	if (ci->i_head_snapc == NULL)
-		ci->i_head_snapc = ceph_get_snap_context(snapc);
-	++ci->i_wrbuffer_ref_head;
+	BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
+	if (__ceph_have_pending_cap_snap(ci)) {
+		struct ceph_cap_snap *capsnap =
+				list_last_entry(&ci->i_cap_snaps,
+						struct ceph_cap_snap,
+						ci_item);
+		snapc = ceph_get_snap_context(capsnap->context);
+		capsnap->dirty_pages++;
+	} else {
+		BUG_ON(!ci->i_head_snapc);
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
+		++ci->i_wrbuffer_ref_head;
+	}
 	if (ci->i_wrbuffer_ref == 0)
 		ihold(inode);
 	++ci->i_wrbuffer_ref;
@@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 
 	/* build page vector */
 	nr_pages = calc_pages_for(0, len);
-	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
 	ret = -ENOMEM;
 	if (!pages)
 		goto out;
@@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
  		dout("start_read %p adding %p idx %lu\n", inode, page,
 		     page->index);
 		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
-					  GFP_NOFS)) {
+					  GFP_KERNEL)) {
 			ceph_fscache_uncache_page(inode, page);
 			page_cache_release(page);
 			dout("start_read %p add_to_page_cache failed %p\n",
@@ -436,7 +440,7 @@ out:
  * only snap context we are allowed to write back.
  */
 static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-						    u64 *snap_size)
+						    loff_t *snap_size)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_snap_context *snapc = NULL;
@@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	struct ceph_osd_client *osdc;
 	struct ceph_snap_context *snapc, *oldest;
 	loff_t page_off = page_offset(page);
+	loff_t snap_size = -1;
 	long writeback_stat;
-	u64 truncate_size, snap_size = 0;
+	u64 truncate_size;
 	u32 truncate_seq;
 	int err = 0, len = PAGE_CACHE_SIZE;
 
@@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	spin_lock(&ci->i_ceph_lock);
 	truncate_seq = ci->i_truncate_seq;
 	truncate_size = ci->i_truncate_size;
-	if (!snap_size)
+	if (snap_size == -1)
 		snap_size = i_size_read(inode);
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 	unsigned wsize = 1 << inode->i_blkbits;
 	struct ceph_osd_request *req = NULL;
 	int do_sync = 0;
-	u64 truncate_size, snap_size;
+	loff_t snap_size, i_size;
+	u64 truncate_size;
 	u32 truncate_seq;
 
 	/*
@@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 retry:
 	/* find oldest snap context with dirty data */
 	ceph_put_snap_context(snapc);
-	snap_size = 0;
+	snap_size = -1;
 	snapc = get_oldest_context(inode, &snap_size);
 	if (!snapc) {
 		/* hmm, why does writepages get called when there
@@ -749,16 +755,13 @@ retry:
 		dout(" no snap context with dirty data?\n");
 		goto out;
 	}
-	if (snap_size == 0)
-		snap_size = i_size_read(inode);
 	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
 	     snapc, snapc->seq, snapc->num_snaps);
 
 	spin_lock(&ci->i_ceph_lock);
 	truncate_seq = ci->i_truncate_seq;
 	truncate_size = ci->i_truncate_size;
-	if (!snap_size)
-		snap_size = i_size_read(inode);
+	i_size = i_size_read(inode);
 	spin_unlock(&ci->i_ceph_lock);
 
 	if (last_snapc && snapc != last_snapc) {
@@ -828,8 +831,10 @@ get_more_pages:
 				dout("waiting on writeback %p\n", page);
 				wait_on_page_writeback(page);
 			}
-			if (page_offset(page) >= snap_size) {
-				dout("%p page eof %llu\n", page, snap_size);
+			if (page_offset(page) >=
+			    (snap_size == -1 ? i_size : snap_size)) {
+				dout("%p page eof %llu\n", page,
+				     (snap_size == -1 ? i_size : snap_size));
 				done = 1;
 				unlock_page(page);
 				break;
@@ -884,7 +889,8 @@ get_more_pages:
 				}
 
 				if (do_sync)
-					osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+					osd_req_op_init(req, 1,
+							CEPH_OSD_OP_STARTSYNC, 0);
 
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
@@ -944,10 +950,18 @@ get_more_pages:
 		}
 
 		/* Format the osd request message and submit the write */
-
 		offset = page_offset(pages[0]);
-		len = min(snap_size - offset,
-			  (u64)locked_pages << PAGE_CACHE_SHIFT);
+		len = (u64)locked_pages << PAGE_CACHE_SHIFT;
+		if (snap_size == -1) {
+			len = min(len, (u64)i_size_read(inode) - offset);
+			 /* writepages_finish() clears writeback pages
+			  * according to the data length, so make sure
+			  * data length covers all locked pages */
+			len = max(len, 1 +
+				((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
+		} else {
+			len = min(len, snap_size - offset);
+		}
 		dout("writepages got %d pages at %llu~%llu\n",
 		     locked_pages, offset, len);
 
@@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file,
 {
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t page_off = pos & PAGE_CACHE_MASK;
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
@@ -1044,10 +1057,6 @@ retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
 	wait_on_page_writeback(page);
 
-	/* check snap context */
-	BUG_ON(!ci->i_snap_realm);
-	down_read(&mdsc->snap_rwsem);
-	BUG_ON(!ci->i_snap_realm->cached_context);
 	snapc = page_snap_context(page);
 	if (snapc && snapc != ci->i_head_snapc) {
 		/*
@@ -1055,7 +1064,6 @@ retry_locked:
 		 * context!  is it writeable now?
 		 */
 		oldest = get_oldest_context(inode, NULL);
-		up_read(&mdsc->snap_rwsem);
 
 		if (snapc->seq > oldest->seq) {
 			ceph_put_snap_context(oldest);
@@ -1112,7 +1120,6 @@ retry_locked:
 	}
 
 	/* we need to read it. */
-	up_read(&mdsc->snap_rwsem);
 	r = readpage_nounlock(file, page);
 	if (r < 0)
 		goto fail_nosnap;
@@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 
 /*
  * we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
+ * except adjust dirty page accounting
  */
 static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  loff_t pos, unsigned len, unsigned copied,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file_inode(file);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
 
@@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 	set_page_dirty(page);
 
 	unlock_page(page);
-	up_read(&mdsc->snap_rwsem);
 	page_cache_release(page);
 
 	if (check_cap)
@@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi = vma->vm_file->private_data;
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_cap_flush *prealloc_cf;
 	struct page *page = vmf->page;
 	loff_t off = page_offset(page);
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return VM_FAULT_SIGBUS;
+
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
 		if (off == 0) {
@@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0)
-			return VM_FAULT_SIGBUS;
+		if (ret < 0) {
+			ret = VM_FAULT_SIGBUS;
+			goto out_free;
+		}
 	}
 
 	if (off + PAGE_CACHE_SIZE <= size)
@@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 			break;
 		if (ret != -ERESTARTSYS) {
 			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
+			ret = VM_FAULT_SIGBUS;
+			goto out_free;
 		}
 	}
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
@@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret == 0) {
 		/* success.  we'll keep the page locked. */
 		set_page_dirty(page);
-		up_read(&mdsc->snap_rwsem);
 		ret = VM_FAULT_LOCKED;
 	} else {
 		if (ret == -ENOMEM)
@@ -1389,7 +1398,8 @@ out:
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1398,6 +1408,8 @@ out:
 	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
 	     inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
+out_free:
+	ceph_free_cap_flush(prealloc_cf);
 
 	return ret;
 }
@@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 				    ceph_vino(inode), 0, &len, 0, 1,
 				    CEPH_OSD_OP_CREATE,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
-				    ci->i_snap_realm->cached_context,
-				    0, 0, false);
+				    ceph_empty_snapc, 0, 0, false);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto out;
@@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 				    ceph_vino(inode), 0, &len, 1, 3,
 				    CEPH_OSD_OP_WRITE,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
-				    ci->i_snap_realm->cached_context,
+				    ceph_empty_snapc,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    false);
 	if (IS_ERR(req)) {
@@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_ops = &ceph_vmops;
 	return 0;
 }
+
+enum {
+	POOL_READ	= 1,
+	POOL_WRITE	= 2,
+};
+
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+{
+	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
+	struct rb_node **p, *parent;
+	struct ceph_pool_perm *perm;
+	struct page **pages;
+	int err = 0, err2 = 0, have = 0;
+
+	down_read(&mdsc->pool_perm_rwsem);
+	p = &mdsc->pool_perm_tree.rb_node;
+	while (*p) {
+		perm = rb_entry(*p, struct ceph_pool_perm, node);
+		if (pool < perm->pool)
+			p = &(*p)->rb_left;
+		else if (pool > perm->pool)
+			p = &(*p)->rb_right;
+		else {
+			have = perm->perm;
+			break;
+		}
+	}
+	up_read(&mdsc->pool_perm_rwsem);
+	if (*p)
+		goto out;
+
+	dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+
+	down_write(&mdsc->pool_perm_rwsem);
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		perm = rb_entry(parent, struct ceph_pool_perm, node);
+		if (pool < perm->pool)
+			p = &(*p)->rb_left;
+		else if (pool > perm->pool)
+			p = &(*p)->rb_right;
+		else {
+			have = perm->perm;
+			break;
+		}
+	}
+	if (*p) {
+		up_write(&mdsc->pool_perm_rwsem);
+		goto out;
+	}
+
+	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+					 ceph_empty_snapc,
+					 1, false, GFP_NOFS);
+	if (!rd_req) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	rd_req->r_flags = CEPH_OSD_FLAG_READ;
+	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
+	rd_req->r_base_oloc.pool = pool;
+	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
+		 "%llx.00000000", ci->i_vino.ino);
+	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+
+	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+					 ceph_empty_snapc,
+					 1, false, GFP_NOFS);
+	if (!wr_req) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
+			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
+	wr_req->r_base_oloc.pool = pool;
+	wr_req->r_base_oid = rd_req->r_base_oid;
+
+	/* one page should be large enough for STAT data */
+	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+	if (IS_ERR(pages)) {
+		err = PTR_ERR(pages);
+		goto out_unlock;
+	}
+
+	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
+				     0, false, true);
+	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
+				&ci->vfs_inode.i_mtime);
+	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+
+	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
+				&ci->vfs_inode.i_mtime);
+	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+
+	if (!err)
+		err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+	if (!err2)
+		err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+
+	if (err >= 0 || err == -ENOENT)
+		have |= POOL_READ;
+	else if (err != -EPERM)
+		goto out_unlock;
+
+	if (err2 == 0 || err2 == -EEXIST)
+		have |= POOL_WRITE;
+	else if (err2 != -EPERM) {
+		err = err2;
+		goto out_unlock;
+	}
+
+	perm = kmalloc(sizeof(*perm), GFP_NOFS);
+	if (!perm) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	perm->pool = pool;
+	perm->perm = have;
+	rb_link_node(&perm->node, parent, p);
+	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
+	err = 0;
+out_unlock:
+	up_write(&mdsc->pool_perm_rwsem);
+
+	if (rd_req)
+		ceph_osdc_put_request(rd_req);
+	if (wr_req)
+		ceph_osdc_put_request(wr_req);
+out:
+	if (!err)
+		err = have;
+	dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+	return err;
+}
+
+int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+{
+	u32 pool;
+	int ret, flags;
+
+	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+				NOPOOLPERM))
+		return 0;
+
+	spin_lock(&ci->i_ceph_lock);
+	flags = ci->i_ceph_flags;
+	pool = ceph_file_layout_pg_pool(ci->i_layout);
+	spin_unlock(&ci->i_ceph_lock);
+check:
+	if (flags & CEPH_I_POOL_PERM) {
+		if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
+			dout("ceph_pool_perm_check pool %u no read perm\n",
+			     pool);
+			return -EPERM;
+		}
+		if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
+			dout("ceph_pool_perm_check pool %u no write perm\n",
+			     pool);
+			return -EPERM;
+		}
+		return 0;
+	}
+
+	ret = __ceph_pool_perm_get(ci, pool);
+	if (ret < 0)
+		return ret;
+
+	flags = CEPH_I_POOL_PERM;
+	if (ret & POOL_READ)
+		flags |= CEPH_I_POOL_RD;
+	if (ret & POOL_WRITE)
+		flags |= CEPH_I_POOL_WR;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
+		ci->i_ceph_flags = flags;
+        } else {
+		pool = ceph_file_layout_pg_pool(ci->i_layout);
+		flags = ci->i_ceph_flags;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+	goto check;
+}
+
+void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
+{
+	struct ceph_pool_perm *perm;
+	struct rb_node *n;
+
+	while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
+		n = rb_first(&mdsc->pool_perm_tree);
+		perm = rb_entry(n, struct ceph_pool_perm, node);
+		rb_erase(n, &mdsc->pool_perm_tree);
+		kfree(perm);
+	}
+}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index be5ea6af8..ddd5e9471 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 		used |= CEPH_CAP_PIN;
 	if (ci->i_rd_ref)
 		used |= CEPH_CAP_FILE_RD;
-	if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+	if (ci->i_rdcache_ref ||
+	    (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+	     ci->vfs_inode.i_data.nrpages))
 		used |= CEPH_CAP_FILE_CACHE;
 	if (ci->i_wr_ref)
 		used |= CEPH_CAP_FILE_WR;
@@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
-	/*
-	 * s_cap_reconnect is protected by s_cap_lock. no one changes
-	 * s_cap_gen while session is in the reconnect state.
-	 */
-	if (queue_release &&
-	    (!session->s_cap_reconnect ||
-	     cap->cap_gen == session->s_cap_gen))
-		__queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
-				    cap->mseq, cap->issue_seq);
-
 	if (session->s_cap_iterator == cap) {
 		/* not yet, we are iterating over this very cap */
 		dout("__ceph_remove_cap  delaying %p removal from session %p\n",
@@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 	}
 	/* protect backpointer with s_cap_lock: see iterate_session_caps */
 	cap->ci = NULL;
+
+	/*
+	 * s_cap_reconnect is protected by s_cap_lock. no one changes
+	 * s_cap_gen while session is in the reconnect state.
+	 */
+	if (queue_release &&
+	    (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+		cap->queue_release = 1;
+		if (removed) {
+			list_add_tail(&cap->session_caps,
+				      &session->s_cap_releases);
+			session->s_num_cap_releases++;
+			removed = 0;
+		}
+	} else {
+		cap->queue_release = 0;
+	}
+	cap->cap_ino = ci->i_vino.ino;
+
 	spin_unlock(&session->s_cap_lock);
 
 	/* remove from inode list */
@@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 static int send_cap_msg(struct ceph_mds_session *session,
 			u64 ino, u64 cid, int op,
 			int caps, int wanted, int dirty,
-			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
-			u64 size, u64 max_size,
+			u32 seq, u64 flush_tid, u64 oldest_flush_tid,
+			u32 issue_seq, u32 mseq, u64 size, u64 max_size,
 			struct timespec *mtime, struct timespec *atime,
 			u64 time_warp_seq,
 			kuid_t uid, kgid_t gid, umode_t mode,
@@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	size_t extra_len;
 
 	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
-	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
+	     " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
 	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
 	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
 	     ceph_cap_string(dirty),
-	     seq, issue_seq, mseq, follows, size, max_size,
+	     seq, issue_seq, flush_tid, oldest_flush_tid,
+	     mseq, follows, size, max_size,
 	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-	/* flock buffer size + inline version + inline data size */
-	extra_len = 4 + 8 + 4;
+	/* flock buffer size + inline version + inline data size +
+	 * osd_epoch_barrier + oldest_flush_tid */
+	extra_len = 4 + 8 + 4 + 4 + 8;
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
 			   GFP_NOFS, false);
 	if (!msg)
 		return -ENOMEM;
 
+	msg->hdr.version = cpu_to_le16(6);
 	msg->hdr.tid = cpu_to_le64(flush_tid);
 
 	fc = msg->front.iov_base;
@@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
 	/* inline data size */
 	ceph_encode_32(&p, 0);
+	/* osd_epoch_barrier */
+	ceph_encode_32(&p, 0);
+	/* oldest_flush_tid */
+	ceph_encode_64(&p, oldest_flush_tid);
 
 	fc->xattr_version = cpu_to_le64(xattr_version);
 	if (xattrs_buf) {
@@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	return 0;
 }
 
-void __queue_cap_release(struct ceph_mds_session *session,
-			 u64 ino, u64 cap_id, u32 migrate_seq,
-			 u32 issue_seq)
-{
-	struct ceph_msg *msg;
-	struct ceph_mds_cap_release *head;
-	struct ceph_mds_cap_item *item;
-
-	BUG_ON(!session->s_num_cap_releases);
-	msg = list_first_entry(&session->s_cap_releases,
-			       struct ceph_msg, list_head);
-
-	dout(" adding %llx release to mds%d msg %p (%d left)\n",
-	     ino, session->s_mds, msg, session->s_num_cap_releases);
-
-	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
-	head = msg->front.iov_base;
-	le32_add_cpu(&head->num, 1);
-	item = msg->front.iov_base + msg->front.iov_len;
-	item->ino = cpu_to_le64(ino);
-	item->cap_id = cpu_to_le64(cap_id);
-	item->migrate_seq = cpu_to_le32(migrate_seq);
-	item->seq = cpu_to_le32(issue_seq);
-
-	session->s_num_cap_releases--;
-
-	msg->front.iov_len += sizeof(*item);
-	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
-		dout(" release msg %p full\n", msg);
-		list_move_tail(&msg->list_head, &session->s_cap_releases_done);
-	} else {
-		dout(" release msg %p at %d/%d (%d)\n", msg,
-		     (int)le32_to_cpu(head->num),
-		     (int)CEPH_CAPS_PER_RELEASE,
-		     (int)msg->front.iov_len);
-	}
-}
-
 /*
  * Queue cap releases when an inode is dropped from our cache.  Since
  * inode is about to be destroyed, there is no need for i_ceph_lock.
@@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode)
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 		      int op, int used, int want, int retain, int flushing,
-		      unsigned *pflush_tid)
+		      u64 flush_tid, u64 oldest_flush_tid)
 	__releases(cap->ci->i_ceph_lock)
 {
 	struct ceph_inode_info *ci = cap->ci;
@@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	u64 xattr_version = 0;
 	struct ceph_buffer *xattr_blob = NULL;
 	int delayed = 0;
-	u64 flush_tid = 0;
-	int i;
 	int ret;
 	bool inline_data;
 
@@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	cap->implemented &= cap->issued | used;
 	cap->mds_wanted = want;
 
-	if (flushing) {
-		/*
-		 * assign a tid for flush operations so we can avoid
-		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
-		 * clean type races.  track latest tid for every bit
-		 * so we can handle flush AxFw, flush Fw, and have the
-		 * first ack clean Ax.
-		 */
-		flush_tid = ++ci->i_cap_flush_last_tid;
-		if (pflush_tid)
-			*pflush_tid = flush_tid;
-		dout(" cap_flush_tid %d\n", (int)flush_tid);
-		for (i = 0; i < CEPH_CAP_BITS; i++)
-			if (flushing & (1 << i))
-				ci->i_cap_flush_tid[i] = flush_tid;
-
-		follows = ci->i_head_snapc->seq;
-	} else {
-		follows = 0;
-	}
+	follows = flushing ? ci->i_head_snapc->seq : 0;
 
 	keep = cap->implemented;
 	seq = cap->seq;
@@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	spin_unlock(&ci->i_ceph_lock);
 
 	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
-		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+		op, keep, want, flushing, seq,
+		flush_tid, oldest_flush_tid, issue_seq, mseq,
 		size, max_size, &mtime, &atime, time_warp_seq,
 		uid, gid, mode, xattr_version, xattr_blob,
 		follows, inline_data);
@@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
  * the MDS (i.e., during this session).
  *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
 			struct ceph_mds_session **psession,
-			int again)
+			int kick)
 		__releases(ci->i_ceph_lock)
 		__acquires(ci->i_ceph_lock)
 {
@@ -1297,11 +1257,8 @@ retry:
 		if (capsnap->dirty_pages || capsnap->writing)
 			break;
 
-		/*
-		 * if cap writeback already occurred, we should have dropped
-		 * the capsnap in ceph_put_wrbuffer_cap_refs.
-		 */
-		BUG_ON(capsnap->dirty == 0);
+		/* should be removed by ceph_try_drop_cap_snap() */
+		BUG_ON(!capsnap->need_flush);
 
 		/* pick mds, take s_mutex */
 		if (ci->i_auth_cap == NULL) {
@@ -1310,7 +1267,7 @@ retry:
 		}
 
 		/* only flush each capsnap once */
-		if (!again && !list_empty(&capsnap->flushing_item)) {
+		if (!kick && !list_empty(&capsnap->flushing_item)) {
 			dout("already flushed %p, skipping\n", capsnap);
 			continue;
 		}
@@ -1320,6 +1277,9 @@ retry:
 
 		if (session && session->s_mds != mds) {
 			dout("oops, wrong session %p mutex\n", session);
+			if (kick)
+				goto out;
+
 			mutex_unlock(&session->s_mutex);
 			ceph_put_mds_session(session);
 			session = NULL;
@@ -1343,20 +1303,22 @@ retry:
 			goto retry;
 		}
 
-		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+		spin_lock(&mdsc->cap_dirty_lock);
+		capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
+		spin_unlock(&mdsc->cap_dirty_lock);
+
 		atomic_inc(&capsnap->nref);
-		if (!list_empty(&capsnap->flushing_item))
-			list_del_init(&capsnap->flushing_item);
-		list_add_tail(&capsnap->flushing_item,
-			      &session->s_cap_snaps_flushing);
+		if (list_empty(&capsnap->flushing_item))
+			list_add_tail(&capsnap->flushing_item,
+				      &session->s_cap_snaps_flushing);
 		spin_unlock(&ci->i_ceph_lock);
 
 		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
 		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
 		send_cap_msg(session, ceph_vino(inode).ino, 0,
 			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
-			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
-			     capsnap->size, 0,
+			     capsnap->dirty, 0, capsnap->flush_tid, 0,
+			     0, mseq, capsnap->size, 0,
 			     &capsnap->mtime, &capsnap->atime,
 			     capsnap->time_warp_seq,
 			     capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  * Caller is then responsible for calling __mark_inode_dirty with the
  * returned flags value.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+			   struct ceph_cap_flush **pcf)
 {
 	struct ceph_mds_client *mdsc =
 		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
-		if (!ci->i_head_snapc)
+		WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+		swap(ci->i_prealloc_cap_flush, *pcf);
+
+		if (!ci->i_head_snapc) {
+			WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
 			ci->i_head_snapc = ceph_get_snap_context(
 				ci->i_snap_realm->cached_context);
+		}
 		dout(" inode %p now dirty snapc %p auth cap %p\n",
 		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
@@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 			ihold(inode);
 			dirty |= I_DIRTY_SYNC;
 		}
+	} else {
+		WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
 	}
 	BUG_ON(list_empty(&ci->i_dirty_item));
 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	return dirty;
 }
 
+static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
+					struct ceph_cap_flush *cf)
+{
+	struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_cap_flush *other = NULL;
+
+	while (*p) {
+		parent = *p;
+		other = rb_entry(parent, struct ceph_cap_flush, i_node);
+
+		if (cf->tid < other->tid)
+			p = &(*p)->rb_left;
+		else if (cf->tid > other->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&cf->i_node, parent, p);
+	rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
+}
+
+static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
+				       struct ceph_cap_flush *cf)
+{
+	struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_cap_flush *other = NULL;
+
+	while (*p) {
+		parent = *p;
+		other = rb_entry(parent, struct ceph_cap_flush, g_node);
+
+		if (cf->tid < other->tid)
+			p = &(*p)->rb_left;
+		else if (cf->tid > other->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&cf->g_node, parent, p);
+	rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
+}
+
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+	if (cf)
+		kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+	struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
+	if (n) {
+		struct ceph_cap_flush *cf =
+			rb_entry(n, struct ceph_cap_flush, g_node);
+		return cf->tid;
+	}
+	return 0;
+}
+
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
@@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  * Called under i_ceph_lock.
  */
 static int __mark_caps_flushing(struct inode *inode,
-				 struct ceph_mds_session *session)
+				struct ceph_mds_session *session,
+				u64 *flush_tid, u64 *oldest_flush_tid)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap_flush *cf = NULL;
 	int flushing;
 
 	BUG_ON(ci->i_dirty_caps == 0);
 	BUG_ON(list_empty(&ci->i_dirty_item));
+	BUG_ON(!ci->i_prealloc_cap_flush);
 
 	flushing = ci->i_dirty_caps;
 	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1463,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode,
 	ci->i_dirty_caps = 0;
 	dout(" inode %p now !dirty\n", inode);
 
+	swap(cf, ci->i_prealloc_cap_flush);
+	cf->caps = flushing;
+
 	spin_lock(&mdsc->cap_dirty_lock);
 	list_del_init(&ci->i_dirty_item);
 
+	cf->tid = ++mdsc->last_cap_flush_tid;
+	__add_cap_flushing_to_mdsc(mdsc, cf);
+	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
 	if (list_empty(&ci->i_flushing_item)) {
-		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
 		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 		mdsc->num_cap_flushing++;
-		dout(" inode %p now flushing seq %lld\n", inode,
-		     ci->i_cap_flush_seq);
+		dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
 	} else {
 		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
-		dout(" inode %p now flushing (more) seq %lld\n", inode,
-		     ci->i_cap_flush_seq);
+		dout(" inode %p now flushing (more) tid %llu\n",
+		     inode, cf->tid);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
 
+	__add_cap_flushing_to_inode(ci, cf);
+
+	*flush_tid = cf->tid;
 	return flushing;
 }
 
@@ -1524,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
+	u64 flush_tid, oldest_flush_tid;
 	int file_wanted, used, cap_used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
 	int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1553,13 +1603,13 @@ retry:
 retry_locked:
 	file_wanted = __ceph_caps_file_wanted(ci);
 	used = __ceph_caps_used(ci);
-	want = file_wanted | used;
 	issued = __ceph_caps_issued(ci, &implemented);
 	revoking = implemented & ~issued;
 
-	retain = want | CEPH_CAP_PIN;
+	want = file_wanted;
+	retain = file_wanted | used | CEPH_CAP_PIN;
 	if (!mdsc->stopping && inode->i_nlink > 0) {
-		if (want) {
+		if (file_wanted) {
 			retain |= CEPH_CAP_ANY;       /* be greedy */
 		} else if (S_ISDIR(inode->i_mode) &&
 			   (issued & CEPH_CAP_FILE_SHARED) &&
@@ -1602,9 +1652,10 @@ retry_locked:
 	 * If we fail, it's because pages are locked.... try again later.
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
-	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-	    inode->i_data.nrpages &&                 /* have cached pages */
-	    (file_wanted == 0 ||                     /* no open files */
+	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
+	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
+	    inode->i_data.nrpages &&		/* have cached pages */
+	    (file_wanted == 0 ||		/* no open files */
 	     (revoking & (CEPH_CAP_FILE_CACHE|
 			  CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
 	    !tried_invalidate) {
@@ -1742,17 +1793,25 @@ ack:
 			took_snap_rwsem = 1;
 		}
 
-		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
-			flushing = __mark_caps_flushing(inode, session);
-		else
+		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+			flushing = __mark_caps_flushing(inode, session,
+							&flush_tid,
+							&oldest_flush_tid);
+		} else {
 			flushing = 0;
+			flush_tid = 0;
+			spin_lock(&mdsc->cap_dirty_lock);
+			oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+			spin_unlock(&mdsc->cap_dirty_lock);
+		}
 
 		mds = cap->mds;  /* remember mds, so we don't repeat */
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
 		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
-				      want, retain, flushing, NULL);
+				      want, retain, flushing,
+				      flush_tid, oldest_flush_tid);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
@@ -1781,12 +1840,13 @@ ack:
 /*
  * Try to flush dirty caps back to the auth mds.
  */
-static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, u64 *ptid)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int flushing = 0;
 	struct ceph_mds_session *session = NULL;
+	int flushing = 0;
+	u64 flush_tid = 0, oldest_flush_tid = 0;
 
 retry:
 	spin_lock(&ci->i_ceph_lock);
@@ -1811,42 +1871,54 @@ retry:
 		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
 			goto out;
 
-		flushing = __mark_caps_flushing(inode, session);
+		flushing = __mark_caps_flushing(inode, session, &flush_tid,
+						&oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
 		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
-				     cap->issued | cap->implemented, flushing,
-				     flush_tid);
-		if (!delayed)
-			goto out_unlocked;
+				     (cap->issued | cap->implemented),
+				     flushing, flush_tid, oldest_flush_tid);
 
-		spin_lock(&ci->i_ceph_lock);
-		__cap_delay_requeue(mdsc, ci);
+		if (delayed) {
+			spin_lock(&ci->i_ceph_lock);
+			__cap_delay_requeue(mdsc, ci);
+			spin_unlock(&ci->i_ceph_lock);
+		}
+	} else {
+		struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
+		if (n) {
+			struct ceph_cap_flush *cf =
+				rb_entry(n, struct ceph_cap_flush, i_node);
+			flush_tid = cf->tid;
+		}
+		flushing = ci->i_flushing_caps;
+		spin_unlock(&ci->i_ceph_lock);
 	}
 out:
-	spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
 	if (session)
 		mutex_unlock(&session->s_mutex);
+
+	*ptid = flush_tid;
 	return flushing;
 }
 
 /*
  * Return true if we've flushed caps through the given flush_tid.
  */
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int i, ret = 1;
+	struct ceph_cap_flush *cf;
+	struct rb_node *n;
+	int ret = 1;
 
 	spin_lock(&ci->i_ceph_lock);
-	for (i = 0; i < CEPH_CAP_BITS; i++)
-		if ((ci->i_flushing_caps & (1 << i)) &&
-		    ci->i_cap_flush_tid[i] <= tid) {
-			/* still flushing this bit */
+	n = rb_first(&ci->i_cap_flush_tree);
+	if (n) {
+		cf = rb_entry(n, struct ceph_cap_flush, i_node);
+		if (cf->tid <= flush_tid)
 			ret = 0;
-			break;
-		}
+	}
 	spin_unlock(&ci->i_ceph_lock);
 	return ret;
 }
@@ -1864,13 +1936,16 @@ static void sync_write_wait(struct inode *inode)
 	struct ceph_osd_request *req;
 	u64 last_tid;
 
+	if (!S_ISREG(inode->i_mode))
+		return;
+
 	spin_lock(&ci->i_unsafe_lock);
 	if (list_empty(head))
 		goto out;
 
 	/* set upper bound as _last_ entry in chain */
-	req = list_entry(head->prev, struct ceph_osd_request,
-			 r_unsafe_item);
+	req = list_last_entry(head, struct ceph_osd_request,
+			      r_unsafe_item);
 	last_tid = req->r_tid;
 
 	do {
@@ -1888,18 +1963,64 @@ static void sync_write_wait(struct inode *inode)
 		 */
 		if (list_empty(head))
 			break;
-		req = list_entry(head->next, struct ceph_osd_request,
-				 r_unsafe_item);
+		req = list_first_entry(head, struct ceph_osd_request,
+				       r_unsafe_item);
 	} while (req->r_tid < last_tid);
 out:
 	spin_unlock(&ci->i_unsafe_lock);
 }
 
+/*
+ * wait for any uncommitted directory operations to commit.
+ */
+static int unsafe_dirop_wait(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_dirops;
+	struct ceph_mds_request *req;
+	u64 last_tid;
+	int ret = 0;
+
+	if (!S_ISDIR(inode->i_mode))
+		return 0;
+
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	req = list_last_entry(head, struct ceph_mds_request,
+			      r_unsafe_dir_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_mdsc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+
+		dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
+		     inode, req->r_tid, last_tid);
+		ret = !wait_for_completion_timeout(&req->r_safe_completion,
+					ceph_timeout_jiffies(req->r_timeout));
+		if (ret)
+			ret = -EIO;  /* timed out */
+
+		ceph_mdsc_put_request(req);
+
+		spin_lock(&ci->i_unsafe_lock);
+		if (ret || list_empty(head))
+			break;
+		req = list_first_entry(head, struct ceph_mds_request,
+				       r_unsafe_dir_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+	return ret;
+}
+
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	unsigned flush_tid;
+	u64 flush_tid;
 	int ret;
 	int dirty;
 
@@ -1908,25 +2029,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret < 0)
-		return ret;
+		goto out;
+
+	if (datasync)
+		goto out;
+
 	mutex_lock(&inode->i_mutex);
 
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
+	ret = unsafe_dirop_wait(inode);
+
 	/*
 	 * only wait on non-file metadata writeback (the mds
 	 * can recover size and mtime, so we don't need to
 	 * wait for that)
 	 */
-	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
-		dout("fsync waiting for flush_tid %u\n", flush_tid);
+	if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
 		ret = wait_event_interruptible(ci->i_cap_wq,
-				       caps_are_flushed(inode, flush_tid));
+					caps_are_flushed(inode, flush_tid));
 	}
-
-	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
 	mutex_unlock(&inode->i_mutex);
+out:
+	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
 	return ret;
 }
 
@@ -1939,7 +2065,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	unsigned flush_tid;
+	u64 flush_tid;
 	int err = 0;
 	int dirty;
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
@@ -1994,6 +2120,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 	}
 }
 
+static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
+				struct ceph_mds_session *session,
+				struct ceph_inode_info *ci)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap *cap;
+	struct ceph_cap_flush *cf;
+	struct rb_node *n;
+	int delayed = 0;
+	u64 first_tid = 0;
+	u64 oldest_flush_tid;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+	spin_unlock(&mdsc->cap_dirty_lock);
+
+	while (true) {
+		spin_lock(&ci->i_ceph_lock);
+		cap = ci->i_auth_cap;
+		if (!(cap && cap->session == session)) {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+					cap, session->s_mds);
+			spin_unlock(&ci->i_ceph_lock);
+			break;
+		}
+
+		for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+			cf = rb_entry(n, struct ceph_cap_flush, i_node);
+			if (cf->tid >= first_tid)
+				break;
+		}
+		if (!n) {
+			spin_unlock(&ci->i_ceph_lock);
+			break;
+		}
+
+		cf = rb_entry(n, struct ceph_cap_flush, i_node);
+
+		first_tid = cf->tid + 1;
+
+		dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
+		     cap, cf->tid, ceph_cap_string(cf->caps));
+		delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				      __ceph_caps_used(ci),
+				      __ceph_caps_wanted(ci),
+				      cap->issued | cap->implemented,
+				      cf->caps, cf->tid, oldest_flush_tid);
+	}
+	return delayed;
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci;
+	struct ceph_cap *cap;
+
+	dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+		spin_lock(&ci->i_ceph_lock);
+		cap = ci->i_auth_cap;
+		if (!(cap && cap->session == session)) {
+			pr_err("%p auth cap %p not mds%d ???\n",
+				&ci->vfs_inode, cap, session->s_mds);
+			spin_unlock(&ci->i_ceph_lock);
+			continue;
+		}
+
+
+		/*
+		 * if flushing caps were revoked, we re-send the cap flush
+		 * in client reconnect stage. This guarantees MDS * processes
+		 * the cap flush message before issuing the flushing caps to
+		 * other client.
+		 */
+		if ((cap->issued & ci->i_flushing_caps) !=
+		    ci->i_flushing_caps) {
+			spin_unlock(&ci->i_ceph_lock);
+			if (!__kick_flushing_caps(mdsc, session, ci))
+				continue;
+			spin_lock(&ci->i_ceph_lock);
+		}
+
+		spin_unlock(&ci->i_ceph_lock);
+	}
+}
+
 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_mds_session *session)
 {
@@ -2003,28 +2216,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 
 	dout("kick_flushing_caps mds%d\n", session->s_mds);
 	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
-		struct inode *inode = &ci->vfs_inode;
-		struct ceph_cap *cap;
-		int delayed = 0;
-
-		spin_lock(&ci->i_ceph_lock);
-		cap = ci->i_auth_cap;
-		if (cap && cap->session == session) {
-			dout("kick_flushing_caps %p cap %p %s\n", inode,
-			     cap, ceph_cap_string(ci->i_flushing_caps));
-			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-					     __ceph_caps_used(ci),
-					     __ceph_caps_wanted(ci),
-					     cap->issued | cap->implemented,
-					     ci->i_flushing_caps, NULL);
-			if (delayed) {
-				spin_lock(&ci->i_ceph_lock);
-				__cap_delay_requeue(mdsc, ci);
-				spin_unlock(&ci->i_ceph_lock);
-			}
-		} else {
-			pr_err("%p auth cap %p not mds%d ???\n", inode,
-			       cap, session->s_mds);
+		int delayed = __kick_flushing_caps(mdsc, session, ci);
+		if (delayed) {
+			spin_lock(&ci->i_ceph_lock);
+			__cap_delay_requeue(mdsc, ci);
 			spin_unlock(&ci->i_ceph_lock);
 		}
 	}
@@ -2036,26 +2231,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *cap;
-	int delayed = 0;
 
 	spin_lock(&ci->i_ceph_lock);
 	cap = ci->i_auth_cap;
-	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
-	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+	dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+	     ceph_cap_string(ci->i_flushing_caps));
 
 	__ceph_flush_snaps(ci, &session, 1);
 
 	if (ci->i_flushing_caps) {
+		int delayed;
+
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_move_tail(&ci->i_flushing_item,
 			       &cap->session->s_cap_flushing);
 		spin_unlock(&mdsc->cap_dirty_lock);
 
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-				     __ceph_caps_used(ci),
-				     __ceph_caps_wanted(ci),
-				     cap->issued | cap->implemented,
-				     ci->i_flushing_caps, NULL);
+		spin_unlock(&ci->i_ceph_lock);
+
+		delayed = __kick_flushing_caps(mdsc, session, ci);
 		if (delayed) {
 			spin_lock(&ci->i_ceph_lock);
 			__cap_delay_requeue(mdsc, ci);
@@ -2073,7 +2267,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
  *
  * Protected by i_ceph_lock.
  */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+			    bool snap_rwsem_locked)
 {
 	if (got & CEPH_CAP_PIN)
 		ci->i_pin_ref++;
@@ -2081,8 +2276,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
 		ci->i_rd_ref++;
 	if (got & CEPH_CAP_FILE_CACHE)
 		ci->i_rdcache_ref++;
-	if (got & CEPH_CAP_FILE_WR)
+	if (got & CEPH_CAP_FILE_WR) {
+		if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+			BUG_ON(!snap_rwsem_locked);
+			ci->i_head_snapc = ceph_get_snap_context(
+					ci->i_snap_realm->cached_context);
+		}
 		ci->i_wr_ref++;
+	}
 	if (got & CEPH_CAP_FILE_BUFFER) {
 		if (ci->i_wb_ref == 0)
 			ihold(&ci->vfs_inode);
@@ -2100,16 +2301,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
  * requested from the MDS.
  */
 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-			    loff_t endoff, int *got, int *check_max, int *err)
+			    loff_t endoff, bool nonblock, int *got, int *err)
 {
 	struct inode *inode = &ci->vfs_inode;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	int ret = 0;
 	int have, implemented;
 	int file_wanted;
+	bool snap_rwsem_locked = false;
 
 	dout("get_cap_refs %p need %s want %s\n", inode,
 	     ceph_cap_string(need), ceph_cap_string(want));
 
+again:
 	spin_lock(&ci->i_ceph_lock);
 
 	/* make sure file is actually open */
@@ -2125,6 +2329,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	/* finish pending truncate */
 	while (ci->i_truncate_pending) {
 		spin_unlock(&ci->i_ceph_lock);
+		if (snap_rwsem_locked) {
+			up_read(&mdsc->snap_rwsem);
+			snap_rwsem_locked = false;
+		}
 		__ceph_do_pending_vmtruncate(inode);
 		spin_lock(&ci->i_ceph_lock);
 	}
@@ -2136,7 +2344,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
 			     inode, endoff, ci->i_max_size);
 			if (endoff > ci->i_requested_max_size) {
-				*check_max = 1;
+				*err = -EAGAIN;
 				ret = 1;
 			}
 			goto out_unlock;
@@ -2164,8 +2372,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 		     inode, ceph_cap_string(have), ceph_cap_string(not),
 		     ceph_cap_string(revoking));
 		if ((revoking & not) == 0) {
+			if (!snap_rwsem_locked &&
+			    !ci->i_head_snapc &&
+			    (need & CEPH_CAP_FILE_WR)) {
+				if (!down_read_trylock(&mdsc->snap_rwsem)) {
+					/*
+					 * we can not call down_read() when
+					 * task isn't in TASK_RUNNING state
+					 */
+					if (nonblock) {
+						*err = -EAGAIN;
+						ret = 1;
+						goto out_unlock;
+					}
+
+					spin_unlock(&ci->i_ceph_lock);
+					down_read(&mdsc->snap_rwsem);
+					snap_rwsem_locked = true;
+					goto again;
+				}
+				snap_rwsem_locked = true;
+			}
 			*got = need | (have & want);
-			__take_cap_refs(ci, *got);
+			__take_cap_refs(ci, *got, true);
 			ret = 1;
 		}
 	} else {
@@ -2189,6 +2418,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	}
 out_unlock:
 	spin_unlock(&ci->i_ceph_lock);
+	if (snap_rwsem_locked)
+		up_read(&mdsc->snap_rwsem);
 
 	dout("get_cap_refs %p ret %d got %s\n", inode,
 	     ret, ceph_cap_string(*got));
@@ -2231,50 +2462,70 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 		  loff_t endoff, int *got, struct page **pinned_page)
 {
-	int _got, check_max, ret, err = 0;
+	int _got, ret, err = 0;
 
-retry:
-	if (endoff > 0)
-		check_max_size(&ci->vfs_inode, endoff);
-	_got = 0;
-	check_max = 0;
-	ret = wait_event_interruptible(ci->i_cap_wq,
-				try_get_cap_refs(ci, need, want, endoff,
-						 &_got, &check_max, &err));
-	if (err)
-		ret = err;
+	ret = ceph_pool_perm_check(ci, need);
 	if (ret < 0)
 		return ret;
 
-	if (check_max)
-		goto retry;
+	while (true) {
+		if (endoff > 0)
+			check_max_size(&ci->vfs_inode, endoff);
 
-	if (ci->i_inline_version != CEPH_INLINE_NONE &&
-	    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
-	    i_size_read(&ci->vfs_inode) > 0) {
-		struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
-		if (page) {
-			if (PageUptodate(page)) {
-				*pinned_page = page;
-				goto out;
-			}
-			page_cache_release(page);
-		}
-		/*
-		 * drop cap refs first because getattr while holding
-		 * caps refs can cause deadlock.
-		 */
-		ceph_put_cap_refs(ci, _got);
+		err = 0;
 		_got = 0;
+		ret = try_get_cap_refs(ci, need, want, endoff,
+				       false, &_got, &err);
+		if (ret) {
+			if (err == -EAGAIN)
+				continue;
+			if (err < 0)
+				return err;
+		} else {
+			ret = wait_event_interruptible(ci->i_cap_wq,
+					try_get_cap_refs(ci, need, want, endoff,
+							 true, &_got, &err));
+			if (err == -EAGAIN)
+				continue;
+			if (err < 0)
+				ret = err;
+			if (ret < 0)
+				return ret;
+		}
 
-		/* getattr request will bring inline data into page cache */
-		ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
-					CEPH_STAT_CAP_INLINE_DATA, true);
-		if (ret < 0)
-			return ret;
-		goto retry;
+		if (ci->i_inline_version != CEPH_INLINE_NONE &&
+		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+		    i_size_read(&ci->vfs_inode) > 0) {
+			struct page *page =
+				find_get_page(ci->vfs_inode.i_mapping, 0);
+			if (page) {
+				if (PageUptodate(page)) {
+					*pinned_page = page;
+					break;
+				}
+				page_cache_release(page);
+			}
+			/*
+			 * drop cap refs first because getattr while
+			 * holding * caps refs can cause deadlock.
+			 */
+			ceph_put_cap_refs(ci, _got);
+			_got = 0;
+
+			/*
+			 * getattr request will bring inline data into
+			 * page cache
+			 */
+			ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+						CEPH_STAT_CAP_INLINE_DATA,
+						true);
+			if (ret < 0)
+				return ret;
+			continue;
+		}
+		break;
 	}
-out:
+
 	*got = _got;
 	return 0;
 }
@@ -2286,10 +2537,31 @@ out:
 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
 {
 	spin_lock(&ci->i_ceph_lock);
-	__take_cap_refs(ci, caps);
+	__take_cap_refs(ci, caps, false);
 	spin_unlock(&ci->i_ceph_lock);
 }
 
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+{
+	if (!capsnap->need_flush &&
+	    !capsnap->writing && !capsnap->dirty_pages) {
+
+		dout("dropping cap_snap %p follows %llu\n",
+		     capsnap, capsnap->follows);
+		ceph_put_snap_context(capsnap->context);
+		list_del(&capsnap->ci_item);
+		list_del(&capsnap->flushing_item);
+		ceph_put_cap_snap(capsnap);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  * Release cap refs.
  *
@@ -2303,7 +2575,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 {
 	struct inode *inode = &ci->vfs_inode;
 	int last = 0, put = 0, flushsnaps = 0, wake = 0;
-	struct ceph_cap_snap *capsnap;
 
 	spin_lock(&ci->i_ceph_lock);
 	if (had & CEPH_CAP_PIN)
@@ -2325,17 +2596,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	if (had & CEPH_CAP_FILE_WR)
 		if (--ci->i_wr_ref == 0) {
 			last++;
-			if (!list_empty(&ci->i_cap_snaps)) {
-				capsnap = list_first_entry(&ci->i_cap_snaps,
-						     struct ceph_cap_snap,
-						     ci_item);
-				if (capsnap->writing) {
-					capsnap->writing = 0;
-					flushsnaps =
-						__ceph_finish_cap_snap(ci,
-								       capsnap);
-					wake = 1;
-				}
+			if (__ceph_have_pending_cap_snap(ci)) {
+				struct ceph_cap_snap *capsnap =
+					list_last_entry(&ci->i_cap_snaps,
+							struct ceph_cap_snap,
+							ci_item);
+				capsnap->writing = 0;
+				if (ceph_try_drop_cap_snap(capsnap))
+					put++;
+				else if (__ceph_finish_cap_snap(ci, capsnap))
+					flushsnaps = 1;
+				wake = 1;
+			}
+			if (ci->i_wrbuffer_ref_head == 0 &&
+			    ci->i_dirty_caps == 0 &&
+			    ci->i_flushing_caps == 0) {
+				BUG_ON(!ci->i_head_snapc);
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
 			}
 			/* see comment in __ceph_remove_cap() */
 			if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
@@ -2352,7 +2630,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 		ceph_flush_snaps(ci);
 	if (wake)
 		wake_up_all(&ci->i_cap_wq);
-	if (put)
+	while (put-- > 0)
 		iput(inode);
 }
 
@@ -2380,7 +2658,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	if (ci->i_head_snapc == snapc) {
 		ci->i_wrbuffer_ref_head -= nr;
 		if (ci->i_wrbuffer_ref_head == 0 &&
-		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+		    ci->i_wr_ref == 0 &&
+		    ci->i_dirty_caps == 0 &&
+		    ci->i_flushing_caps == 0) {
 			BUG_ON(!ci->i_head_snapc);
 			ceph_put_snap_context(ci->i_head_snapc);
 			ci->i_head_snapc = NULL;
@@ -2401,25 +2681,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		capsnap->dirty_pages -= nr;
 		if (capsnap->dirty_pages == 0) {
 			complete_capsnap = 1;
-			if (capsnap->dirty == 0)
-				/* cap writeback completed before we created
-				 * the cap_snap; no FLUSHSNAP is needed */
-				drop_capsnap = 1;
+			drop_capsnap = ceph_try_drop_cap_snap(capsnap);
 		}
 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
+		     " snap %lld %d/%d -> %d/%d %s%s\n",
 		     inode, capsnap, capsnap->context->seq,
 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
 		     last ? " (wrbuffer last)" : "",
-		     complete_capsnap ? " (complete capsnap)" : "",
-		     drop_capsnap ? " (drop capsnap)" : "");
-		if (drop_capsnap) {
-			ceph_put_snap_context(capsnap->context);
-			list_del(&capsnap->ci_item);
-			list_del(&capsnap->flushing_item);
-			ceph_put_cap_snap(capsnap);
-		}
+		     complete_capsnap ? " (complete capsnap)" : "");
 	}
 
 	spin_unlock(&ci->i_ceph_lock);
@@ -2526,7 +2796,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	 * try to invalidate (once).  (If there are dirty buffers, we
 	 * will invalidate _after_ writeback.)
 	 */
-	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
 	    !ci->i_wrbuffer_ref) {
 		if (try_nonblocking_invalidate(inode)) {
@@ -2732,16 +3003,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_cap_flush *cf;
+	struct rb_node *n;
+	LIST_HEAD(to_remove);
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
 	int drop = 0;
-	int i;
 
-	for (i = 0; i < CEPH_CAP_BITS; i++)
-		if ((dirty & (1 << i)) &&
-		    (u16)flush_tid == ci->i_cap_flush_tid[i])
-			cleaned |= 1 << i;
+	n = rb_first(&ci->i_cap_flush_tree);
+	while (n) {
+		cf = rb_entry(n, struct ceph_cap_flush, i_node);
+		n = rb_next(&cf->i_node);
+		if (cf->tid == flush_tid)
+			cleaned = cf->caps;
+		if (cf->tid <= flush_tid) {
+			rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+			list_add_tail(&cf->list, &to_remove);
+		} else {
+			cleaned &= ~cf->caps;
+			if (!cleaned)
+				break;
+		}
+	}
 
 	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
 	     " flushing %s -> %s\n",
@@ -2749,12 +3033,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
 	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
 
-	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+	if (list_empty(&to_remove) && !cleaned)
 		goto out;
 
 	ci->i_flushing_caps &= ~cleaned;
 
 	spin_lock(&mdsc->cap_dirty_lock);
+
+	if (!list_empty(&to_remove)) {
+		list_for_each_entry(cf, &to_remove, list)
+			rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
+		n = rb_first(&mdsc->cap_flush_tree);
+		cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+		if (!cf || cf->tid > flush_tid)
+			wake_up_all(&mdsc->cap_flushing_wq);
+	}
+
 	if (ci->i_flushing_caps == 0) {
 		list_del_init(&ci->i_flushing_item);
 		if (!list_empty(&session->s_cap_flushing))
@@ -2764,14 +3059,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 					 struct ceph_inode_info,
 					 i_flushing_item)->vfs_inode);
 		mdsc->num_cap_flushing--;
-		wake_up_all(&mdsc->cap_flushing_wq);
 		dout(" inode %p now !flushing\n", inode);
 
 		if (ci->i_dirty_caps == 0) {
 			dout(" inode %p now clean\n", inode);
 			BUG_ON(!list_empty(&ci->i_dirty_item));
 			drop = 1;
-			if (ci->i_wrbuffer_ref_head == 0) {
+			if (ci->i_wr_ref == 0 &&
+			    ci->i_wrbuffer_ref_head == 0) {
 				BUG_ON(!ci->i_head_snapc);
 				ceph_put_snap_context(ci->i_head_snapc);
 				ci->i_head_snapc = NULL;
@@ -2785,6 +3080,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 
 out:
 	spin_unlock(&ci->i_ceph_lock);
+
+	while (!list_empty(&to_remove)) {
+		cf = list_first_entry(&to_remove,
+				      struct ceph_cap_flush, list);
+		list_del(&cf->list);
+		ceph_free_cap_flush(cf);
+	}
 	if (drop)
 		iput(inode);
 }
@@ -2800,6 +3102,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 				     struct ceph_mds_session *session)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	u64 follows = le64_to_cpu(m->snap_follows);
 	struct ceph_cap_snap *capsnap;
 	int drop = 0;
@@ -2823,6 +3126,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			list_del(&capsnap->ci_item);
 			list_del(&capsnap->flushing_item);
 			ceph_put_cap_snap(capsnap);
+			wake_up_all(&mdsc->cap_flushing_wq);
 			drop = 1;
 			break;
 		} else {
@@ -2971,7 +3275,6 @@ retry:
 			mutex_lock_nested(&session->s_mutex,
 					  SINGLE_DEPTH_NESTING);
 		}
-		ceph_add_cap_releases(mdsc, tsession);
 		new_cap = ceph_get_cap(mdsc, NULL);
 	} else {
 		WARN_ON(1);
@@ -3167,16 +3470,20 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
 	     (unsigned)seq);
 
-	if (op == CEPH_CAP_OP_IMPORT)
-		ceph_add_cap_releases(mdsc, session);
-
 	if (!inode) {
 		dout(" i don't have ino %llx\n", vino.ino);
 
 		if (op == CEPH_CAP_OP_IMPORT) {
+			cap = ceph_get_cap(mdsc, NULL);
+			cap->cap_ino = vino.ino;
+			cap->queue_release = 1;
+			cap->cap_id = cap_id;
+			cap->mseq = mseq;
+			cap->seq = seq;
 			spin_lock(&session->s_cap_lock);
-			__queue_cap_release(session, vino.ino, cap_id,
-					    mseq, seq);
+			list_add_tail(&cap->session_caps,
+					&session->s_cap_releases);
+			session->s_num_cap_releases++;
 			spin_unlock(&session->s_cap_lock);
 		}
 		goto flush_cap_releases;
@@ -3252,11 +3559,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 flush_cap_releases:
 	/*
-	 * send any full release message to try to move things
+	 * send any cap release message to try to move things
 	 * along for the mds (who clearly thinks we still have this
 	 * cap).
 	 */
-	ceph_add_cap_releases(mdsc, session);
 	ceph_send_cap_releases(mdsc, session);
 
 done:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fe..9314b4ea2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
 	if (dentry->d_fsdata)
 		return 0;
 
-	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
 
@@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r)
 }
 
 /*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+		            int len, unsigned next_offset)
+{
+	char *buf = kmalloc(len+1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	kfree(fi->last_name);
+	fi->last_name = buf;
+	memcpy(fi->last_name, name, len);
+	fi->last_name[len] = 0;
+	fi->next_offset = next_offset;
+	dout("note_last_dentry '%s'\n", fi->last_name);
+	return 0;
+}
+
+/*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
  * d_child when we initially get results back from the MDS, and
@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 	struct ceph_file_info *fi = file->private_data;
 	struct dentry *parent = file->f_path.dentry;
 	struct inode *dir = d_inode(parent);
-	struct list_head *p;
-	struct dentry *dentry, *last;
+	struct dentry *dentry, *last = NULL;
 	struct ceph_dentry_info *di;
+	unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
 	int err = 0;
+	loff_t ptr_pos = 0;
+	struct ceph_readdir_cache_control cache_ctl = {};
 
-	/* claim ref on last dentry we returned */
-	last = fi->dentry;
-	fi->dentry = NULL;
-
-	dout("__dcache_readdir %p v%u at %llu (last %p)\n",
-	     dir, shared_gen, ctx->pos, last);
+	dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
 
-	spin_lock(&parent->d_lock);
-
-	/* start at beginning? */
-	if (ctx->pos == 2 || last == NULL ||
-	    fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
-		if (list_empty(&parent->d_subdirs))
-			goto out_unlock;
-		p = parent->d_subdirs.prev;
-		dout(" initial p %p/%p\n", p->prev, p->next);
-	} else {
-		p = last->d_child.prev;
+	/* we can calculate cache index for the first dirfrag */
+	if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
+		cache_ctl.index = fpos_off(ctx->pos) - 2;
+		BUG_ON(cache_ctl.index < 0);
+		ptr_pos = cache_ctl.index * sizeof(struct dentry *);
 	}
 
-more:
-	dentry = list_entry(p, struct dentry, d_child);
-	di = ceph_dentry(dentry);
-	while (1) {
-		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
-		     d_unhashed(dentry) ? "!hashed" : "hashed",
-		     parent->d_subdirs.prev, parent->d_subdirs.next);
-		if (p == &parent->d_subdirs) {
+	while (true) {
+		pgoff_t pgoff;
+		bool emit_dentry;
+
+		if (ptr_pos >= i_size_read(dir)) {
 			fi->flags |= CEPH_F_ATEND;
-			goto out_unlock;
+			err = 0;
+			break;
+		}
+
+		err = -EAGAIN;
+		pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+		if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
+			ceph_readdir_cache_release(&cache_ctl);
+			cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
+			if (!cache_ctl.page) {
+				dout(" page %lu not found\n", pgoff);
+				break;
+			}
+			/* reading/filling the cache are serialized by
+			 * i_mutex, no need to use page lock */
+			unlock_page(cache_ctl.page);
+			cache_ctl.dentries = kmap(cache_ctl.page);
 		}
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+		rcu_read_lock();
+		spin_lock(&parent->d_lock);
+		/* check i_size again here, because empty directory can be
+		 * marked as complete while not holding the i_mutex. */
+		if (ceph_dir_is_complete_ordered(dir) &&
+		    ptr_pos < i_size_read(dir))
+			dentry = cache_ctl.dentries[cache_ctl.index % nsize];
+		else
+			dentry = NULL;
+		spin_unlock(&parent->d_lock);
+		if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+			dentry = NULL;
+		rcu_read_unlock();
+		if (!dentry)
+			break;
+
+		emit_dentry = false;
+		di = ceph_dentry(dentry);
+		spin_lock(&dentry->d_lock);
 		if (di->lease_shared_gen == shared_gen &&
-		    !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+		    d_really_is_positive(dentry) &&
 		    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
 		    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
-		    fpos_cmp(ctx->pos, di->offset) <= 0)
-			break;
-		dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
-		     dentry, di->offset,
-		     ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
-		     !d_inode(dentry) ? " null" : "");
+		    fpos_cmp(ctx->pos, di->offset) <= 0) {
+			emit_dentry = true;
+		}
 		spin_unlock(&dentry->d_lock);
-		p = p->prev;
-		dentry = list_entry(p, struct dentry, d_child);
-		di = ceph_dentry(dentry);
-	}
-
-	dget_dlock(dentry);
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&parent->d_lock);
 
-	/* make sure a dentry wasn't dropped while we didn't have parent lock */
-	if (!ceph_dir_is_complete_ordered(dir)) {
-		dout(" lost dir complete on %p; falling back to mds\n", dir);
-		dput(dentry);
-		err = -EAGAIN;
-		goto out;
-	}
+		if (emit_dentry) {
+			dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+			     dentry, dentry, d_inode(dentry));
+			ctx->pos = di->offset;
+			if (!dir_emit(ctx, dentry->d_name.name,
+				      dentry->d_name.len,
+				      ceph_translate_ino(dentry->d_sb,
+							 d_inode(dentry)->i_ino),
+				      d_inode(dentry)->i_mode >> 12)) {
+				dput(dentry);
+				err = 0;
+				break;
+			}
+			ctx->pos++;
 
-	dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
-	     dentry, dentry, d_inode(dentry));
-	if (!dir_emit(ctx, dentry->d_name.name,
-		      dentry->d_name.len,
-		      ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino),
-		      d_inode(dentry)->i_mode >> 12)) {
-		if (last) {
-			/* remember our position */
-			fi->dentry = last;
-			fi->next_offset = fpos_off(di->offset);
+			if (last)
+				dput(last);
+			last = dentry;
+		} else {
+			dput(dentry);
 		}
-		dput(dentry);
-		return 0;
-	}
-
-	ctx->pos = di->offset + 1;
 
-	if (last)
-		dput(last);
-	last = dentry;
-
-	spin_lock(&parent->d_lock);
-	p = p->prev;	/* advance to next dentry */
-	goto more;
-
-out_unlock:
-	spin_unlock(&parent->d_lock);
-out:
-	if (last)
+		cache_ctl.index++;
+		ptr_pos += sizeof(struct dentry *);
+	}
+	ceph_readdir_cache_release(&cache_ctl);
+	if (last) {
+		int ret;
+		di = ceph_dentry(last);
+		ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
+				       fpos_off(di->offset) + 1);
+		if (ret < 0)
+			err = ret;
 		dput(last);
+	}
 	return err;
 }
 
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
-			    int len)
-{
-	kfree(fi->last_name);
-	fi->last_name = kmalloc(len+1, GFP_NOFS);
-	if (!fi->last_name)
-		return -ENOMEM;
-	memcpy(fi->last_name, name, len);
-	fi->last_name[len] = 0;
-	dout("note_last_dentry '%s'\n", fi->last_name);
-	return 0;
-}
-
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct ceph_file_info *fi = file->private_data;
@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
 	/* can we use the dcache? */
 	spin_lock(&ci->i_ceph_lock);
-	if ((ctx->pos == 2 || fi->dentry) &&
-	    ceph_test_mount_opt(fsc, DCACHE) &&
+	if (ceph_test_mount_opt(fsc, DCACHE) &&
 	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
 	    __ceph_dir_is_complete_ordered(ci) &&
@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	} else {
 		spin_unlock(&ci->i_ceph_lock);
 	}
-	if (fi->dentry) {
-		err = note_last_dentry(fi, fi->dentry->d_name.name,
-				       fi->dentry->d_name.len);
-		if (err)
-			return err;
-		dput(fi->dentry);
-		fi->dentry = NULL;
-	}
 
 	/* proceed with a normal readdir */
-
-	if (ctx->pos == 2) {
-		/* note dir version at start of readdir so we can tell
-		 * if any dentries get dropped */
-		fi->dir_release_count = atomic_read(&ci->i_release_count);
-		fi->dir_ordered_count = ci->i_ordered_count;
-	}
-
 more:
 	/* do we have the correct frag content buffered? */
 	if (fi->frag != frag || fi->last_readdir == NULL) {
@@ -342,12 +336,15 @@ more:
 		req->r_direct_hash = ceph_frag_value(frag);
 		req->r_direct_is_hash = true;
 		if (fi->last_name) {
-			req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+			req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
 			if (!req->r_path2) {
 				ceph_mdsc_put_request(req);
 				return -ENOMEM;
 			}
 		}
+		req->r_dir_release_cnt = fi->dir_release_count;
+		req->r_dir_ordered_cnt = fi->dir_ordered_count;
+		req->r_readdir_cache_idx = fi->readdir_cache_idx;
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 
@@ -364,26 +361,38 @@ more:
 		     (int)req->r_reply_info.dir_end,
 		     (int)req->r_reply_info.dir_complete);
 
-		if (!req->r_did_prepopulate) {
-			dout("readdir !did_prepopulate");
-			/* preclude from marking dir complete */
-			fi->dir_release_count--;
-		}
 
 		/* note next offset and last dentry name */
 		rinfo = &req->r_reply_info;
 		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
 			frag = le32_to_cpu(rinfo->dir_dir->frag);
-			if (ceph_frag_is_leftmost(frag))
-				fi->next_offset = 2;
-			else
-				fi->next_offset = 0;
-			off = fi->next_offset;
+			off = req->r_readdir_offset;
+			fi->next_offset = off;
 		}
+
 		fi->frag = frag;
 		fi->offset = fi->next_offset;
 		fi->last_readdir = req;
 
+		if (req->r_did_prepopulate) {
+			fi->readdir_cache_idx = req->r_readdir_cache_idx;
+			if (fi->readdir_cache_idx < 0) {
+				/* preclude from marking dir ordered */
+				fi->dir_ordered_count = 0;
+			} else if (ceph_frag_is_leftmost(frag) && off == 2) {
+				/* note dir version at start of readdir so
+				 * we can tell if any dentries get dropped */
+				fi->dir_release_count = req->r_dir_release_cnt;
+				fi->dir_ordered_count = req->r_dir_ordered_cnt;
+			}
+		} else {
+			dout("readdir !did_prepopulate");
+			/* disable readdir cache */
+			fi->readdir_cache_idx = -1;
+			/* preclude from marking dir complete */
+			fi->dir_release_count = 0;
+		}
+
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
 			fi->last_name = NULL;
@@ -394,10 +403,10 @@ more:
 		} else {
 			err = note_last_dentry(fi,
 				       rinfo->dir_dname[rinfo->dir_nr-1],
-				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
+				       rinfo->dir_dname_len[rinfo->dir_nr-1],
+				       fi->next_offset + rinfo->dir_nr);
 			if (err)
 				return err;
-			fi->next_offset += rinfo->dir_nr;
 		}
 	}
 
@@ -453,16 +462,22 @@ more:
 	 * were released during the whole readdir, and we should have
 	 * the complete dir contents in our cache.
 	 */
-	spin_lock(&ci->i_ceph_lock);
-	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
-		if (ci->i_ordered_count == fi->dir_ordered_count)
+	if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+		spin_lock(&ci->i_ceph_lock);
+		if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
 			dout(" marking %p complete and ordered\n", inode);
-		else
+			/* use i_size to track number of entries in
+			 * readdir cache */
+			BUG_ON(fi->readdir_cache_idx < 0);
+			i_size_write(inode, fi->readdir_cache_idx *
+				     sizeof(struct dentry*));
+		} else {
 			dout(" marking %p complete\n", inode);
+		}
 		__ceph_dir_set_complete(ci, fi->dir_release_count,
 					fi->dir_ordered_count);
+		spin_unlock(&ci->i_ceph_lock);
 	}
-	spin_unlock(&ci->i_ceph_lock);
 
 	dout("readdir %p file %p done.\n", inode, file);
 	return 0;
@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
 	}
 	kfree(fi->last_name);
 	fi->last_name = NULL;
+	fi->dir_release_count = 0;
+	fi->readdir_cache_idx = -1;
 	if (ceph_frag_is_leftmost(frag))
 		fi->next_offset = 2;  /* compensate for . and .. */
 	else
 		fi->next_offset = 0;
-	if (fi->dentry) {
-		dput(fi->dentry);
-		fi->dentry = NULL;
-	}
 	fi->flags &= ~CEPH_F_ATEND;
 }
 
@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 	mutex_lock(&inode->i_mutex);
 	retval = -EINVAL;
 	switch (whence) {
-	case SEEK_END:
-		offset += inode->i_size + 2;   /* FIXME */
-		break;
 	case SEEK_CUR:
 		offset += file->f_pos;
 	case SEEK_SET:
 		break;
+	case SEEK_END:
+		retval = -EOPNOTSUPP;
 	default:
 		goto out;
 	}
@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 		}
 		retval = offset;
 
-		/*
-		 * discard buffered readdir content on seekdir(0), or
-		 * seek to new frag, or seek prior to current chunk.
-		 */
 		if (offset == 0 ||
 		    fpos_frag(offset) != fi->frag ||
 		    fpos_off(offset) < fi->offset) {
+			/* discard buffered readdir content on seekdir(0), or
+			 * seek to new frag, or seek prior to current chunk */
 			dout("dir_llseek dropping %p content\n", file);
 			reset_readdir(fi, fpos_frag(offset));
+		} else if (fpos_cmp(offset, old_offset) > 0) {
+			/* reset dir_release_count if we did a forward seek */
+			fi->dir_release_count = 0;
+			fi->readdir_cache_idx = -1;
 		}
-
-		/* bump dir_release_count if we did a forward seek */
-		if (fpos_cmp(offset, old_offset) > 0)
-			fi->dir_release_count--;
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
@@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 		err = PTR_ERR(req);
 		goto out;
 	}
-	req->r_path2 = kstrdup(dest, GFP_NOFS);
+	req->r_path2 = kstrdup(dest, GFP_KERNEL);
 	if (!req->r_path2) {
 		err = -ENOMEM;
 		ceph_mdsc_put_request(req);
@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * to do it here.
 		 */
 
+		/* d_move screws up sibling dentries' offsets */
+		ceph_dir_clear_complete(old_dir);
+		ceph_dir_clear_complete(new_dir);
+
 		d_move(old_dentry, new_dentry);
 
 		/* ensure target dentry is invalidated, despite
 		   rehashing bug in vfs_rename_dir */
 		ceph_invalidate_dentry_lease(new_dentry);
-
-		/* d_move screws up sibling dentries' offsets */
-		ceph_dir_clear_complete(old_dir);
-		ceph_dir_clear_complete(new_dir);
-
 	}
 	ceph_mdsc_put_request(req);
 	return err;
@@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 		return -EISDIR;
 
 	if (!cf->dir_info) {
-		cf->dir_info = kmalloc(bufsize, GFP_NOFS);
+		cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
 		if (!cf->dir_info)
 			return -ENOMEM;
 		cf->dir_info_len =
@@ -1224,66 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 }
 
 /*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
-			  int datasync)
-{
-	struct inode *inode = file_inode(file);
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct list_head *head = &ci->i_unsafe_dirops;
-	struct ceph_mds_request *req;
-	u64 last_tid;
-	int ret = 0;
-
-	dout("dir_fsync %p\n", inode);
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	spin_lock(&ci->i_unsafe_lock);
-	if (list_empty(head))
-		goto out;
-
-	req = list_entry(head->prev,
-			 struct ceph_mds_request, r_unsafe_dir_item);
-	last_tid = req->r_tid;
-
-	do {
-		ceph_mdsc_get_request(req);
-		spin_unlock(&ci->i_unsafe_lock);
-
-		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
-		     inode, req->r_tid, last_tid);
-		if (req->r_timeout) {
-			unsigned long time_left = wait_for_completion_timeout(
-							&req->r_safe_completion,
-							req->r_timeout);
-			if (time_left > 0)
-				ret = 0;
-			else
-				ret = -EIO;  /* timed out */
-		} else {
-			wait_for_completion(&req->r_safe_completion);
-		}
-		ceph_mdsc_put_request(req);
-
-		spin_lock(&ci->i_unsafe_lock);
-		if (ret || list_empty(head))
-			break;
-		req = list_entry(head->next,
-				 struct ceph_mds_request, r_unsafe_dir_item);
-	} while (req->r_tid < last_tid);
-out:
-	spin_unlock(&ci->i_unsafe_lock);
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-/*
  * We maintain a private dentry LRU.
  *
  * FIXME: this needs to be changed to a per-mds lru to be useful.
@@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = {
 	.open = ceph_open,
 	.release = ceph_release,
 	.unlocked_ioctl = ceph_ioctl,
-	.fsync = ceph_dir_fsync,
+	.fsync = ceph_fsync,
 };
 
 const struct file_operations ceph_snapdir_fops = {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b6b522b4..8b79d87ea 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 	case S_IFDIR:
 		dout("init_file %p %p 0%o (regular)\n", inode, file,
 		     inode->i_mode);
-		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+		cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
 		if (cf == NULL) {
 			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
 			return -ENOMEM;
 		}
 		cf->fmode = fmode;
 		cf->next_offset = 2;
+		cf->readdir_cache_idx = -1;
 		file->private_data = cf;
 		BUG_ON(inode->i_fop->release != ceph_release);
 		break;
@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
 		ceph_mdsc_put_request(cf->last_readdir);
 	kfree(cf->last_name);
 	kfree(cf->dir_info);
-	dput(cf->dentry);
 	kmem_cache_free(ceph_file_cachep, cf);
 
 	/* wake up anyone waiting for caps on this inode */
@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 		}
 	} else {
 		num_pages = calc_pages_for(off, len);
-		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 		if (IS_ERR(pages))
 			return PTR_ERR(pages);
 		ret = striped_read(inode, off, len, pages,
@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
  * objects, rollback on failure, etc.)
  */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+		       struct ceph_snap_context *snapc)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_snap_context *snapc;
 	struct ceph_vino vino;
 	struct ceph_osd_request *req;
 	struct page **pages;
@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 		size_t start;
 		ssize_t n;
 
-		snapc = ci->i_snap_realm->cached_context;
 		vino = ceph_vino(inode);
 		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 					    vino, pos, &len, 0,
@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 			break;
 		}
 
-		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
 		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
 		if (unlikely(n < 0)) {
@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
  * objects, rollback on failure, etc.)
  */
 static ssize_t
-ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+		struct ceph_snap_context *snapc)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_snap_context *snapc;
 	struct ceph_vino vino;
 	struct ceph_osd_request *req;
 	struct page **pages;
@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 		size_t left;
 		int n;
 
-		snapc = ci->i_snap_realm->cached_context;
 		vino = ceph_vino(inode);
 		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
 					    vino, pos, &len, 0, 1,
@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 		 */
 		num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
 			goto out;
@@ -860,7 +858,7 @@ again:
 		struct page *page = NULL;
 		loff_t i_size;
 		if (retry_op == READ_INLINE) {
-			page = __page_cache_alloc(GFP_NOFS);
+			page = __page_cache_alloc(GFP_KERNEL);
 			if (!page)
 				return -ENOMEM;
 		}
@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
+	struct ceph_cap_flush *prealloc_cf;
 	ssize_t count, written = 0;
 	int err, want, got;
 	loff_t pos;
@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	mutex_lock(&inode->i_mutex);
 
 	/* We can write back this queue in page reclaim */
@@ -959,7 +962,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	pos = iocb->ki_pos;
 	count = iov_iter_count(from);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
@@ -996,14 +999,30 @@ retry_snap:
 
 	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+		struct ceph_snap_context *snapc;
 		struct iov_iter data;
 		mutex_unlock(&inode->i_mutex);
+
+		spin_lock(&ci->i_ceph_lock);
+		if (__ceph_have_pending_cap_snap(ci)) {
+			struct ceph_cap_snap *capsnap =
+					list_last_entry(&ci->i_cap_snaps,
+							struct ceph_cap_snap,
+							ci_item);
+			snapc = ceph_get_snap_context(capsnap->context);
+		} else {
+			BUG_ON(!ci->i_head_snapc);
+			snapc = ceph_get_snap_context(ci->i_head_snapc);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+
 		/* we might need to revert back to that point */
 		data = *from;
 		if (iocb->ki_flags & IOCB_DIRECT)
-			written = ceph_sync_direct_write(iocb, &data, pos);
+			written = ceph_sync_direct_write(iocb, &data, pos,
+							 snapc);
 		else
-			written = ceph_sync_write(iocb, &data, pos);
+			written = ceph_sync_write(iocb, &data, pos, snapc);
 		if (written == -EOLDSNAPC) {
 			dout("aio_write %p %llx.%llx %llu~%u"
 				"got EOLDSNAPC, retrying\n",
@@ -1014,6 +1033,7 @@ retry_snap:
 		}
 		if (written > 0)
 			iov_iter_advance(from, written);
+		ceph_put_snap_context(snapc);
 	} else {
 		loff_t old_size = inode->i_size;
 		/*
@@ -1035,7 +1055,8 @@ retry_snap:
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1059,6 +1080,7 @@ retry_snap:
 out:
 	mutex_unlock(&inode->i_mutex);
 out_unlocked:
+	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_inode_to_client(inode)->client->osdc;
+	struct ceph_cap_flush *prealloc_cf;
 	int want, got = 0;
 	int dirty;
 	int ret = 0;
@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	mutex_lock(&inode->i_mutex);
 
 	if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!ret) {
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	ceph_put_cap_refs(ci, got);
 unlock:
 	mutex_unlock(&inode->i_mutex);
+	ceph_free_cap_flush(prealloc_cf);
 	return ret;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e876e1944..96d2bd829 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -6,7 +6,6 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
-#include <linux/namei.h>
 #include <linux/writeback.h>
 #include <linux/vmalloc.h>
 #include <linux/posix_acl.h>
@@ -390,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_inline_version = 0;
 	ci->i_time_warp_seq = 0;
 	ci->i_ceph_flags = 0;
-	ci->i_ordered_count = 0;
-	atomic_set(&ci->i_release_count, 1);
-	atomic_set(&ci->i_complete_count, 0);
+	atomic64_set(&ci->i_ordered_count, 1);
+	atomic64_set(&ci->i_release_count, 1);
+	atomic64_set(&ci->i_complete_seq[0], 0);
+	atomic64_set(&ci->i_complete_seq[1], 0);
 	ci->i_symlink = NULL;
 
 	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -416,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_flushing_caps = 0;
 	INIT_LIST_HEAD(&ci->i_dirty_item);
 	INIT_LIST_HEAD(&ci->i_flushing_item);
-	ci->i_cap_flush_seq = 0;
-	ci->i_cap_flush_last_tid = 0;
-	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+	ci->i_prealloc_cap_flush = NULL;
+	ci->i_cap_flush_tree = RB_ROOT;
 	init_waitqueue_head(&ci->i_cap_wq);
 	ci->i_hold_caps_min = 0;
 	ci->i_hold_caps_max = 0;
@@ -753,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
 	if (new_version ||
 	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
 		ci->i_layout = info->layout;
+
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(info->truncate_seq),
 					le64_to_cpu(info->truncate_size),
@@ -819,6 +821,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			else
 				kfree(sym); /* lost a race */
 		}
+		inode->i_link = ci->i_symlink;
 		break;
 	case S_IFDIR:
 		inode->i_op = &ceph_dir_iops;
@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 			    !__ceph_dir_is_complete(ci)) {
 				dout(" marking %p complete (empty)\n", inode);
+				i_size_write(inode, 0);
 				__ceph_dir_set_complete(ci,
-					atomic_read(&ci->i_release_count),
-					ci->i_ordered_count);
+					atomic64_read(&ci->i_release_count),
+					atomic64_read(&ci->i_ordered_count));
 			}
 
 			wake = true;
@@ -1212,6 +1216,10 @@ retry_lookup:
 			dout("fill_trace doing d_move %p -> %p\n",
 			     req->r_old_dentry, dn);
 
+			/* d_move screws up sibling dentries' offsets */
+			ceph_dir_clear_ordered(dir);
+			ceph_dir_clear_ordered(olddir);
+
 			d_move(req->r_old_dentry, dn);
 			dout(" src %p '%pd' dst %p '%pd'\n",
 			     req->r_old_dentry,
@@ -1222,10 +1230,6 @@ retry_lookup:
 			   rehashing bug in vfs_rename_dir */
 			ceph_invalidate_dentry_lease(dn);
 
-			/* d_move screws up sibling dentries' offsets */
-			ceph_dir_clear_ordered(dir);
-			ceph_dir_clear_ordered(olddir);
-
 			dout("dn %p gets new offset %lld\n", req->r_old_dentry,
 			     ceph_dentry(req->r_old_dentry)->offset);
 
@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
 	return err;
 }
 
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{
+	if (ctl->page) {
+		kunmap(ctl->page);
+		page_cache_release(ctl->page);
+		ctl->page = NULL;
+	}
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+			      struct ceph_readdir_cache_control *ctl,
+			      struct ceph_mds_request *req)
+{
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+	unsigned idx = ctl->index % nsize;
+	pgoff_t pgoff = ctl->index / nsize;
+
+	if (!ctl->page || pgoff != page_index(ctl->page)) {
+		ceph_readdir_cache_release(ctl);
+		ctl->page  = grab_cache_page(&dir->i_data, pgoff);
+		if (!ctl->page) {
+			ctl->index = -1;
+			return -ENOMEM;
+		}
+		/* reading/filling the cache are serialized by
+		 * i_mutex, no need to use page lock */
+		unlock_page(ctl->page);
+		ctl->dentries = kmap(ctl->page);
+	}
+
+	if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
+	    req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+		dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+		ctl->dentries[idx] = dn;
+		ctl->index++;
+	} else {
+		dout("disable readdir cache\n");
+		ctl->index = -1;
+	}
+	return 0;
+}
+
 int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 			     struct ceph_mds_session *session)
 {
@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	struct inode *snapdir = NULL;
 	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
 	struct ceph_dentry_info *di;
-	u64 r_readdir_offset = req->r_readdir_offset;
 	u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+	struct ceph_readdir_cache_control cache_ctl = {};
+
+	if (req->r_aborted)
+		return readdir_prepopulate_inodes_only(req, session);
 
 	if (rinfo->dir_dir &&
 	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 		     frag, le32_to_cpu(rinfo->dir_dir->frag));
 		frag = le32_to_cpu(rinfo->dir_dir->frag);
 		if (ceph_frag_is_leftmost(frag))
-			r_readdir_offset = 2;
+			req->r_readdir_offset = 2;
 		else
-			r_readdir_offset = 0;
+			req->r_readdir_offset = 0;
 	}
 
-	if (req->r_aborted)
-		return readdir_prepopulate_inodes_only(req, session);
-
 	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
 		snapdir = ceph_get_snapdir(d_inode(parent));
 		parent = d_find_alias(snapdir);
@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 			ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
 	}
 
+	if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
+		/* note dir version at start of readdir so we can tell
+		 * if any dentries get dropped */
+		struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+		req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
+		req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
+		req->r_readdir_cache_idx = 0;
+	}
+
+	cache_ctl.index = req->r_readdir_cache_idx;
+
 	/* FIXME: release caps/leases if error occurs */
 	for (i = 0; i < rinfo->dir_nr; i++) {
 		struct ceph_vino vino;
@@ -1413,13 +1471,6 @@ retry_lookup:
 			d_delete(dn);
 			dput(dn);
 			goto retry_lookup;
-		} else {
-			/* reorder parent's d_subdirs */
-			spin_lock(&parent->d_lock);
-			spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
-			list_move(&dn->d_child, &parent->d_subdirs);
-			spin_unlock(&dn->d_lock);
-			spin_unlock(&parent->d_lock);
 		}
 
 		/* inode */
@@ -1436,13 +1487,15 @@ retry_lookup:
 			}
 		}
 
-		if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
-			       req->r_request_started, -1,
-			       &req->r_caps_reservation) < 0) {
+		ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+				 req->r_request_started, -1,
+				 &req->r_caps_reservation);
+		if (ret < 0) {
 			pr_err("fill_inode badness on %p\n", in);
 			if (d_really_is_negative(dn))
 				iput(in);
 			d_drop(dn);
+			err = ret;
 			goto next_item;
 		}
 
@@ -1458,19 +1511,28 @@ retry_lookup:
 		}
 
 		di = dn->d_fsdata;
-		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
 
 		update_dentry_lease(dn, rinfo->dir_dlease[i],
 				    req->r_session,
 				    req->r_request_started);
+
+		if (err == 0 && cache_ctl.index >= 0) {
+			ret = fill_readdir_cache(d_inode(parent), dn,
+						 &cache_ctl, req);
+			if (ret < 0)
+				err = ret;
+		}
 next_item:
 		if (dn)
 			dput(dn);
 	}
-	if (err == 0)
-		req->r_did_prepopulate = true;
-
 out:
+	if (err == 0) {
+		req->r_did_prepopulate = true;
+		req->r_readdir_cache_idx = cache_ctl.index;
+	}
+	ceph_readdir_cache_release(&cache_ctl);
 	if (snapdir) {
 		iput(snapdir);
 		dput(parent);
@@ -1691,16 +1753,9 @@ retry:
 /*
  * symlinks
  */
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
-	nd_set_link(nd, ci->i_symlink);
-	return NULL;
-}
-
 static const struct inode_operations ceph_symlink_iops = {
 	.readlink = generic_readlink,
-	.follow_link = ceph_sym_follow_link,
+	.follow_link = simple_follow_link,
 	.setattr = ceph_setattr,
 	.getattr = ceph_getattr,
 	.setxattr = ceph_setxattr,
@@ -1719,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
 	int err = 0;
 	int inode_dirty_flags = 0;
+	bool lock_snap_rwsem = false;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -1732,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err != 0)
 		return err;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
 				       USE_AUTH_MDS);
-	if (IS_ERR(req))
+	if (IS_ERR(req)) {
+		ceph_free_cap_flush(prealloc_cf);
 		return PTR_ERR(req);
+	}
 
 	spin_lock(&ci->i_ceph_lock);
 	issued = __ceph_caps_issued(ci, NULL);
+
+	if (!ci->i_head_snapc &&
+	    (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+		lock_snap_rwsem = true;
+		if (!down_read_trylock(&mdsc->snap_rwsem)) {
+			spin_unlock(&ci->i_ceph_lock);
+			down_read(&mdsc->snap_rwsem);
+			spin_lock(&ci->i_ceph_lock);
+			issued = __ceph_caps_issued(ci, NULL);
+		}
+	}
+
 	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
 
 	if (ia_valid & ATTR_UID) {
@@ -1881,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
 
 	if (dirtied) {
-		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+							   &prealloc_cf);
 		inode->i_ctime = CURRENT_TIME;
 	}
 
 	release &= issued;
 	spin_unlock(&ci->i_ceph_lock);
+	if (lock_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
 
 	if (inode_dirty_flags)
 		__mark_inode_dirty(inode, inode_dirty_flags);
@@ -1911,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	ceph_mdsc_put_request(req);
 	if (mask & CEPH_SETATTR_SIZE)
 		__ceph_do_pending_vmtruncate(inode);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 out_put:
 	ceph_mdsc_put_request(req);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 }
 
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 4347039ec..6706bde9a 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
 		return 0;
 
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
 		++seen_fcntl;
 		if (seen_fcntl > num_fcntl_locks) {
 			err = -ENOSPC;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 84f37f34f..6aa07af67 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -8,6 +8,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/utsname.h>
+#include <linux/ratelimit.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_cap_reconnect = 0;
 	s->s_cap_iterator = NULL;
 	INIT_LIST_HEAD(&s->s_cap_releases);
-	INIT_LIST_HEAD(&s->s_cap_releases_done);
 	INIT_LIST_HEAD(&s->s_cap_flushing);
 	INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
 
@@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
 	req->r_uid = current_fsuid();
 	req->r_gid = current_fsgid();
 
+	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
+		mdsc->oldest_tid = req->r_tid;
+
 	if (dir) {
 		struct ceph_inode_info *ci = ceph_inode(dir);
 
@@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 				 struct ceph_mds_request *req)
 {
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+
+	if (req->r_tid == mdsc->oldest_tid) {
+		struct rb_node *p = rb_next(&req->r_node);
+		mdsc->oldest_tid = 0;
+		while (p) {
+			struct ceph_mds_request *next_req =
+				rb_entry(p, struct ceph_mds_request, r_node);
+			if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
+				mdsc->oldest_tid = next_req->r_tid;
+				break;
+			}
+			p = rb_next(p);
+		}
+	}
+
 	rb_erase(&req->r_node, &mdsc->request_tree);
 	RB_CLEAR_NODE(&req->r_node);
 
@@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
  * session caps
  */
 
-/*
- * Free preallocated cap messages assigned to this session
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
+/* caller holds s_cap_lock, we drop it */
+static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+	__releases(session->s_cap_lock)
 {
-	struct ceph_msg *msg;
+	LIST_HEAD(tmp_list);
+	list_splice_init(&session->s_cap_releases, &tmp_list);
+	session->s_num_cap_releases = 0;
+	spin_unlock(&session->s_cap_lock);
 
-	spin_lock(&session->s_cap_lock);
-	while (!list_empty(&session->s_cap_releases)) {
-		msg = list_first_entry(&session->s_cap_releases,
-				       struct ceph_msg, list_head);
-		list_del_init(&msg->list_head);
-		ceph_msg_put(msg);
-	}
-	while (!list_empty(&session->s_cap_releases_done)) {
-		msg = list_first_entry(&session->s_cap_releases_done,
-				       struct ceph_msg, list_head);
-		list_del_init(&msg->list_head);
-		ceph_msg_put(msg);
+	dout("cleanup_cap_releases mds%d\n", session->s_mds);
+	while (!list_empty(&tmp_list)) {
+		struct ceph_cap *cap;
+		/* zero out the in-progress message */
+		cap = list_first_entry(&tmp_list,
+					struct ceph_cap, session_caps);
+		list_del(&cap->session_caps);
+		ceph_put_cap(mdsc, cap);
 	}
-	spin_unlock(&session->s_cap_lock);
 }
 
 static void cleanup_session_requests(struct ceph_mds_client *mdsc,
@@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
 		req = list_first_entry(&session->s_unsafe,
 				       struct ceph_mds_request, r_unsafe_item);
 		list_del_init(&req->r_unsafe_item);
-		pr_info(" dropping unsafe request %llu\n", req->r_tid);
+		pr_warn_ratelimited(" dropping unsafe request %llu\n",
+				    req->r_tid);
 		__unregister_request(mdsc, req);
 	}
 	/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session,
 			dout("iterate_session_caps  finishing cap %p removal\n",
 			     cap);
 			BUG_ON(cap->session != session);
+			cap->session = NULL;
 			list_del_init(&cap->session_caps);
 			session->s_nr_caps--;
-			cap->session = NULL;
-			old_cap = cap;  /* put_cap it w/o locks held */
+			if (cap->queue_release) {
+				list_add_tail(&cap->session_caps,
+					      &session->s_cap_releases);
+				session->s_num_cap_releases++;
+			} else {
+				old_cap = cap;  /* put_cap it w/o locks held */
+			}
 		}
 		if (ret < 0)
 			goto out;
@@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 				  void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	LIST_HEAD(to_remove);
 	int drop = 0;
 
 	dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	spin_lock(&ci->i_ceph_lock);
 	__ceph_remove_cap(cap, false);
 	if (!ci->i_auth_cap) {
+		struct ceph_cap_flush *cf;
 		struct ceph_mds_client *mdsc =
 			ceph_sb_to_client(inode->i_sb)->mdsc;
 
+		while (true) {
+			struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
+			if (!n)
+				break;
+			cf = rb_entry(n, struct ceph_cap_flush, i_node);
+			rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+			list_add(&cf->list, &to_remove);
+		}
+
 		spin_lock(&mdsc->cap_dirty_lock);
+
+		list_for_each_entry(cf, &to_remove, list)
+			rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
 		if (!list_empty(&ci->i_dirty_item)) {
-			pr_info(" dropping dirty %s state for %p %lld\n",
+			pr_warn_ratelimited(
+				" dropping dirty %s state for %p %lld\n",
 				ceph_cap_string(ci->i_dirty_caps),
 				inode, ceph_ino(inode));
 			ci->i_dirty_caps = 0;
@@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			drop = 1;
 		}
 		if (!list_empty(&ci->i_flushing_item)) {
-			pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+			pr_warn_ratelimited(
+				" dropping dirty+flushing %s state for %p %lld\n",
 				ceph_cap_string(ci->i_flushing_caps),
 				inode, ceph_ino(inode));
 			ci->i_flushing_caps = 0;
@@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			drop = 1;
 		}
 		spin_unlock(&mdsc->cap_dirty_lock);
+
+		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+			list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+			ci->i_prealloc_cap_flush = NULL;
+		}
 	}
 	spin_unlock(&ci->i_ceph_lock);
+	while (!list_empty(&to_remove)) {
+		struct ceph_cap_flush *cf;
+		cf = list_first_entry(&to_remove,
+				      struct ceph_cap_flush, list);
+		list_del(&cf->list);
+		ceph_free_cap_flush(cf);
+	}
 	while (drop--)
 		iput(inode);
 	return 0;
@@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
 			spin_lock(&session->s_cap_lock);
 		}
 	}
-	spin_unlock(&session->s_cap_lock);
+
+	// drop cap expires and unlock s_cap_lock
+	cleanup_cap_releases(session->s_mdsc, session);
 
 	BUG_ON(session->s_nr_caps > 0);
 	BUG_ON(!list_empty(&session->s_cap_flushing));
-	cleanup_cap_releases(session);
 }
 
 /*
@@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
 	     ceph_cap_string(used), ceph_cap_string(wanted));
 	if (cap == ci->i_auth_cap) {
-		if (ci->i_dirty_caps | ci->i_flushing_caps)
+		if (ci->i_dirty_caps || ci->i_flushing_caps ||
+		    !list_empty(&ci->i_cap_snaps))
 			goto out;
 		if ((used | wanted) & CEPH_CAP_ANY_WR)
 			goto out;
@@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc,
 		session->s_trim_caps = 0;
 	}
 
-	ceph_add_cap_releases(mdsc, session);
 	ceph_send_cap_releases(mdsc, session);
 	return 0;
 }
 
-/*
- * Allocate cap_release messages.  If there is a partially full message
- * in the queue, try to allocate enough to cover it's remainder, so that
- * we can send it immediately.
- *
- * Called under s_mutex.
- */
-int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-			  struct ceph_mds_session *session)
+static int check_capsnap_flush(struct ceph_inode_info *ci,
+			       u64 want_snap_seq)
 {
-	struct ceph_msg *msg, *partial = NULL;
-	struct ceph_mds_cap_release *head;
-	int err = -ENOMEM;
-	int extra = mdsc->fsc->mount_options->cap_release_safety;
-	int num;
-
-	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
-	     extra);
-
-	spin_lock(&session->s_cap_lock);
-
-	if (!list_empty(&session->s_cap_releases)) {
-		msg = list_first_entry(&session->s_cap_releases,
-				       struct ceph_msg,
-				 list_head);
-		head = msg->front.iov_base;
-		num = le32_to_cpu(head->num);
-		if (num) {
-			dout(" partial %p with (%d/%d)\n", msg, num,
-			     (int)CEPH_CAPS_PER_RELEASE);
-			extra += CEPH_CAPS_PER_RELEASE - num;
-			partial = msg;
-		}
-	}
-	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
-		spin_unlock(&session->s_cap_lock);
-		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-				   GFP_NOFS, false);
-		if (!msg)
-			goto out_unlocked;
-		dout("add_cap_releases %p msg %p now %d\n", session, msg,
-		     (int)msg->front.iov_len);
-		head = msg->front.iov_base;
-		head->num = cpu_to_le32(0);
-		msg->front.iov_len = sizeof(*head);
-		spin_lock(&session->s_cap_lock);
-		list_add(&msg->list_head, &session->s_cap_releases);
-		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
-	}
-
-	if (partial) {
-		head = partial->front.iov_base;
-		num = le32_to_cpu(head->num);
-		dout(" queueing partial %p with %d/%d\n", partial, num,
-		     (int)CEPH_CAPS_PER_RELEASE);
-		list_move_tail(&partial->list_head,
-			       &session->s_cap_releases_done);
-		session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
+	int ret = 1;
+	spin_lock(&ci->i_ceph_lock);
+	if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
+		struct ceph_cap_snap *capsnap =
+			list_first_entry(&ci->i_cap_snaps,
+					 struct ceph_cap_snap, ci_item);
+		ret = capsnap->follows >= want_snap_seq;
 	}
-	err = 0;
-	spin_unlock(&session->s_cap_lock);
-out_unlocked:
-	return err;
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
 }
 
-static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+static int check_caps_flush(struct ceph_mds_client *mdsc,
+			    u64 want_flush_tid)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int ret;
-	spin_lock(&ci->i_ceph_lock);
-	if (ci->i_flushing_caps)
-		ret = ci->i_cap_flush_seq >= want_flush_seq;
-	else
-		ret = 1;
-	spin_unlock(&ci->i_ceph_lock);
+	struct rb_node *n;
+	struct ceph_cap_flush *cf;
+	int ret = 1;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	n = rb_first(&mdsc->cap_flush_tree);
+	cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+	if (cf && cf->tid <= want_flush_tid) {
+		dout("check_caps_flush still flushing tid %llu <= %llu\n",
+		     cf->tid, want_flush_tid);
+		ret = 0;
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
 	return ret;
 }
 
 /*
  * flush all dirty inode data to disk.
  *
- * returns true if we've flushed through want_flush_seq
+ * returns true if we've flushed through want_flush_tid
  */
-static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+			    u64 want_flush_tid, u64 want_snap_seq)
 {
 	int mds;
 
-	dout("check_cap_flush want %lld\n", want_flush_seq);
+	dout("check_caps_flush want %llu snap want %llu\n",
+	     want_flush_tid, want_snap_seq);
 	mutex_lock(&mdsc->mutex);
-	for (mds = 0; mds < mdsc->max_sessions; mds++) {
+	for (mds = 0; mds < mdsc->max_sessions; ) {
 		struct ceph_mds_session *session = mdsc->sessions[mds];
 		struct inode *inode = NULL;
 
-		if (!session)
+		if (!session) {
+			mds++;
 			continue;
+		}
 		get_session(session);
 		mutex_unlock(&mdsc->mutex);
 
 		mutex_lock(&session->s_mutex);
-		if (!list_empty(&session->s_cap_flushing)) {
-			struct ceph_inode_info *ci =
-				list_entry(session->s_cap_flushing.next,
-					   struct ceph_inode_info,
-					   i_flushing_item);
-
-			if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
-				dout("check_cap_flush still flushing %p "
-				     "seq %lld <= %lld to mds%d\n",
-				     &ci->vfs_inode, ci->i_cap_flush_seq,
-				     want_flush_seq, session->s_mds);
+		if (!list_empty(&session->s_cap_snaps_flushing)) {
+			struct ceph_cap_snap *capsnap =
+				list_first_entry(&session->s_cap_snaps_flushing,
+						 struct ceph_cap_snap,
+						 flushing_item);
+			struct ceph_inode_info *ci = capsnap->ci;
+			if (!check_capsnap_flush(ci, want_snap_seq)) {
+				dout("check_cap_flush still flushing snap %p "
+				     "follows %lld <= %lld to mds%d\n",
+				     &ci->vfs_inode, capsnap->follows,
+				     want_snap_seq, mds);
 				inode = igrab(&ci->vfs_inode);
 			}
 		}
@@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 
 		if (inode) {
 			wait_event(mdsc->cap_flushing_wq,
-				   check_cap_flush(inode, want_flush_seq));
+				   check_capsnap_flush(ceph_inode(inode),
+						       want_snap_seq));
 			iput(inode);
+		} else {
+			mds++;
 		}
 
 		mutex_lock(&mdsc->mutex);
 	}
-
 	mutex_unlock(&mdsc->mutex);
-	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+
+	wait_event(mdsc->cap_flushing_wq,
+		   check_caps_flush(mdsc, want_flush_tid));
+
+	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
 }
 
 /*
@@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
 			    struct ceph_mds_session *session)
 {
-	struct ceph_msg *msg;
+	struct ceph_msg *msg = NULL;
+	struct ceph_mds_cap_release *head;
+	struct ceph_mds_cap_item *item;
+	struct ceph_cap *cap;
+	LIST_HEAD(tmp_list);
+	int num_cap_releases;
 
-	dout("send_cap_releases mds%d\n", session->s_mds);
 	spin_lock(&session->s_cap_lock);
-	while (!list_empty(&session->s_cap_releases_done)) {
-		msg = list_first_entry(&session->s_cap_releases_done,
-				 struct ceph_msg, list_head);
-		list_del_init(&msg->list_head);
-		spin_unlock(&session->s_cap_lock);
-		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
-		ceph_con_send(&session->s_con, msg);
-		spin_lock(&session->s_cap_lock);
-	}
+again:
+	list_splice_init(&session->s_cap_releases, &tmp_list);
+	num_cap_releases = session->s_num_cap_releases;
+	session->s_num_cap_releases = 0;
 	spin_unlock(&session->s_cap_lock);
-}
 
-static void discard_cap_releases(struct ceph_mds_client *mdsc,
-				 struct ceph_mds_session *session)
-{
-	struct ceph_msg *msg;
-	struct ceph_mds_cap_release *head;
-	unsigned num;
-
-	dout("discard_cap_releases mds%d\n", session->s_mds);
+	while (!list_empty(&tmp_list)) {
+		if (!msg) {
+			msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
+					PAGE_CACHE_SIZE, GFP_NOFS, false);
+			if (!msg)
+				goto out_err;
+			head = msg->front.iov_base;
+			head->num = cpu_to_le32(0);
+			msg->front.iov_len = sizeof(*head);
+		}
+		cap = list_first_entry(&tmp_list, struct ceph_cap,
+					session_caps);
+		list_del(&cap->session_caps);
+		num_cap_releases--;
 
-	if (!list_empty(&session->s_cap_releases)) {
-		/* zero out the in-progress message */
-		msg = list_first_entry(&session->s_cap_releases,
-					struct ceph_msg, list_head);
 		head = msg->front.iov_base;
-		num = le32_to_cpu(head->num);
-		dout("discard_cap_releases mds%d %p %u\n",
-		     session->s_mds, msg, num);
-		head->num = cpu_to_le32(0);
-		msg->front.iov_len = sizeof(*head);
-		session->s_num_cap_releases += num;
+		le32_add_cpu(&head->num, 1);
+		item = msg->front.iov_base + msg->front.iov_len;
+		item->ino = cpu_to_le64(cap->cap_ino);
+		item->cap_id = cpu_to_le64(cap->cap_id);
+		item->migrate_seq = cpu_to_le32(cap->mseq);
+		item->seq = cpu_to_le32(cap->issue_seq);
+		msg->front.iov_len += sizeof(*item);
+
+		ceph_put_cap(mdsc, cap);
+
+		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+			msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+			dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+			ceph_con_send(&session->s_con, msg);
+			msg = NULL;
+		}
 	}
 
-	/* requeue completed messages */
-	while (!list_empty(&session->s_cap_releases_done)) {
-		msg = list_first_entry(&session->s_cap_releases_done,
-				 struct ceph_msg, list_head);
-		list_del_init(&msg->list_head);
+	BUG_ON(num_cap_releases != 0);
 
-		head = msg->front.iov_base;
-		num = le32_to_cpu(head->num);
-		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
-		     num);
-		session->s_num_cap_releases += num;
-		head->num = cpu_to_le32(0);
-		msg->front.iov_len = sizeof(*head);
-		list_add(&msg->list_head, &session->s_cap_releases);
+	spin_lock(&session->s_cap_lock);
+	if (!list_empty(&session->s_cap_releases))
+		goto again;
+	spin_unlock(&session->s_cap_lock);
+
+	if (msg) {
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+		ceph_con_send(&session->s_con, msg);
 	}
+	return;
+out_err:
+	pr_err("send_cap_releases mds%d, failed to allocate message\n",
+		session->s_mds);
+	spin_lock(&session->s_cap_lock);
+	list_splice(&tmp_list, &session->s_cap_releases);
+	session->s_num_cap_releases += num_cap_releases;
+	spin_unlock(&session->s_cap_lock);
 }
 
 /*
@@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 
 	order = get_order(size * num_entries);
 	while (order >= 0) {
-		rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+		rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
+							__GFP_NOWARN,
 							order);
 		if (rinfo->dir_in)
 			break;
@@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
 			struct ceph_mds_request, r_node);
 }
 
-static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+static inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
 {
-	struct ceph_mds_request *req = __get_oldest_req(mdsc);
-
-	if (req)
-		return req->r_tid;
-	return 0;
+	return mdsc->oldest_tid;
 }
 
 /*
@@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	/* wait */
 	mutex_unlock(&mdsc->mutex);
 	dout("do_request waiting\n");
-	if (req->r_timeout) {
-		err = (long)wait_for_completion_killable_timeout(
-			&req->r_completion, req->r_timeout);
-		if (err == 0)
-			err = -EIO;
-	} else if (req->r_wait_for_completion) {
+	if (!req->r_timeout && req->r_wait_for_completion) {
 		err = req->r_wait_for_completion(mdsc, req);
 	} else {
-		err = wait_for_completion_killable(&req->r_completion);
+		long timeleft = wait_for_completion_killable_timeout(
+					&req->r_completion,
+					ceph_timeout_jiffies(req->r_timeout));
+		if (timeleft > 0)
+			err = 0;
+		else if (!timeleft)
+			err = -EIO;  /* timed out */
+		else
+			err = timeleft;  /* killed */
 	}
 	dout("do_request waited, got %d\n", err);
 	mutex_lock(&mdsc->mutex);
@@ -2496,7 +2529,6 @@ out_err:
 	}
 	mutex_unlock(&mdsc->mutex);
 
-	ceph_add_cap_releases(mdsc, req->r_session);
 	mutex_unlock(&session->s_mutex);
 
 	/* kick calling process */
@@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	 */
 	session->s_cap_reconnect = 1;
 	/* drop old cap expires; we're about to reestablish that state */
-	discard_cap_releases(mdsc, session);
-	spin_unlock(&session->s_cap_lock);
+	cleanup_cap_releases(mdsc, session);
 
 	/* trim unused caps to reduce MDS's cache rejoin time */
 	if (mdsc->fsc->sb->s_root)
@@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 
 	reply->hdr.data_len = cpu_to_le32(pagelist->length);
 	ceph_msg_data_add_pagelist(reply, pagelist);
+
+	ceph_early_kick_flushing_caps(mdsc, session);
+
 	ceph_con_send(&session->s_con, reply);
 
 	mutex_unlock(&session->s_mutex);
@@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work)
 			send_renew_caps(mdsc, s);
 		else
 			ceph_con_keepalive(&s->s_con);
-		ceph_add_cap_releases(mdsc, s);
 		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
 		    s->s_state == CEPH_MDS_SESSION_HUNG)
 			ceph_send_cap_releases(mdsc, s);
@@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	atomic_set(&mdsc->num_sessions, 0);
 	mdsc->max_sessions = 0;
 	mdsc->stopping = 0;
+	mdsc->last_snap_seq = 0;
 	init_rwsem(&mdsc->snap_rwsem);
 	mdsc->snap_realms = RB_ROOT;
 	INIT_LIST_HEAD(&mdsc->snap_empty);
 	spin_lock_init(&mdsc->snap_empty_lock);
 	mdsc->last_tid = 0;
+	mdsc->oldest_tid = 0;
 	mdsc->request_tree = RB_ROOT;
 	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
 	mdsc->last_renew_caps = jiffies;
@@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	spin_lock_init(&mdsc->cap_delay_lock);
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
 	spin_lock_init(&mdsc->snap_flush_lock);
-	mdsc->cap_flush_seq = 0;
+	mdsc->last_cap_flush_tid = 1;
+	mdsc->cap_flush_tree = RB_ROOT;
 	INIT_LIST_HEAD(&mdsc->cap_dirty);
 	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
 	mdsc->num_cap_flushing = 0;
@@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	ceph_caps_init(mdsc);
 	ceph_adjust_min_caps(mdsc, fsc->min_caps);
 
+	init_rwsem(&mdsc->pool_perm_rwsem);
+	mdsc->pool_perm_tree = RB_ROOT;
+
 	return 0;
 }
 
@@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
  */
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_request *req;
-	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    fsc->client->options->mount_timeout * HZ);
+				    ceph_timeout_jiffies(opts->mount_timeout));
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -3485,7 +3524,8 @@ restart:
 			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
 		else
 			nextreq = NULL;
-		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+		if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
+		    (req->r_op & CEPH_MDS_OP_WRITE)) {
 			/* write op */
 			ceph_mdsc_get_request(req);
 			if (nextreq)
@@ -3513,7 +3553,7 @@ restart:
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
-	u64 want_tid, want_flush;
+	u64 want_tid, want_flush, want_snap;
 
 	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return;
@@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 
 	ceph_flush_dirty_caps(mdsc);
 	spin_lock(&mdsc->cap_dirty_lock);
-	want_flush = mdsc->cap_flush_seq;
+	want_flush = mdsc->last_cap_flush_tid;
 	spin_unlock(&mdsc->cap_dirty_lock);
 
-	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+	down_read(&mdsc->snap_rwsem);
+	want_snap = mdsc->last_snap_seq;
+	up_read(&mdsc->snap_rwsem);
+
+	dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
+	     want_tid, want_flush, want_snap);
 
 	wait_unsafe_requests(mdsc, want_tid);
-	wait_caps_flush(mdsc, want_flush);
+	wait_caps_flush(mdsc, want_flush, want_snap);
 }
 
 /*
@@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_fs_client *fsc = mdsc->fsc;
-	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	dout("waiting for sessions to close\n");
 	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
-			   timeout);
+			   ceph_timeout_jiffies(opts->mount_timeout));
 
 	/* tear down remaining sessions */
 	mutex_lock(&mdsc->mutex);
@@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 		ceph_mdsmap_destroy(mdsc->mdsmap);
 	kfree(mdsc->sessions);
 	ceph_caps_finalize(mdsc);
+	ceph_pool_perm_destroy(mdsc);
 }
 
 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 1875b5d98..762757e6c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -139,7 +139,6 @@ struct ceph_mds_session {
 	int		  s_cap_reconnect;
 	int		  s_readonly;
 	struct list_head  s_cap_releases; /* waiting cap_release messages */
-	struct list_head  s_cap_releases_done; /* ready to send */
 	struct ceph_cap  *s_cap_iterator;
 
 	/* protected by mutex */
@@ -228,7 +227,7 @@ struct ceph_mds_request {
 	int r_err;
 	bool r_aborted;
 
-	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 	unsigned long r_started;  /* start time to measure timeout against */
 	unsigned long r_request_started; /* start time for mds request only,
 					    used to measure lease durations */
@@ -254,12 +253,21 @@ struct ceph_mds_request {
 	bool		  r_got_unsafe, r_got_safe, r_got_result;
 
 	bool              r_did_prepopulate;
+	long long	  r_dir_release_cnt;
+	long long	  r_dir_ordered_cnt;
+	int		  r_readdir_cache_idx;
 	u32               r_readdir_offset;
 
 	struct ceph_cap_reservation r_caps_reservation;
 	int r_num_caps;
 };
 
+struct ceph_pool_perm {
+	struct rb_node node;
+	u32 pool;
+	int perm;
+};
+
 /*
  * mds client state
  */
@@ -284,12 +292,15 @@ struct ceph_mds_client {
 	 * references (implying they contain no inodes with caps) that
 	 * should be destroyed.
 	 */
+	u64			last_snap_seq;
 	struct rw_semaphore     snap_rwsem;
 	struct rb_root          snap_realms;
 	struct list_head        snap_empty;
 	spinlock_t              snap_empty_lock;  /* protect snap_empty */
 
 	u64                    last_tid;      /* most recent mds request */
+	u64                    oldest_tid;    /* oldest incomplete mds request,
+						 excluding setfilelock requests */
 	struct rb_root         request_tree;  /* pending mds requests */
 	struct delayed_work    delayed_work;  /* delayed work */
 	unsigned long    last_renew_caps;  /* last time we renewed our caps */
@@ -298,7 +309,8 @@ struct ceph_mds_client {
 	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
 	spinlock_t       snap_flush_lock;
 
-	u64               cap_flush_seq;
+	u64               last_cap_flush_tid;
+	struct rb_root    cap_flush_tree;
 	struct list_head  cap_dirty;        /* inodes with dirty caps */
 	struct list_head  cap_dirty_migrating; /* ...that are migration... */
 	int               num_cap_flushing; /* # caps we are flushing */
@@ -328,6 +340,9 @@ struct ceph_mds_client {
 	spinlock_t	  dentry_lru_lock;
 	struct list_head  dentry_lru;
 	int		  num_dentry;
+
+	struct rw_semaphore     pool_perm_rwsem;
+	struct rb_root		pool_perm_tree;
 };
 
 extern const char *ceph_mds_op_name(int op);
@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
-extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-				 struct ceph_mds_session *session);
 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
 				   struct ceph_mds_session *session);
 
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a97e39f09..233d906ae 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
 }
 
 
-static struct ceph_snap_context *empty_snapc;
+struct ceph_snap_context *ceph_empty_snapc;
 
 /*
  * build the snap context for a given realm.
@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 		return 0;
 	}
 
-	if (num == 0 && realm->seq == empty_snapc->seq) {
-		ceph_get_snap_context(empty_snapc);
-		snapc = empty_snapc;
+	if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
+		ceph_get_snap_context(ceph_empty_snapc);
+		snapc = ceph_empty_snapc;
 		goto done;
 	}
 
@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
 	return 0;
 }
 
+static bool has_new_snaps(struct ceph_snap_context *o,
+			  struct ceph_snap_context *n)
+{
+	if (n->num_snaps == 0)
+		return false;
+	/* snaps are in descending order */
+	return n->snaps[0] > o->seq;
+}
 
 /*
  * When a snapshot is applied, the size/mtime inode metadata is queued
@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
+	struct ceph_snap_context *old_snapc, *new_snapc;
 	int used, dirty;
 
 	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 	used = __ceph_caps_used(ci);
 	dirty = __ceph_caps_dirty(ci);
 
+	old_snapc = ci->i_head_snapc;
+	new_snapc = ci->i_snap_realm->cached_context;
+
 	/*
 	 * If there is a write in progress, treat that as a dirty Fw,
 	 * even though it hasn't completed yet; by the time we finish
@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		   writes in progress now were started before the previous
 		   cap_snap.  lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
-		kfree(capsnap);
-	} else if (ci->i_snap_realm->cached_context == empty_snapc) {
-		dout("queue_cap_snap %p empty snapc\n", inode);
-		kfree(capsnap);
-	} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
-			    CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
-		struct ceph_snap_context *snapc = ci->i_head_snapc;
-
-		/*
-		 * if we are a sync write, we may need to go to the snaprealm
-		 * to get the current snapc.
-		 */
-		if (!snapc)
-			snapc = ci->i_snap_realm->cached_context;
+		goto update_snapc;
+	}
+	if (ci->i_wrbuffer_ref_head == 0 &&
+	    !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
+		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+		goto update_snapc;
+	}
 
-		dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
-		     inode, capsnap, snapc, ceph_cap_string(dirty));
-		ihold(inode);
+	BUG_ON(!old_snapc);
 
-		atomic_set(&capsnap->nref, 1);
-		capsnap->ci = ci;
-		INIT_LIST_HEAD(&capsnap->ci_item);
-		INIT_LIST_HEAD(&capsnap->flushing_item);
-
-		capsnap->follows = snapc->seq;
-		capsnap->issued = __ceph_caps_issued(ci, NULL);
-		capsnap->dirty = dirty;
-
-		capsnap->mode = inode->i_mode;
-		capsnap->uid = inode->i_uid;
-		capsnap->gid = inode->i_gid;
-
-		if (dirty & CEPH_CAP_XATTR_EXCL) {
-			__ceph_build_xattrs_blob(ci);
-			capsnap->xattr_blob =
-				ceph_buffer_get(ci->i_xattrs.blob);
-			capsnap->xattr_version = ci->i_xattrs.version;
-		} else {
-			capsnap->xattr_blob = NULL;
-			capsnap->xattr_version = 0;
+	/*
+	 * There is no need to send FLUSHSNAP message to MDS if there is
+	 * no new snapshot. But when there is dirty pages or on-going
+	 * writes, we still need to create cap_snap. cap_snap is needed
+	 * by the write path and page writeback path.
+	 *
+	 * also see ceph_try_drop_cap_snap()
+	 */
+	if (has_new_snaps(old_snapc, new_snapc)) {
+		if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
+			capsnap->need_flush = true;
+	} else {
+		if (!(used & CEPH_CAP_FILE_WR) &&
+		    ci->i_wrbuffer_ref_head == 0) {
+			dout("queue_cap_snap %p "
+			     "no new_snap|dirty_page|writing\n", inode);
+			goto update_snapc;
 		}
+	}
 
-		capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-
-		/* dirty page count moved from _head to this cap_snap;
-		   all subsequent writes page dirties occur _after_ this
-		   snapshot. */
-		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
-		ci->i_wrbuffer_ref_head = 0;
-		capsnap->context = snapc;
-		ci->i_head_snapc =
-			ceph_get_snap_context(ci->i_snap_realm->cached_context);
-		dout(" new snapc is %p\n", ci->i_head_snapc);
-		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
-		if (used & CEPH_CAP_FILE_WR) {
-			dout("queue_cap_snap %p cap_snap %p snapc %p"
-			     " seq %llu used WR, now pending\n", inode,
-			     capsnap, snapc, snapc->seq);
-			capsnap->writing = 1;
-		} else {
-			/* note mtime, size NOW. */
-			__ceph_finish_cap_snap(ci, capsnap);
-		}
+	dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
+	     inode, capsnap, old_snapc, ceph_cap_string(dirty),
+	     capsnap->need_flush ? "" : "no_flush");
+	ihold(inode);
+
+	atomic_set(&capsnap->nref, 1);
+	capsnap->ci = ci;
+	INIT_LIST_HEAD(&capsnap->ci_item);
+	INIT_LIST_HEAD(&capsnap->flushing_item);
+
+	capsnap->follows = old_snapc->seq;
+	capsnap->issued = __ceph_caps_issued(ci, NULL);
+	capsnap->dirty = dirty;
+
+	capsnap->mode = inode->i_mode;
+	capsnap->uid = inode->i_uid;
+	capsnap->gid = inode->i_gid;
+
+	if (dirty & CEPH_CAP_XATTR_EXCL) {
+		__ceph_build_xattrs_blob(ci);
+		capsnap->xattr_blob =
+			ceph_buffer_get(ci->i_xattrs.blob);
+		capsnap->xattr_version = ci->i_xattrs.version;
 	} else {
-		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
-		kfree(capsnap);
+		capsnap->xattr_blob = NULL;
+		capsnap->xattr_version = 0;
 	}
 
+	capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+	/* dirty page count moved from _head to this cap_snap;
+	   all subsequent writes page dirties occur _after_ this
+	   snapshot. */
+	capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+	ci->i_wrbuffer_ref_head = 0;
+	capsnap->context = old_snapc;
+	list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+	old_snapc = NULL;
+
+	if (used & CEPH_CAP_FILE_WR) {
+		dout("queue_cap_snap %p cap_snap %p snapc %p"
+		     " seq %llu used WR, now pending\n", inode,
+		     capsnap, old_snapc, old_snapc->seq);
+		capsnap->writing = 1;
+	} else {
+		/* note mtime, size NOW. */
+		__ceph_finish_cap_snap(ci, capsnap);
+	}
+	capsnap = NULL;
+
+update_snapc:
+	if (ci->i_head_snapc) {
+		ci->i_head_snapc = ceph_get_snap_context(new_snapc);
+		dout(" new snapc is %p\n", new_snapc);
+	}
 	spin_unlock(&ci->i_ceph_lock);
+
+	kfree(capsnap);
+	ceph_put_snap_context(old_snapc);
 }
 
 /*
@@ -699,6 +730,8 @@ more:
 
 		/* queue realm for cap_snap creation */
 		list_add(&realm->dirty_item, &dirty_realms);
+		if (realm->seq > mdsc->last_snap_seq)
+			mdsc->last_snap_seq = realm->seq;
 
 		invalidate = 1;
 	} else if (!realm->cached_context) {
@@ -964,14 +997,14 @@ out:
 
 int __init ceph_snap_init(void)
 {
-	empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
-	if (!empty_snapc)
+	ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
+	if (!ceph_empty_snapc)
 		return -ENOMEM;
-	empty_snapc->seq = 1;
+	ceph_empty_snapc->seq = 1;
 	return 0;
 }
 
 void ceph_snap_exit(void)
 {
-	ceph_put_snap_context(empty_snapc);
+	ceph_put_snap_context(ceph_empty_snapc);
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4e9905374..d1c833c32 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -134,10 +134,12 @@ enum {
 	Opt_noino32,
 	Opt_fscache,
 	Opt_nofscache,
+	Opt_poolperm,
+	Opt_nopoolperm,
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 	Opt_acl,
 #endif
-	Opt_noacl
+	Opt_noacl,
 };
 
 static match_table_t fsopt_tokens = {
@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
 	{Opt_noino32, "noino32"},
 	{Opt_fscache, "fsc"},
 	{Opt_nofscache, "nofsc"},
+	{Opt_poolperm, "poolperm"},
+	{Opt_nopoolperm, "nopoolperm"},
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 	{Opt_acl, "acl"},
 #endif
@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_nofscache:
 		fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
 		break;
+	case Opt_poolperm:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
+		printk ("pool perm");
+		break;
+	case Opt_nopoolperm:
+		fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
+		break;
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 	case Opt_acl:
 		fsopt->sb_flags |= MS_POSIXACL;
@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",nodcache");
 	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
 		seq_puts(m, ",fsc");
+	if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
+		seq_puts(m, ",nopoolperm");
 
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 	if (fsopt->sb_flags & MS_POSIXACL)
@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
  */
 struct kmem_cache *ceph_inode_cachep;
 struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
 struct kmem_cache *ceph_dentry_cachep;
 struct kmem_cache *ceph_file_cachep;
 
@@ -634,6 +648,10 @@ static int __init init_caches(void)
 				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 	if (ceph_cap_cachep == NULL)
 		goto bad_cap;
+	ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+					   SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_flush_cachep == NULL)
+		goto bad_cap_flush;
 
 	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
@@ -652,6 +670,8 @@ static int __init init_caches(void)
 bad_file:
 	kmem_cache_destroy(ceph_dentry_cachep);
 bad_dentry:
+	kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
 	kmem_cache_destroy(ceph_cap_cachep);
 bad_cap:
 	kmem_cache_destroy(ceph_inode_cachep);
@@ -668,6 +688,7 @@ static void destroy_caches(void)
 
 	kmem_cache_destroy(ceph_inode_cachep);
 	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_cap_flush_cachep);
 	kmem_cache_destroy(ceph_dentry_cachep);
 	kmem_cache_destroy(ceph_file_cachep);
 
@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = fsc->client->options->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fa20e1318..2f2460d23 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -35,6 +35,7 @@
 #define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
 #define CEPH_MOUNT_OPT_DCACHE          (1<<9) /* use dcache for readdir etc */
 #define CEPH_MOUNT_OPT_FSCACHE         (1<<10) /* use fscache */
+#define CEPH_MOUNT_OPT_NOPOOLPERM      (1<<11) /* no pool permission check */
 
 #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES | \
 				   CEPH_MOUNT_OPT_DCACHE)
@@ -121,11 +122,21 @@ struct ceph_cap {
 	struct rb_node ci_node;          /* per-ci cap tree */
 	struct ceph_mds_session *session;
 	struct list_head session_caps;   /* per-session caplist */
-	int mds;
 	u64 cap_id;       /* unique cap id (mds provided) */
-	int issued;       /* latest, from the mds */
-	int implemented;  /* implemented superset of issued (for revocation) */
-	int mds_wanted;
+	union {
+		/* in-use caps */
+		struct {
+			int issued;       /* latest, from the mds */
+			int implemented;  /* implemented superset of
+					     issued (for revocation) */
+			int mds, mds_wanted;
+		};
+		/* caps to release */
+		struct {
+			u64 cap_ino;
+			int queue_release;
+		};
+	};
 	u32 seq, issue_seq, mseq;
 	u32 cap_gen;      /* active/stale cycle */
 	unsigned long last_used;
@@ -163,6 +174,7 @@ struct ceph_cap_snap {
 	int writing;   /* a sync write is still in progress */
 	int dirty_pages;     /* dirty pages awaiting writeback */
 	bool inline_data;
+	bool need_flush;
 };
 
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
@@ -174,6 +186,16 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 	}
 }
 
+struct ceph_cap_flush {
+	u64 tid;
+	int caps;
+	struct rb_node g_node; // global
+	union {
+		struct rb_node i_node; // inode
+		struct list_head list;
+	};
+};
+
 /*
  * The frag tree describes how a directory is fragmented, potentially across
  * multiple metadata servers.  It is also used to indicate points where
@@ -259,9 +281,9 @@ struct ceph_inode_info {
 	u32 i_time_warp_seq;
 
 	unsigned i_ceph_flags;
-	int i_ordered_count;
-	atomic_t i_release_count;
-	atomic_t i_complete_count;
+	atomic64_t i_release_count;
+	atomic64_t i_ordered_count;
+	atomic64_t i_complete_seq[2];
 
 	struct ceph_dir_layout i_dir_layout;
 	struct ceph_file_layout i_layout;
@@ -283,11 +305,11 @@ struct ceph_inode_info {
 	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
 	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
 	struct list_head i_dirty_item, i_flushing_item;
-	u64 i_cap_flush_seq;
 	/* we need to track cap writeback on a per-cap-bit basis, to allow
 	 * overlapping, pipelined cap flushes to the mds.  we can probably
 	 * reduce the tid to 8 bits if we're concerned about inode size. */
-	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+	struct ceph_cap_flush *i_prealloc_cap_flush;
+	struct rb_root i_cap_flush_tree;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
 	unsigned long i_hold_caps_min; /* jiffies */
 	unsigned long i_hold_caps_max; /* jiffies */
@@ -438,36 +460,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 /*
  * Ceph inode.
  */
-#define CEPH_I_DIR_ORDERED	1  /* dentries in dir are ordered */
-#define CEPH_I_NODELAY		4  /* do not delay cap release */
-#define CEPH_I_FLUSH		8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH		16 /* do not flush dirty caps */
+#define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
+#define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
+#define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH		(1 << 3)  /* do not flush dirty caps */
+#define CEPH_I_POOL_PERM	(1 << 4)  /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD		(1 << 5)  /* can read from pool */
+#define CEPH_I_POOL_WR		(1 << 6)  /* can write to pool */
+
 
 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
-					   int release_count, int ordered_count)
+					   long long release_count,
+					   long long ordered_count)
 {
-	atomic_set(&ci->i_complete_count, release_count);
-	if (ci->i_ordered_count == ordered_count)
-		ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
-	else
-		ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
+	smp_mb__before_atomic();
+	atomic64_set(&ci->i_complete_seq[0], release_count);
+	atomic64_set(&ci->i_complete_seq[1], ordered_count);
 }
 
 static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
 {
-	atomic_inc(&ci->i_release_count);
+	atomic64_inc(&ci->i_release_count);
+}
+
+static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
+{
+	atomic64_inc(&ci->i_ordered_count);
 }
 
 static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
 {
-	return atomic_read(&ci->i_complete_count) ==
-		atomic_read(&ci->i_release_count);
+	return atomic64_read(&ci->i_complete_seq[0]) ==
+		atomic64_read(&ci->i_release_count);
 }
 
 static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
 {
-	return __ceph_dir_is_complete(ci) &&
-		(ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
+	return  atomic64_read(&ci->i_complete_seq[0]) ==
+		atomic64_read(&ci->i_release_count) &&
+		atomic64_read(&ci->i_complete_seq[1]) ==
+		atomic64_read(&ci->i_ordered_count);
 }
 
 static inline void ceph_dir_clear_complete(struct inode *inode)
@@ -477,20 +509,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
 
 static inline void ceph_dir_clear_ordered(struct inode *inode)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	spin_lock(&ci->i_ceph_lock);
-	ci->i_ordered_count++;
-	ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
-	spin_unlock(&ci->i_ceph_lock);
+	__ceph_dir_clear_ordered(ceph_inode(inode));
 }
 
 static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	bool ret;
-	spin_lock(&ci->i_ceph_lock);
-	ret = __ceph_dir_is_complete_ordered(ci);
-	spin_unlock(&ci->i_ceph_lock);
+	bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
+	smp_rmb();
 	return ret;
 }
 
@@ -552,7 +577,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
 {
 	return ci->i_dirty_caps | ci->i_flushing_caps;
 }
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+				  struct ceph_cap_flush **pcf);
 
 extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 				      struct ceph_cap *ocap, int mask);
@@ -606,16 +634,20 @@ struct ceph_file_info {
 	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
 	unsigned next_offset;  /* offset of next chunk (last_name's + 1) */
 	char *last_name;       /* last entry in previous chunk */
-	struct dentry *dentry; /* next dentry (for dcache readdir) */
-	int dir_release_count;
-	int dir_ordered_count;
+	long long dir_release_count;
+	long long dir_ordered_count;
+	int readdir_cache_idx;
 
 	/* used for -o dirstat read() on directory thing */
 	char *dir_info;
 	int dir_info_len;
 };
 
-
+struct ceph_readdir_cache_control {
+	struct page  *page;
+	struct dentry **dentries;
+	int index;
+};
 
 /*
  * A "snap realm" describes a subset of the file hierarchy sharing
@@ -687,6 +719,7 @@ static inline int default_congestion_kb(void)
 
 
 /* snap.c */
+extern struct ceph_snap_context *ceph_empty_snapc;
 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
 					       u64 ino);
 extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -713,8 +746,8 @@ extern void ceph_snap_exit(void);
 static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
 {
 	return !list_empty(&ci->i_cap_snaps) &&
-		list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
-			   ci_item)->writing;
+	       list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
+			       ci_item)->writing;
 }
 
 /* inode.c */
@@ -838,12 +871,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 			 struct ceph_cap *cap);
 extern int ceph_is_any_caps(struct inode *inode);
 
-extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
-				u64 cap_id, u32 migrate_seq, u32 issue_seq);
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
 		      int datasync);
+extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+					  struct ceph_mds_session *session);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
@@ -879,6 +912,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
 /* addr.c */
 extern const struct address_space_operations ceph_aops;
 extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
+extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 
 /* file.c */
 extern const struct file_operations ceph_file_fops;
@@ -890,7 +926,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
 				  char *data, size_t len);
-int ceph_uninline_data(struct file *filp, struct page *locked_page);
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct file_operations ceph_snapdir_fops;
@@ -911,6 +946,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
 extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
+extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
 
 /*
  * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index cd7ffad40..819163d83 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	struct inode *inode = d_inode(dentry);
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	int issued;
 	int err;
 	int dirty = 0;
@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	char *newval = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
 	int required_blob_size;
+	bool lock_snap_rwsem = false;
 
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!xattr)
 		goto out;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		goto out;
+
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
-	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
+
+	if (!lock_snap_rwsem && !ci->i_head_snapc) {
+		lock_snap_rwsem = true;
+		if (!down_read_trylock(&mdsc->snap_rwsem)) {
+			spin_unlock(&ci->i_ceph_lock);
+			down_read(&mdsc->snap_rwsem);
+			spin_lock(&ci->i_ceph_lock);
+			goto retry;
+		}
+	}
+
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	__build_xattrs(inode);
 
 	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
@@ -966,7 +984,7 @@ retry:
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
 		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
 		if (!blob)
-			goto out;
+			goto do_sync_unlocked;
 		spin_lock(&ci->i_ceph_lock);
 		if (ci->i_xattrs.prealloc_blob)
 			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
@@ -978,21 +996,28 @@ retry:
 			  flags, value ? 1 : -1, &xattr);
 
 	if (!err) {
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+					       &prealloc_cf);
 		ci->i_xattrs.dirty = true;
 		inode->i_ctime = CURRENT_TIME;
 	}
 
 	spin_unlock(&ci->i_ceph_lock);
+	if (lock_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
 do_sync_unlocked:
+	if (lock_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
 	err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
+	ceph_free_cap_flush(prealloc_cf);
 	kfree(newname);
 	kfree(newval);
 	kfree(xattr);
@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 	struct inode *inode = d_inode(dentry);
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	int issued;
 	int err;
 	int required_blob_size;
 	int dirty;
+	bool lock_snap_rwsem = false;
 
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
 		goto do_sync_unlocked;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
-	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
-
 	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
+
+	if (!lock_snap_rwsem && !ci->i_head_snapc) {
+		lock_snap_rwsem = true;
+		if (!down_read_trylock(&mdsc->snap_rwsem)) {
+			spin_unlock(&ci->i_ceph_lock);
+			down_read(&mdsc->snap_rwsem);
+			spin_lock(&ci->i_ceph_lock);
+			goto retry;
+		}
+	}
+
+	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
 	__build_xattrs(inode);
 
 	required_blob_size = __get_required_blob_size(ci, 0, 0);
@@ -1080,7 +1123,7 @@ retry:
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
 		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
 		if (!blob)
-			goto out;
+			goto do_sync_unlocked;
 		spin_lock(&ci->i_ceph_lock);
 		if (ci->i_xattrs.prealloc_blob)
 			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
@@ -1090,18 +1133,24 @@ retry:
 
 	err = __remove_xattr_by_name(ceph_inode(inode), name);
 
-	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+				       &prealloc_cf);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
 	spin_unlock(&ci->i_ceph_lock);
+	if (lock_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
 do_sync_unlocked:
+	if (lock_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
+	ceph_free_cap_flush(prealloc_cf);
 	err = ceph_send_removexattr(dentry, name);
-out:
 	return err;
 }
 
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index a2172f3f6..e7b478b49 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -192,6 +192,15 @@ config CIFS_SMB2
 	  options are also slightly simpler (compared to CIFS) due
 	  to protocol improvements.
 
+config CIFS_SMB311
+	bool "SMB3.1.1 network file system support (Experimental)"
+	depends on CIFS_SMB2 && INET
+
+	help
+	  This enables experimental support for the newest, SMB3.1.1, dialect.
+	  This dialect includes improved security negotiation features.
+	  If unsure, say N
+
 config CIFS_FSCACHE
 	  bool "Provide CIFS client caching support"
 	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 252f5c158..a782b2290 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 #endif
 
 /* Functions related to symlinks */
-extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
+extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
 extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 			 int buflen);
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 22b289a3b..b406a32de 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -171,6 +171,10 @@ enum smb_version {
 	Smb_21,
 	Smb_30,
 	Smb_302,
+#ifdef CONFIG_CIFS_SMB311
+	Smb_311,
+#endif /* SMB311 */
+	Smb_version_err
 };
 
 struct mid_q_entry;
@@ -368,6 +372,8 @@ struct smb_version_operations {
 	void (*new_lease_key)(struct cifs_fid *);
 	int (*generate_signingkey)(struct cifs_ses *);
 	int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+	int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
+			     struct cifsFileInfo *src_file);
 	int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
 				struct cifs_sb_info *, const unsigned char *,
 				char *, unsigned int *);
@@ -386,6 +392,9 @@ struct smb_version_operations {
 	int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
 			struct cifsFileInfo *target_file, u64 src_off, u64 len,
 			u64 dest_off);
+	int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src,
+			struct cifsFileInfo *target_file, u64 src_off, u64 len,
+			u64 dest_off);
 	int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
 	ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
 			const unsigned char *, const unsigned char *, char *,
@@ -1617,4 +1626,8 @@ extern struct smb_version_values smb30_values;
 #define SMB302_VERSION_STRING	"3.02"
 /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
 extern struct smb_version_values smb302_values;
+#define SMB311_VERSION_STRING	"3.1.1"
+#define ALT_SMB311_VERSION_STRING "3.11"
+extern struct smb_version_operations smb311_operations;
+extern struct smb_version_values smb311_values;
 #endif	/* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 5f9822ac0..47b030da0 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2255,6 +2255,8 @@ typedef struct {
 
 
 /* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
+#define FILE_SUPPORTS_SPARSE_VDL	0x10000000 /* faster nonsparse extend */
+#define FILE_SUPPORTS_BLOCK_REFCOUNTING	0x08000000 /* allow ioctl dup extents */
 #define FILE_SUPPORT_INTEGRITY_STREAMS	0x04000000
 #define FILE_SUPPORTS_USN_JOURNAL	0x02000000
 #define FILE_SUPPORTS_OPEN_BY_FILE_ID	0x01000000
@@ -2310,6 +2312,16 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
 	char FileName[1];
 } __attribute__((packed)) FILE_ALL_INFO;	/* level 0x107 QPathInfo */
 
+typedef struct {
+	__le64 AllocationSize;
+	__le64 EndOfFile;	/* size ie offset to first free byte in file */
+	__le32 NumberOfLinks;	/* hard links */
+	__u8 DeletePending;
+	__u8 Directory;
+	__u16 Pad;
+} __attribute__((packed)) FILE_STANDARD_INFO;	/* level 0x102 QPathInfo */
+
+
 /* defines for enumerating possible values of the Unix type field below */
 #define UNIX_FILE      0
 #define UNIX_DIR       1
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f26ffbfc6..672ef35c9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -625,9 +625,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
 		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
-	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
-			server->capabilities & CAP_EXTENDED_SECURITY) &&
-				(pSMBr->EncryptionKeyLength == 0)) {
+	} else if (pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
+			server->capabilities & CAP_EXTENDED_SECURITY) {
 		server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
 		rc = decode_ext_sec_blob(ses, pSMBr);
 	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8383d5ea4..773f4dc77 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -280,6 +280,11 @@ static const match_table_t cifs_smb_version_tokens = {
 	{ Smb_21, SMB21_VERSION_STRING },
 	{ Smb_30, SMB30_VERSION_STRING },
 	{ Smb_302, SMB302_VERSION_STRING },
+#ifdef CONFIG_CIFS_SMB311
+	{ Smb_311, SMB311_VERSION_STRING },
+	{ Smb_311, ALT_SMB311_VERSION_STRING },
+#endif /* SMB311 */
+	{ Smb_version_err, NULL }
 };
 
 static int ip_connect(struct TCP_Server_Info *server);
@@ -1133,6 +1138,12 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 		vol->ops = &smb30_operations; /* currently identical with 3.0 */
 		vol->vals = &smb302_values;
 		break;
+#ifdef CONFIG_CIFS_SMB311
+	case Smb_311:
+		vol->ops = &smb311_operations;
+		vol->vals = &smb311_values;
+		break;
+#endif /* SMB311 */
 #endif
 	default:
 		cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
@@ -3461,6 +3472,8 @@ try_mount_again:
 		else if (ses)
 			cifs_put_smb_ses(ses);
 
+		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+
 		free_xid(xid);
 	}
 #endif
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 8b7898b76..49b8b6e41 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -31,12 +31,15 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
+#include <linux/btrfs.h>
 
 #define CIFS_IOCTL_MAGIC	0xCF
 #define CIFS_IOC_COPYCHUNK_FILE	_IOW(CIFS_IOCTL_MAGIC, 3, int)
+#define CIFS_IOC_SET_INTEGRITY  _IO(CIFS_IOCTL_MAGIC, 4)
 
 static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
-			unsigned long srcfd, u64 off, u64 len, u64 destoff)
+			unsigned long srcfd, u64 off, u64 len, u64 destoff,
+			bool dup_extents)
 {
 	int rc;
 	struct cifsFileInfo *smb_file_target = dst_file->private_data;
@@ -109,9 +112,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
 	truncate_inode_pages_range(&target_inode->i_data, destoff,
 				   PAGE_CACHE_ALIGN(destoff + len)-1);
 
-	if (target_tcon->ses->server->ops->clone_range)
+	if (dup_extents && target_tcon->ses->server->ops->duplicate_extents)
+		rc = target_tcon->ses->server->ops->duplicate_extents(xid,
+			smb_file_src, smb_file_target, off, len, destoff);
+	else if (!dup_extents && target_tcon->ses->server->ops->clone_range)
 		rc = target_tcon->ses->server->ops->clone_range(xid,
 			smb_file_src, smb_file_target, off, len, destoff);
+	else
+		rc = -EOPNOTSUPP;
 
 	/* force revalidate of size and timestamps of target file now
 	   that target is updated on the server */
@@ -205,7 +213,20 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 			}
 			break;
 		case CIFS_IOC_COPYCHUNK_FILE:
-			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
+			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false);
+			break;
+		case BTRFS_IOC_CLONE:
+			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true);
+			break;
+		case CIFS_IOC_SET_INTEGRITY:
+			if (pSMBFile == NULL)
+				break;
+			tcon = tlink_tcon(pSMBFile->tlink);
+			if (tcon->ses->server->ops->set_integrity)
+				rc = tcon->ses->server->ops->set_integrity(xid,
+						tcon, pSMBFile);
+			else
+				rc = -EOPNOTSUPP;
 			break;
 		default:
 			cifs_dbg(FYI, "unsupported ioctl\n");
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e6c707cc6..e3548f73b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -626,8 +626,8 @@ cifs_hl_exit:
 	return rc;
 }
 
-void *
-cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
+const char *
+cifs_follow_link(struct dentry *direntry, void **cookie)
 {
 	struct inode *inode = d_inode(direntry);
 	int rc = -ENOMEM;
@@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink)) {
-		rc = PTR_ERR(tlink);
-		tlink = NULL;
-		goto out;
+		free_xid(xid);
+		return ERR_CAST(tlink);
 	}
 	tcon = tlink_tcon(tlink);
 	server = tcon->ses->server;
 
 	full_path = build_path_from_dentry(direntry);
-	if (!full_path)
-		goto out;
+	if (!full_path) {
+		free_xid(xid);
+		cifs_put_tlink(tlink);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
 
@@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 						&target_path, cifs_sb);
 
 	kfree(full_path);
-out:
+	free_xid(xid);
+	cifs_put_tlink(tlink);
 	if (rc != 0) {
 		kfree(target_path);
-		target_path = ERR_PTR(rc);
+		return ERR_PTR(rc);
 	}
-
-	free_xid(xid);
-	if (tlink)
-		cifs_put_tlink(tlink);
-	nd_set_link(nd, target_path);
-	return NULL;
+	return *cookie = target_path;
 }
 
 int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 54daee5ad..df91bcf56 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -806,6 +806,53 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
 			    cfile->fid.volatile_fid, cfile->pid, &eof, false);
 }
 
+#ifdef CONFIG_CIFS_SMB311
+static int
+smb2_duplicate_extents(const unsigned int xid,
+			struct cifsFileInfo *srcfile,
+			struct cifsFileInfo *trgtfile, u64 src_off,
+			u64 len, u64 dest_off)
+{
+	int rc;
+	unsigned int ret_data_len;
+	char *retbuf = NULL;
+	struct duplicate_extents_to_file dup_ext_buf;
+	struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink);
+
+	/* server fileays advertise duplicate extent support with this flag */
+	if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) &
+	     FILE_SUPPORTS_BLOCK_REFCOUNTING) == 0)
+		return -EOPNOTSUPP;
+
+	dup_ext_buf.VolatileFileHandle = srcfile->fid.volatile_fid;
+	dup_ext_buf.PersistentFileHandle = srcfile->fid.persistent_fid;
+	dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off);
+	dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off);
+	dup_ext_buf.ByteCount = cpu_to_le64(len);
+	cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld",
+		src_off, dest_off, len);
+
+	rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
+	if (rc)
+		goto duplicate_extents_out;
+
+	rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+			trgtfile->fid.volatile_fid,
+			FSCTL_DUPLICATE_EXTENTS_TO_FILE,
+			true /* is_fsctl */, (char *)&dup_ext_buf,
+			sizeof(struct duplicate_extents_to_file),
+			(char **)&retbuf,
+			&ret_data_len);
+
+	if (ret_data_len > 0)
+		cifs_dbg(FYI, "non-zero response length in duplicate extents");
+
+duplicate_extents_out:
+	return rc;
+}
+#endif /* CONFIG_CIFS_SMB311 */
+
+
 static int
 smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 		   struct cifsFileInfo *cfile)
@@ -815,6 +862,28 @@ smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 }
 
 static int
+smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifsFileInfo *cfile)
+{
+	struct fsctl_set_integrity_information_req integr_info;
+	char *retbuf = NULL;
+	unsigned int ret_data_len;
+
+	integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED);
+	integr_info.Flags = 0;
+	integr_info.Reserved = 0;
+
+	return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+			cfile->fid.volatile_fid,
+			FSCTL_SET_INTEGRITY_INFORMATION,
+			true /* is_fsctl */, (char *)&integr_info,
+			sizeof(struct fsctl_set_integrity_information_req),
+			(char **)&retbuf,
+			&ret_data_len);
+
+}
+
+static int
 smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		     const char *path, struct cifs_sb_info *cifs_sb,
 		     struct cifs_fid *fid, __u16 search_flags,
@@ -1624,6 +1693,7 @@ struct smb_version_operations smb30_operations = {
 	.new_lease_key = smb2_new_lease_key,
 	.generate_signingkey = generate_smb3signingkey,
 	.calc_signature = smb3_calc_signature,
+	.set_integrity  = smb3_set_integrity,
 	.is_read_op = smb21_is_read_op,
 	.set_oplock_level = smb3_set_oplock_level,
 	.create_lease_buf = smb3_create_lease_buf,
@@ -1635,6 +1705,94 @@ struct smb_version_operations smb30_operations = {
 	.fallocate = smb3_fallocate,
 };
 
+#ifdef CONFIG_CIFS_SMB311
+struct smb_version_operations smb311_operations = {
+	.compare_fids = smb2_compare_fids,
+	.setup_request = smb2_setup_request,
+	.setup_async_request = smb2_setup_async_request,
+	.check_receive = smb2_check_receive,
+	.add_credits = smb2_add_credits,
+	.set_credits = smb2_set_credits,
+	.get_credits_field = smb2_get_credits_field,
+	.get_credits = smb2_get_credits,
+	.wait_mtu_credits = smb2_wait_mtu_credits,
+	.get_next_mid = smb2_get_next_mid,
+	.read_data_offset = smb2_read_data_offset,
+	.read_data_length = smb2_read_data_length,
+	.map_error = map_smb2_to_linux_error,
+	.find_mid = smb2_find_mid,
+	.check_message = smb2_check_message,
+	.dump_detail = smb2_dump_detail,
+	.clear_stats = smb2_clear_stats,
+	.print_stats = smb2_print_stats,
+	.dump_share_caps = smb2_dump_share_caps,
+	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
+	.need_neg = smb2_need_neg,
+	.negotiate = smb2_negotiate,
+	.negotiate_wsize = smb2_negotiate_wsize,
+	.negotiate_rsize = smb2_negotiate_rsize,
+	.sess_setup = SMB2_sess_setup,
+	.logoff = SMB2_logoff,
+	.tree_connect = SMB2_tcon,
+	.tree_disconnect = SMB2_tdis,
+	.qfs_tcon = smb3_qfs_tcon,
+	.is_path_accessible = smb2_is_path_accessible,
+	.can_echo = smb2_can_echo,
+	.echo = SMB2_echo,
+	.query_path_info = smb2_query_path_info,
+	.get_srv_inum = smb2_get_srv_inum,
+	.query_file_info = smb2_query_file_info,
+	.set_path_size = smb2_set_path_size,
+	.set_file_size = smb2_set_file_size,
+	.set_file_info = smb2_set_file_info,
+	.set_compression = smb2_set_compression,
+	.mkdir = smb2_mkdir,
+	.mkdir_setinfo = smb2_mkdir_setinfo,
+	.rmdir = smb2_rmdir,
+	.unlink = smb2_unlink,
+	.rename = smb2_rename_path,
+	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
+	.query_mf_symlink = smb3_query_mf_symlink,
+	.create_mf_symlink = smb3_create_mf_symlink,
+	.open = smb2_open_file,
+	.set_fid = smb2_set_fid,
+	.close = smb2_close_file,
+	.flush = smb2_flush_file,
+	.async_readv = smb2_async_readv,
+	.async_writev = smb2_async_writev,
+	.sync_read = smb2_sync_read,
+	.sync_write = smb2_sync_write,
+	.query_dir_first = smb2_query_dir_first,
+	.query_dir_next = smb2_query_dir_next,
+	.close_dir = smb2_close_dir,
+	.calc_smb_size = smb2_calc_size,
+	.is_status_pending = smb2_is_status_pending,
+	.oplock_response = smb2_oplock_response,
+	.queryfs = smb2_queryfs,
+	.mand_lock = smb2_mand_lock,
+	.mand_unlock_range = smb2_unlock_range,
+	.push_mand_locks = smb2_push_mandatory_locks,
+	.get_lease_key = smb2_get_lease_key,
+	.set_lease_key = smb2_set_lease_key,
+	.new_lease_key = smb2_new_lease_key,
+	.generate_signingkey = generate_smb3signingkey,
+	.calc_signature = smb3_calc_signature,
+	.set_integrity  = smb3_set_integrity,
+	.is_read_op = smb21_is_read_op,
+	.set_oplock_level = smb3_set_oplock_level,
+	.create_lease_buf = smb3_create_lease_buf,
+	.parse_lease_buf = smb3_parse_lease_buf,
+	.clone_range = smb2_clone_range,
+	.duplicate_extents = smb2_duplicate_extents,
+/*	.validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */
+	.wp_retry_size = smb2_wp_retry_size,
+	.dir_needs_close = smb2_dir_needs_close,
+	.fallocate = smb3_fallocate,
+};
+#endif /* CIFS_SMB311 */
+
 struct smb_version_values smb20_values = {
 	.version_string = SMB20_VERSION_STRING,
 	.protocol_id = SMB20_PROT_ID,
@@ -1714,3 +1872,25 @@ struct smb_version_values smb302_values = {
 	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 	.create_lease_size = sizeof(struct create_lease_v2),
 };
+
+#ifdef CONFIG_CIFS_SMB311
+struct smb_version_values smb311_values = {
+	.version_string = SMB311_VERSION_STRING,
+	.protocol_id = SMB311_PROT_ID,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease_v2),
+};
+#endif /* SMB311 */
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 54cbe19d9..b8b4f08ee 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -304,6 +304,59 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
 	return rc;
 }
 
+#ifdef CONFIG_CIFS_SMB311
+/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */
+#define OFFSET_OF_NEG_CONTEXT 0x68  /* sizeof(struct smb2_negotiate_req) - 4 */
+
+
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES	cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES		cpu_to_le16(2)
+
+static void
+build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
+{
+	pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
+	pneg_ctxt->DataLength = cpu_to_le16(38);
+	pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
+	pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+	get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
+	pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
+}
+
+static void
+build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
+{
+	pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
+	pneg_ctxt->DataLength = cpu_to_le16(6);
+	pneg_ctxt->CipherCount = cpu_to_le16(2);
+	pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
+	pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+}
+
+static void
+assemble_neg_contexts(struct smb2_negotiate_req *req)
+{
+
+	/* +4 is to account for the RFC1001 len field */
+	char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4;
+
+	build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
+	/* Add 2 to size to round to 8 byte boundary */
+	pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context);
+	build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
+	req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
+	req->NegotiateContextCount = cpu_to_le16(2);
+	inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2
+			+ sizeof(struct smb2_encryption_neg_context)); /* calculate hash */
+}
+#else
+static void assemble_neg_contexts(struct smb2_negotiate_req *req)
+{
+	return;
+}
+#endif /* SMB311 */
+
+
 /*
  *
  *	SMB2 Worker functions follow:
@@ -363,10 +416,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	/* ClientGUID must be zero for SMB2.02 dialect */
 	if (ses->server->vals->protocol_id == SMB20_PROT_ID)
 		memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE);
-	else
+	else {
 		memcpy(req->ClientGUID, server->client_guid,
 			SMB2_CLIENT_GUID_SIZE);
-
+		if (ses->server->vals->protocol_id == SMB311_PROT_ID)
+			assemble_neg_contexts(req);
+	}
 	iov[0].iov_base = (char *)req;
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
@@ -393,8 +448,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 		cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
 	else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
 		cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
+#ifdef CONFIG_CIFS_SMB311
+	else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n");
+#endif /* SMB311 */
 	else {
-		cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
+		cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n",
 			 le16_to_cpu(rsp->DialectRevision));
 		rc = -EIO;
 		goto neg_exit;
@@ -572,7 +631,7 @@ ssetup_ntlmssp_authenticate:
 		return rc;
 
 	req->hdr.SessionId = 0; /* First session, not a reauthenticate */
-	req->VcNumber = 0; /* MBZ */
+	req->Flags = 0; /* MBZ */
 	/* to enable echos and oplocks */
 	req->hdr.CreditRequest = cpu_to_le16(3);
 
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 70867d54f..451108284 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -136,9 +136,6 @@ struct smb2_transform_hdr {
 	__u64  SessionId;
 } __packed;
 
-/* Encryption Algorithms */
-#define SMB2_ENCRYPTION_AES128_CCM	cpu_to_le16(0x0001)
-
 /*
  *	SMB2 flag definitions
  */
@@ -191,7 +188,10 @@ struct smb2_negotiate_req {
 	__le16 Reserved;	/* MBZ */
 	__le32 Capabilities;
 	__u8   ClientGUID[SMB2_CLIENT_GUID_SIZE];
-	__le64 ClientStartTime;	/* MBZ */
+	/* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
+	__le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+	__le16 NegotiateContextCount;  /* SMB3.1.1 only. MBZ earlier */
+	__le16 Reserved2;
 	__le16 Dialects[1]; /* One dialect (vers=) at a time for now */
 } __packed;
 
@@ -200,6 +200,7 @@ struct smb2_negotiate_req {
 #define SMB21_PROT_ID 0x0210
 #define SMB30_PROT_ID 0x0300
 #define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
 #define BAD_PROT_ID   0xFFFF
 
 /* SecurityMode flags */
@@ -217,12 +218,38 @@ struct smb2_negotiate_req {
 #define SMB2_NT_FIND			0x00100000
 #define SMB2_LARGE_FILES		0x00200000
 
+#define SMB311_SALT_SIZE			32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512	cpu_to_le16(0x0001)
+
+struct smb2_preauth_neg_context {
+	__le16	ContextType; /* 1 */
+	__le16	DataLength;
+	__le32	Reserved;
+	__le16	HashAlgorithmCount; /* 1 */
+	__le16	SaltLength;
+	__le16	HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+	__u8	Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM	cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM	cpu_to_le16(0x0002)
+
+struct smb2_encryption_neg_context {
+	__le16	ContextType; /* 2 */
+	__le16	DataLength;
+	__le32	Reserved;
+	__le16	CipherCount; /* AES-128-GCM and AES-128-CCM */
+	__le16	Ciphers[2]; /* Ciphers[0] since only one used now */
+} __packed;
+
 struct smb2_negotiate_rsp {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 65 */
 	__le16 SecurityMode;
 	__le16 DialectRevision;
-	__le16 Reserved;	/* MBZ */
+	__le16 NegotiateContextCount;	/* Prior to SMB3.1.1 was Reserved & MBZ */
 	__u8   ServerGUID[16];
 	__le32 Capabilities;
 	__le32 MaxTransactSize;
@@ -232,14 +259,18 @@ struct smb2_negotiate_rsp {
 	__le64 ServerStartTime;
 	__le16 SecurityBufferOffset;
 	__le16 SecurityBufferLength;
-	__le32 Reserved2;	/* may be any value, ignore */
+	__le32 NegotiateContextOffset;	/* Pre:SMB3.1.1 was reserved/ignored */
 	__u8   Buffer[1];	/* variable length GSS security buffer */
 } __packed;
 
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING		0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA	0x04
+
 struct smb2_sess_setup_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize; /* Must be 25 */
-	__u8   VcNumber;
+	__u8   Flags;
 	__u8   SecurityMode;
 	__le32 Capabilities;
 	__le32 Channel;
@@ -274,10 +305,13 @@ struct smb2_logoff_rsp {
 	__le16 Reserved;
 } __packed;
 
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_SHAREFLAG_CLUSTER_RECONNECT	0x0001
+
 struct smb2_tree_connect_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 9 */
-	__le16 Reserved;
+	__le16 Reserved; /* Flags in SMB3.1.1 */
 	__le16 PathOffset;
 	__le16 PathLength;
 	__u8   Buffer[1];	/* variable length */
@@ -587,6 +621,29 @@ struct copychunk_ioctl_rsp {
 	__le32 TotalBytesWritten;
 } __packed;
 
+struct fsctl_set_integrity_information_req {
+	__le16	ChecksumAlgorithm;
+	__le16	Reserved;
+	__le32	Flags;
+} __packed;
+
+struct fsctl_get_integrity_information_rsp {
+	__le16	ChecksumAlgorithm;
+	__le16	Reserved;
+	__le32	Flags;
+	__le32	ChecksumChunkSizeInBytes;
+	__le32	ClusterSizeInBytes;
+} __packed;
+
+/* Integrity ChecksumAlgorithm choices for above */
+#define	CHECKSUM_TYPE_NONE	0x0000
+#define	CHECKSUM_TYPE_CRC64	0x0002
+#define CHECKSUM_TYPE_UNCHANGED	0xFFFF	/* set only */
+
+/* Integrity flags for above */
+#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF	0x00000001
+
+
 struct validate_negotiate_info_req {
 	__le32 Capabilities;
 	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
@@ -620,6 +677,14 @@ struct compress_ioctl {
 	__le16 CompressionState; /* See cifspdu.h for possible flag values */
 } __packed;
 
+struct duplicate_extents_to_file {
+	__u64 PersistentFileHandle; /* source file handle, opaque endianness */
+	__u64 VolatileFileHandle;
+	__le64 SourceFileOffset;
+	__le64 TargetFileOffset;
+	__le64 ByteCount;  /* Bytes to be copied */
+} __packed;
+
 struct smb2_ioctl_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 57 */
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 83efa5953..a639d0dab 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -75,10 +75,13 @@
 #define FSCTL_QUERY_SPARING_INFO     0x00090138 /* BB add struct */
 #define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
 #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
 #define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
 #define FSCTL_FILE_LEVEL_TRIM        0x00098208 /* BB add struct */
+#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
 #define FSCTL_SIS_LINK_FILES         0x0009C104
+#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280
 #define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
 #define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
 /* strange that the number for this op is not sequential with previous op */
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index d6f7a76a1..f829fe963 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -79,7 +79,7 @@ void coda_sysctl_clean(void);
 
 static inline struct coda_inode_info *ITOC(struct inode *inode)
 {
-	return list_entry(inode, struct coda_inode_info, vfs_inode);
+	return container_of(inode, struct coda_inode_info, vfs_inode);
 }
 
 static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6b8e2f091..48851f6ea 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -896,6 +896,7 @@ COMPATIBLE_IOCTL(FIGETBSZ)
 /* 'X' - originally XFS but some now in the VFS */
 COMPATIBLE_IOCTL(FIFREEZE)
 COMPATIBLE_IOCTL(FITHAW)
+COMPATIBLE_IOCTL(FITRIM)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
 COMPATIBLE_IOCTL(KDGKBTYPE)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 8d89f5fd0..eae87575e 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 
 	if (dentry) {
 		spin_lock(&dentry->d_lock);
-		if (!d_unhashed(dentry) && d_really_is_positive(dentry)) {
+		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e65f9ffbb..b863a09cd 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
  *	config_item_init - initialize item.
  *	@item:	item in question.
  */
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
 {
 	kref_init(&item->ci_kref);
 	INIT_LIST_HEAD(&item->ci_entry);
 }
-EXPORT_SYMBOL(config_item_init);
 
 /**
  *	config_item_set_name - Set the name of an item
@@ -116,7 +115,7 @@ void config_item_init_type_name(struct config_item *item,
 				const char *name,
 				struct config_item_type *type)
 {
-	config_item_set_name(item, name);
+	config_item_set_name(item, "%s", name);
 	item->ci_type = type;
 	config_item_init(item);
 }
@@ -125,7 +124,7 @@ EXPORT_SYMBOL(config_item_init_type_name);
 void config_group_init_type_name(struct config_group *group, const char *name,
 			 struct config_item_type *type)
 {
-	config_item_set_name(&group->cg_item, name);
+	config_item_set_name(&group->cg_item, "%s", name);
 	group->cg_item.ci_type = type;
 	config_group_init(group);
 }
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index cc9f2546e..ec5c8325b 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,36 +279,27 @@ static int configfs_getlink(struct dentry *dentry, char * path)
 
 }
 
-static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
 {
-	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	int error;
 
-	if (page) {
-		error = configfs_getlink(dentry, (char *)page);
-		if (!error) {
-			nd_set_link(nd, (char *)page);
-			return (void *)page;
-		}
-	}
-
-	nd_set_link(nd, ERR_PTR(error));
-	return NULL;
-}
+	if (!page)
+		return ERR_PTR(-ENOMEM);
 
-static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			      void *cookie)
-{
-	if (cookie) {
-		unsigned long page = (unsigned long)cookie;
-		free_page(page);
+	error = configfs_getlink(dentry, (char *)page);
+	if (!error) {
+		return *cookie = (void *)page;
 	}
+
+	free_page(page);
+	return ERR_PTR(error);
 }
 
 const struct inode_operations configfs_symlink_inode_operations = {
 	.follow_link = configfs_follow_link,
 	.readlink = generic_readlink,
-	.put_link = configfs_put_link,
+	.put_link = free_page_put_link,
 	.setattr = configfs_setattr,
 };
 
diff --git a/fs/coredump.c b/fs/coredump.c
index bbbe139ab..c5ecde6f3 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -70,7 +70,8 @@ static int expand_corename(struct core_name *cn, int size)
 	return 0;
 }
 
-static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
+				     va_list arg)
 {
 	int free, need;
 	va_list arg_copy;
@@ -93,7 +94,7 @@ again:
 	return -ENOMEM;
 }
 
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
 {
 	va_list arg;
 	int ret;
@@ -105,7 +106,8 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...)
 	return ret;
 }
 
-static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3)
+int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
 {
 	int cur = cn->used;
 	va_list arg;
@@ -138,7 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
 		goto put_exe_file;
 	}
 
-	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+	path = file_path(exe_file, pathbuf, PATH_MAX);
 	if (IS_ERR(path)) {
 		ret = PTR_ERR(path);
 		goto free_buf;
@@ -209,11 +211,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 				break;
 			/* uid */
 			case 'u':
-				err = cn_printf(cn, "%d", cred->uid);
+				err = cn_printf(cn, "%u",
+						from_kuid(&init_user_ns,
+							  cred->uid));
 				break;
 			/* gid */
 			case 'g':
-				err = cn_printf(cn, "%d", cred->gid);
+				err = cn_printf(cn, "%u",
+						from_kgid(&init_user_ns,
+							  cred->gid));
 				break;
 			case 'd':
 				err = cn_printf(cn, "%d",
@@ -221,7 +227,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 				break;
 			/* signal that caused the coredump */
 			case 's':
-				err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
+				err = cn_printf(cn, "%d",
+						cprm->siginfo->si_signo);
 				break;
 			/* UNIX time of coredump */
 			case 't': {
diff --git a/fs/dax.c b/fs/dax.c
index 6f65f00e5..a7f77e1fa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -155,7 +155,7 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 		}
 
 		if (iov_iter_rw(iter) == WRITE)
-			len = copy_from_iter(addr, max - pos, iter);
+			len = copy_from_iter_nocache(addr, max - pos, iter);
 		else if (!hole)
 			len = copy_to_iter(addr, max - pos, iter);
 		else
@@ -209,7 +209,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	}
 
 	/* Protects against truncate */
-	inode_dio_begin(inode);
+	if (!(flags & DIO_SKIP_DIO_COUNT))
+		inode_dio_begin(inode);
 
 	retval = dax_io(inode, iter, pos, end, get_block, &bh);
 
@@ -219,7 +220,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	if ((retval > 0) && end_io)
 		end_io(iocb, pos, retval, bh.b_private);
 
-	inode_dio_end(inode);
+	if (!(flags & DIO_SKIP_DIO_COUNT))
+		inode_dio_end(inode);
  out:
 	return retval;
 }
@@ -309,14 +311,27 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  out:
 	i_mmap_unlock_read(mapping);
 
-	if (bh->b_end_io)
-		bh->b_end_io(bh, 1);
-
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @complete_unwritten: The filesystem method used to convert unwritten blocks
+ *	to written so the data written to them is exposed. This is required for
+ *	required by write faults for filesystems that will return unwritten
+ *	extent mappings from @get_block, but it is optional for reads as
+ *	dax_insert_mapping() will always zero unwritten blocks. If the fs does
+ *	not support unwritten extents, the it should pass NULL.
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -417,7 +432,23 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		page_cache_release(page);
 	}
 
+	/*
+	 * If we successfully insert the new mapping over an unwritten extent,
+	 * we need to ensure we convert the unwritten extent. If there is an
+	 * error inserting the mapping, the filesystem needs to leave it as
+	 * unwritten to prevent exposure of the stale underlying data to
+	 * userspace, but we still need to call the completion function so
+	 * the private resources on the mapping buffer can be released. We
+	 * indicate what the callback should do via the uptodate variable, same
+	 * as for normal BH based IO completions.
+	 */
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	if (buffer_unwritten(&bh)) {
+		if (complete_unwritten)
+			complete_unwritten(&bh, !error);
+		else
+			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+	}
 
  out:
 	if (error == -ENOMEM)
@@ -434,6 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -445,7 +477,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+	      get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +486,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 7c39f19bd..9b5fe503f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -322,17 +322,17 @@ static void dentry_free(struct dentry *dentry)
 }
 
 /**
- * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
  * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
  */
-static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
 {
-	assert_spin_locked(&dentry->d_lock);
-	/* Go through a barrier */
-	write_seqcount_barrier(&dentry->d_seq);
+	lockdep_assert_held(&dentry->d_lock);
+	/* Go through am invalidation barrier */
+	write_seqcount_invalidate(&dentry->d_seq);
 }
 
 /*
@@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
 	struct inode *inode = dentry->d_inode;
 	__d_clear_type_and_inode(dentry);
 	hlist_del_init(&dentry->d_u.d_alias);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&inode->i_lock);
 	if (!inode->i_nlink)
@@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry)
 		__hlist_bl_del(&dentry->d_hash);
 		dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
-		dentry_rcuwalk_barrier(dentry);
+		dentry_rcuwalk_invalidate(dentry);
 	}
 }
 EXPORT_SYMBOL(__d_drop);
@@ -1167,7 +1167,7 @@ enum d_walk_ret {
  *
  * The @enter() and @finish() callbacks are called with d_lock held.
  */
-void d_walk(struct dentry *parent, void *data,
+static void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *),
 		   void (*finish)(void *))
 {
@@ -1272,7 +1272,6 @@ rename_retry:
 	seq = 1;
 	goto again;
 }
-EXPORT_SYMBOL(d_walk);
 
 /*
  * Search for at least 1 mount point in the dentry's subdirs.
@@ -1677,7 +1676,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 				DCACHE_OP_COMPARE	|
 				DCACHE_OP_REVALIDATE	|
 				DCACHE_OP_WEAK_REVALIDATE	|
-				DCACHE_OP_DELETE ));
+				DCACHE_OP_DELETE	|
+				DCACHE_OP_SELECT_INODE));
 	dentry->d_op = op;
 	if (!op)
 		return;
@@ -1693,6 +1693,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 		dentry->d_flags |= DCACHE_OP_DELETE;
 	if (op->d_prune)
 		dentry->d_flags |= DCACHE_OP_PRUNE;
+	if (op->d_select_inode)
+		dentry->d_flags |= DCACHE_OP_SELECT_INODE;
 
 }
 EXPORT_SYMBOL(d_set_d_op);
@@ -1756,7 +1758,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	if (inode)
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	__d_set_inode_and_type(dentry, inode, add_flags);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	fsnotify_d_instantiate(dentry, inode);
 }
@@ -3440,22 +3442,15 @@ void __init vfs_caches_init_early(void)
 	inode_init_early();
 }
 
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
 {
-	unsigned long reserve;
-
-	/* Base hash sizes on available memory, with a reserve equal to
-           150% of current kernel size */
-
-	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
-	mempages -= reserve;
-
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init();
 	inode_init();
-	files_init(mempages);
+	files_init();
+	files_maxfiles_init();
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 830a7e76f..284f9aa00 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
-#include <linux/namei.h>
 #include <linux/debugfs.h>
 #include <linux/io.h>
 #include <linux/slab.h>
@@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = {
 	.llseek =	noop_llseek,
 };
 
-static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	nd_set_link(nd, d_inode(dentry)->i_private);
-	return NULL;
-}
-
-const struct inode_operations debugfs_link_operations = {
-	.readlink       = generic_readlink,
-	.follow_link    = debugfs_follow_link,
-};
-
 static int debugfs_u8_set(void *data, u64 val)
 {
 	*(u8 *)data = val;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 12756040c..c711be8d6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
 	return inode;
 }
 
-static inline int debugfs_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 struct debugfs_mount_opts {
 	kuid_t uid;
 	kgid_t gid;
@@ -174,7 +169,7 @@ static void debugfs_evict_inode(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	if (S_ISLNK(inode->i_mode))
-		kfree(inode->i_private);
+		kfree(inode->i_link);
 }
 
 static const struct super_operations debugfs_super_operations = {
@@ -511,8 +506,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 		return failed_creating(dentry);
 	}
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
-	inode->i_op = &debugfs_link_operations;
-	inode->i_private = link;
+	inode->i_op = &simple_symlink_inode_operations;
+	inode->i_link = link;
 	d_instantiate(dentry, inode);
 	return end_creating(dentry);
 }
@@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
-	if (debugfs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		dget(dentry);
 		if (d_is_dir(dentry))
 			ret = simple_rmdir(d_inode(parent), dentry);
@@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	 */
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!debugfs_positive(child))
+		if (!simple_positive(child))
 			continue;
 
 		/* perhaps simple_empty(child) makes more sense */
@@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 		 * from d_subdirs. When releasing the parent->d_lock we can
 		 * no longer trust that the next pointer is valid.
 		 * Restart the loop. We'll skip this one with the
-		 * debugfs_positive() check.
+		 * simple_positive() check.
 		 */
 		goto loop;
 	}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index add566303..c35ffdc12 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
 		return inode->i_sb;
 #endif
+	if (!devpts_mnt)
+		return NULL;
 	return devpts_mnt->mnt_sb;
 }
 
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
 	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_fs_info *fsi;
 	int index;
 	int ida_ret;
 
+	if (!sb)
+		return -ENODEV;
+
+	fsi = DEVPTS_SB(sb);
 retry:
 	if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
 		return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
 	struct dentry *dentry;
 	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
 	struct inode *inode;
-	struct dentry *root = sb->s_root;
-	struct pts_fs_info *fsi = DEVPTS_SB(sb);
-	struct pts_mount_opts *opts = &fsi->mount_opts;
+	struct dentry *root;
+	struct pts_fs_info *fsi;
+	struct pts_mount_opts *opts;
 	char s[12];
 
+	if (!sb)
+		return ERR_PTR(-ENODEV);
+
+	root = sb->s_root;
+	fsi = DEVPTS_SB(sb);
+	opts = &fsi->mount_opts;
+
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -676,12 +689,16 @@ static int __init init_devpts_fs(void)
 	struct ctl_table_header *table;
 
 	if (!err) {
+		struct vfsmount *mnt;
+
 		table = register_sysctl_table(pty_root_table);
-		devpts_mnt = kern_mount(&devpts_fs_type);
-		if (IS_ERR(devpts_mnt)) {
-			err = PTR_ERR(devpts_mnt);
+		mnt = kern_mount(&devpts_fs_type);
+		if (IS_ERR(mnt)) {
+			err = PTR_ERR(mnt);
 			unregister_filesystem(&devpts_fs_type);
 			unregister_sysctl_table(table);
+		} else {
+			devpts_mnt = mnt;
 		}
 	}
 	return err;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d08e079ea..754fd6c0b 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con)
 	mutex_unlock(&connections_lock);
 
 	memset(&peeraddr, 0, sizeof(peeraddr));
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
-				  IPPROTO_TCP, &newsock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_STREAM, IPPROTO_TCP, &newsock);
 	if (result < 0)
 		return -ENOMEM;
 
@@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out;
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
-				  IPPROTO_TCP, &sock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (result < 0)
 		goto out_err;
 
@@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		addr_len = sizeof(struct sockaddr_in6);
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
-				  IPPROTO_TCP, &sock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (result < 0) {
 		log_print("Can't create listening comms socket");
 		goto create_out;
@@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void)
 
 	log_print("Using SCTP for communications");
 
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
-				  IPPROTO_SCTP, &sock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_SEQPACKET, IPPROTO_SCTP, &sock);
 	if (result < 0) {
 		log_print("Can't create comms socket, check SCTP is loaded");
 		goto out;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 8289cb195..5718cb9f7 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,7 +8,6 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
-#include <linux/export.h>
 #include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
@@ -38,12 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-/* For TuxOnIce */
-void drop_pagecache(void)
-{
-	iterate_supers(drop_pagecache_sb, NULL);
-}
-
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 72afcc629..feef8a9c4 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -325,7 +325,6 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return rc;
 
 	switch (cmd) {
-	case FITRIM:
 	case FS_IOC32_GETFLAGS:
 	case FS_IOC32_SETFLAGS:
 	case FS_IOC32_GETVERSION:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index fc850b55d..3c4db1172 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -170,7 +170,6 @@ out_unlock:
  * @directory_inode: inode of the new file's dentry's parent in ecryptfs
  * @ecryptfs_dentry: New file's dentry in ecryptfs
  * @mode: The mode of the new file
- * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
  *
  * Creates the underlying file and the eCryptfs inode which will link to
  * it. It will also update the eCryptfs directory inode to mimic the
@@ -384,7 +383,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
  * ecryptfs_lookup
  * @ecryptfs_dir_inode: The eCryptfs directory inode
  * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @ecryptfs_nd: nameidata; may be NULL
+ * @flags: lookup flags
  *
  * Find a file on disk. If the file does not exist, then we'll add it to the
  * dentry cache and continue on to read it from the disk.
@@ -675,18 +674,16 @@ out:
 	return rc ? ERR_PTR(rc) : buf;
 }
 
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	size_t len;
 	char *buf = ecryptfs_readlink_lower(dentry, &len);
 	if (IS_ERR(buf))
-		goto out;
+		return buf;
 	fsstack_copy_attr_atime(d_inode(dentry),
 				d_inode(ecryptfs_dentry_to_lower(dentry)));
 	buf[len] = '\0';
-out:
-	nd_set_link(nd, buf);
-	return NULL;
+	return *cookie = buf;
 }
 
 /**
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 7fca462ea..c8411a30f 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep;
 static struct inode *efs_alloc_inode(struct super_block *sb)
 {
 	struct efs_inode_info *ei;
-	ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+	ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index b47c7b8dc..a364fd096 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -16,5 +16,5 @@
 libore-y := ore.o ore_raid.o
 obj-$(CONFIG_ORE) += libore.o
 
-exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
+exofs-y := inode.o file.o namei.o dir.o super.o sys.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4deb0b05b..e5bb2abf7 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-/* Accesses dir's inode->i_size must be called under inode lock */
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
-
 static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	loff_t last_byte = inode->i_size;
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index ad9cac670..2e86086bc 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -207,10 +207,6 @@ extern const struct address_space_operations exofs_aops;
 extern const struct inode_operations exofs_dir_inode_operations;
 extern const struct inode_operations exofs_special_inode_operations;
 
-/* symlink.c         */
-extern const struct inode_operations exofs_symlink_inode_operations;
-extern const struct inode_operations exofs_fast_symlink_inode_operations;
-
 /* exofs_init_comps will initialize an ore_components device array
  * pointing to a single ore_comp struct, and a round-robin view
  * of the device table.
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 786e4cc8c..73c64daa0 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1222,10 +1222,11 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_fop = &exofs_dir_operations;
 		inode->i_mapping->a_ops = &exofs_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (exofs_inode_is_fast_symlink(inode))
-			inode->i_op = &exofs_fast_symlink_inode_operations;
-		else {
-			inode->i_op = &exofs_symlink_inode_operations;
+		if (exofs_inode_is_fast_symlink(inode)) {
+			inode->i_op = &simple_symlink_inode_operations;
+			inode->i_link = (char *)oi->i_data;
+		} else {
+			inode->i_op = &page_symlink_inode_operations;
 			inode->i_mapping->a_ops = &exofs_aops;
 		}
 	} else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 5ae25e431..09a6bb1ad 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -113,7 +113,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
 	oi = exofs_i(inode);
 	if (l > sizeof(oi->i_data)) {
 		/* slow symlink */
-		inode->i_op = &exofs_symlink_inode_operations;
+		inode->i_op = &page_symlink_inode_operations;
 		inode->i_mapping->a_ops = &exofs_aops;
 		memset(oi->i_data, 0, sizeof(oi->i_data));
 
@@ -122,7 +122,8 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
 			goto out_fail;
 	} else {
 		/* fast symlink */
-		inode->i_op = &exofs_fast_symlink_inode_operations;
+		inode->i_op = &simple_symlink_inode_operations;
+		inode->i_link = (char *)oi->i_data;
 		memcpy(oi->i_data, symname, l);
 		inode->i_size = l-1;
 	}
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
deleted file mode 100644
index 6f6f3a4c1..000000000
--- a/fs/exofs/symlink.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/namei.h>
-
-#include "exofs.h"
-
-static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct exofs_i_info *oi = exofs_i(d_inode(dentry));
-
-	nd_set_link(nd, (char *)oi->i_data);
-	return NULL;
-}
-
-const struct inode_operations exofs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
-};
-
-const struct inode_operations exofs_fast_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= exofs_follow_link,
-};
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 796b491e6..0c6638b40 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3a0a6c640..3b57c9f83 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,12 +28,12 @@
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext2_get_block);
+	return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext2_get_block);
+	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index f460ae36d..5c09776d3 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1403,6 +1403,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 			inode->i_mapping->a_ops = &ext2_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext2_inode_is_fast_symlink(inode)) {
+			inode->i_link = (char *)ei->i_data;
 			inode->i_op = &ext2_fast_symlink_inode_operations;
 			nd_terminate_link(ei->i_data, inode->i_size,
 				sizeof(ei->i_data) - 1);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 3e074a9cc..13ec54a99 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -189,7 +189,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
 	} else {
 		/* fast symlink */
 		inode->i_op = &ext2_fast_symlink_inode_operations;
-		memcpy((char*)(EXT2_I(inode)->i_data),symname,l);
+		inode->i_link = (char*)EXT2_I(inode)->i_data;
+		memcpy(inode->i_link, symname, l);
 		inode->i_size = l-1;
 	}
 	mark_inode_dirty(inode);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d0e746e96..900e19cf9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
 		 MS_POSIXACL : 0);
+	sb->s_iflags |= SB_I_CGROUPWB;
 
 	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
 	    (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 20608f17c..ae17179f3 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -19,14 +19,6 @@
 
 #include "ext2.h"
 #include "xattr.h"
-#include <linux/namei.h>
-
-static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
-	nd_set_link(nd, (char *)ei->i_data);
-	return NULL;
-}
 
 const struct inode_operations ext2_symlink_inode_operations = {
 	.readlink	= generic_readlink,
@@ -43,7 +35,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
  
 const struct inode_operations ext2_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= ext2_follow_link,
+	.follow_link	= simple_follow_link,
 	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2ee2dc435..6c7e5468a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2999,6 +2999,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &ext3_fast_symlink_inode_operations;
 			nd_terminate_link(ei->i_data, inode->i_size,
 				sizeof(ei->i_data) - 1);
+			inode->i_link = (char *)ei->i_data;
 		} else {
 			inode->i_op = &ext3_symlink_inode_operations;
 			ext3_set_aops(inode);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 4264b9bd0..c9e767cd4 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2308,7 +2308,8 @@ retry:
 		}
 	} else {
 		inode->i_op = &ext3_fast_symlink_inode_operations;
-		memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+		inode->i_link = (char*)&EXT3_I(inode)->i_data;
+		memcpy(inode->i_link, symname, l);
 		inode->i_size = l-1;
 	}
 	EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a9312f0a5..5ed0044fb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1908,7 +1908,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_mount_state = le16_to_cpu(es->s_state);
 	sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
 	sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
-	for (i=0; i < 4; i++)
+	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
 	i = le32_to_cpu(es->s_flags);
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ea96df3c5..c08c59094 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -17,17 +17,9 @@
  *  ext3 symlink handling code
  */
 
-#include <linux/namei.h>
 #include "ext3.h"
 #include "xattr.h"
 
-static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct ext3_inode_info *ei = EXT3_I(d_inode(dentry));
-	nd_set_link(nd, (char*)ei->i_data);
-	return NULL;
-}
-
 const struct inode_operations ext3_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
@@ -43,7 +35,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 
 const struct inode_operations ext3_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= ext3_follow_link,
+	.follow_link	= simple_follow_link,
 	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 024f2284d..bf8bc8aba 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -72,6 +72,7 @@ config EXT4_ENCRYPTION
 	select CRYPTO_ECB
 	select CRYPTO_XTS
 	select CRYPTO_CTS
+	select CRYPTO_CTR
 	select CRYPTO_SHA256
 	select KEYS
 	select ENCRYPTED_KEYS
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 955bf49a7..cd6ea29be 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -369,7 +369,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
 	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (buffer_verified(bh))
+	if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 		return;
 
 	ext4_lock_group(sb, block_group);
@@ -446,7 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		if (err)
 			ext4_error(sb, "Checksum bad for grp %u", block_group);
-		return bh;
+		goto verify;
 	}
 	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 8ff15273a..457315581 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -55,6 +55,9 @@ static mempool_t *ext4_bounce_page_pool;
 static LIST_HEAD(ext4_free_crypto_ctxs);
 static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
 
+static struct kmem_cache *ext4_crypto_ctx_cachep;
+struct kmem_cache *ext4_crypt_info_cachep;
+
 /**
  * ext4_release_crypto_ctx() - Releases an encryption context
  * @ctx: The encryption context to release.
@@ -68,18 +71,12 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
 {
 	unsigned long flags;
 
-	if (ctx->bounce_page) {
-		if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL)
-			__free_page(ctx->bounce_page);
-		else
-			mempool_free(ctx->bounce_page, ext4_bounce_page_pool);
-		ctx->bounce_page = NULL;
-	}
-	ctx->control_page = NULL;
+	if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page)
+		mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool);
+	ctx->w.bounce_page = NULL;
+	ctx->w.control_page = NULL;
 	if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
-		if (ctx->tfm)
-			crypto_free_tfm(ctx->tfm);
-		kfree(ctx);
+		kmem_cache_free(ext4_crypto_ctx_cachep, ctx);
 	} else {
 		spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
 		list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
@@ -88,23 +85,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
 }
 
 /**
- * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context
- * @mask: The allocation mask.
- *
- * Return: An allocated and initialized encryption context on success. An error
- * value or NULL otherwise.
- */
-static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask)
-{
-	struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx),
-					      mask);
-
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
-	return ctx;
-}
-
-/**
  * ext4_get_crypto_ctx() - Gets an encryption context
  * @inode:       The inode for which we are doing the crypto
  *
@@ -118,10 +98,10 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
 	struct ext4_crypto_ctx *ctx = NULL;
 	int res = 0;
 	unsigned long flags;
-	struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key;
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
 
-	if (!ext4_read_workqueue)
-		ext4_init_crypto();
+	if (ci == NULL)
+		return ERR_PTR(-ENOKEY);
 
 	/*
 	 * We first try getting the ctx from a free list because in
@@ -140,50 +120,16 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
 		list_del(&ctx->free_list);
 	spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
 	if (!ctx) {
-		ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS);
-		if (IS_ERR(ctx)) {
-			res = PTR_ERR(ctx);
+		ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
+		if (!ctx) {
+			res = -ENOMEM;
 			goto out;
 		}
 		ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
 	} else {
 		ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
 	}
-
-	/* Allocate a new Crypto API context if we don't already have
-	 * one or if it isn't the right mode. */
-	BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID);
-	if (ctx->tfm && (ctx->mode != key->mode)) {
-		crypto_free_tfm(ctx->tfm);
-		ctx->tfm = NULL;
-		ctx->mode = EXT4_ENCRYPTION_MODE_INVALID;
-	}
-	if (!ctx->tfm) {
-		switch (key->mode) {
-		case EXT4_ENCRYPTION_MODE_AES_256_XTS:
-			ctx->tfm = crypto_ablkcipher_tfm(
-				crypto_alloc_ablkcipher("xts(aes)", 0, 0));
-			break;
-		case EXT4_ENCRYPTION_MODE_AES_256_GCM:
-			/* TODO(mhalcrow): AEAD w/ gcm(aes);
-			 * crypto_aead_setauthsize() */
-			ctx->tfm = ERR_PTR(-ENOTSUPP);
-			break;
-		default:
-			BUG();
-		}
-		if (IS_ERR_OR_NULL(ctx->tfm)) {
-			res = PTR_ERR(ctx->tfm);
-			ctx->tfm = NULL;
-			goto out;
-		}
-		ctx->mode = key->mode;
-	}
-	BUG_ON(key->size != ext4_encryption_key_size(key->mode));
-
-	/* There shouldn't be a bounce page attached to the crypto
-	 * context at this point. */
-	BUG_ON(ctx->bounce_page);
+	ctx->flags &= ~EXT4_WRITE_PATH_FL;
 
 out:
 	if (res) {
@@ -204,20 +150,8 @@ void ext4_exit_crypto(void)
 {
 	struct ext4_crypto_ctx *pos, *n;
 
-	list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) {
-		if (pos->bounce_page) {
-			if (pos->flags &
-			    EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) {
-				__free_page(pos->bounce_page);
-			} else {
-				mempool_free(pos->bounce_page,
-					     ext4_bounce_page_pool);
-			}
-		}
-		if (pos->tfm)
-			crypto_free_tfm(pos->tfm);
-		kfree(pos);
-	}
+	list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list)
+		kmem_cache_free(ext4_crypto_ctx_cachep, pos);
 	INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
 	if (ext4_bounce_page_pool)
 		mempool_destroy(ext4_bounce_page_pool);
@@ -225,6 +159,12 @@ void ext4_exit_crypto(void)
 	if (ext4_read_workqueue)
 		destroy_workqueue(ext4_read_workqueue);
 	ext4_read_workqueue = NULL;
+	if (ext4_crypto_ctx_cachep)
+		kmem_cache_destroy(ext4_crypto_ctx_cachep);
+	ext4_crypto_ctx_cachep = NULL;
+	if (ext4_crypt_info_cachep)
+		kmem_cache_destroy(ext4_crypt_info_cachep);
+	ext4_crypt_info_cachep = NULL;
 }
 
 /**
@@ -237,23 +177,31 @@ void ext4_exit_crypto(void)
  */
 int ext4_init_crypto(void)
 {
-	int i, res;
+	int i, res = -ENOMEM;
 
 	mutex_lock(&crypto_init);
 	if (ext4_read_workqueue)
 		goto already_initialized;
 	ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
-	if (!ext4_read_workqueue) {
-		res = -ENOMEM;
+	if (!ext4_read_workqueue)
+		goto fail;
+
+	ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx,
+					    SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_crypto_ctx_cachep)
+		goto fail;
+
+	ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info,
+					    SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_crypt_info_cachep)
 		goto fail;
-	}
 
 	for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
 		struct ext4_crypto_ctx *ctx;
 
-		ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL);
-		if (IS_ERR(ctx)) {
-			res = PTR_ERR(ctx);
+		ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
+		if (!ctx) {
+			res = -ENOMEM;
 			goto fail;
 		}
 		list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
@@ -317,32 +265,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
 	struct ablkcipher_request *req = NULL;
 	DECLARE_EXT4_COMPLETION_RESULT(ecr);
 	struct scatterlist dst, src;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm);
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
 	int res = 0;
 
-	BUG_ON(!ctx->tfm);
-	BUG_ON(ctx->mode != ei->i_encryption_key.mode);
-
-	if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) {
-		printk_ratelimited(KERN_ERR
-				   "%s: unsupported crypto algorithm: %d\n",
-				   __func__, ctx->mode);
-		return -ENOTSUPP;
-	}
-
-	crypto_ablkcipher_clear_flags(atfm, ~0);
-	crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
-
-	res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw,
-				       ei->i_encryption_key.size);
-	if (res) {
-		printk_ratelimited(KERN_ERR
-				   "%s: crypto_ablkcipher_setkey() failed\n",
-				   __func__);
-		return res;
-	}
-	req = ablkcipher_request_alloc(atfm, GFP_NOFS);
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
 	if (!req) {
 		printk_ratelimited(KERN_ERR
 				   "%s: crypto_request_alloc() failed\n",
@@ -384,6 +311,15 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
 	return 0;
 }
 
+static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
+{
+	ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT);
+	if (ctx->w.bounce_page == NULL)
+		return ERR_PTR(-ENOMEM);
+	ctx->flags |= EXT4_WRITE_PATH_FL;
+	return ctx->w.bounce_page;
+}
+
 /**
  * ext4_encrypt() - Encrypts a page
  * @inode:          The inode for which the encryption should take place
@@ -413,27 +349,17 @@ struct page *ext4_encrypt(struct inode *inode,
 		return (struct page *) ctx;
 
 	/* The encryption operation will require a bounce page. */
-	ciphertext_page = alloc_page(GFP_NOFS);
-	if (!ciphertext_page) {
-		/* This is a potential bottleneck, but at least we'll have
-		 * forward progress. */
-		ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
-						 GFP_NOFS);
-		if (WARN_ON_ONCE(!ciphertext_page)) {
-			ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
-							 GFP_NOFS | __GFP_WAIT);
-		}
-		ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
-	} else {
-		ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
-	}
-	ctx->bounce_page = ciphertext_page;
-	ctx->control_page = plaintext_page;
+	ciphertext_page = alloc_bounce_page(ctx);
+	if (IS_ERR(ciphertext_page))
+		goto errout;
+	ctx->w.control_page = plaintext_page;
 	err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
 			       plaintext_page, ciphertext_page);
 	if (err) {
+		ciphertext_page = ERR_PTR(err);
+	errout:
 		ext4_release_crypto_ctx(ctx);
-		return ERR_PTR(err);
+		return ciphertext_page;
 	}
 	SetPagePrivate(ciphertext_page);
 	set_page_private(ciphertext_page, (unsigned long)ctx);
@@ -470,8 +396,8 @@ int ext4_decrypt_one(struct inode *inode, struct page *page)
 
 	struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
 
-	if (!ctx)
-		return -ENOMEM;
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
 	ret = ext4_decrypt(ctx, page);
 	ext4_release_crypto_ctx(ctx);
 	return ret;
@@ -493,21 +419,11 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
-	ciphertext_page = alloc_page(GFP_NOFS);
-	if (!ciphertext_page) {
-		/* This is a potential bottleneck, but at least we'll have
-		 * forward progress. */
-		ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
-						 GFP_NOFS);
-		if (WARN_ON_ONCE(!ciphertext_page)) {
-			ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
-							 GFP_NOFS | __GFP_WAIT);
-		}
-		ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
-	} else {
-		ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
+	ciphertext_page = alloc_bounce_page(ctx);
+	if (IS_ERR(ciphertext_page)) {
+		err = PTR_ERR(ciphertext_page);
+		goto errout;
 	}
-	ctx->bounce_page = ciphertext_page;
 
 	while (len--) {
 		err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
@@ -529,6 +445,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
 			goto errout;
 		}
 		err = submit_bio_wait(WRITE, bio);
+		bio_put(bio);
 		if (err)
 			goto errout;
 	}
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index fded02f72..7dc4eb559 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -48,6 +48,12 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode)
 	return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
 }
 
+static unsigned max_name_len(struct inode *inode)
+{
+	return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
+		EXT4_NAME_LEN;
+}
+
 /**
  * ext4_fname_encrypt() -
  *
@@ -55,43 +61,52 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode)
  * ciphertext. Errors are returned as negative numbers.  We trust the caller to
  * allocate sufficient memory to oname string.
  */
-static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
+static int ext4_fname_encrypt(struct inode *inode,
 			      const struct qstr *iname,
 			      struct ext4_str *oname)
 {
 	u32 ciphertext_len;
 	struct ablkcipher_request *req = NULL;
 	DECLARE_EXT4_COMPLETION_RESULT(ecr);
-	struct crypto_ablkcipher *tfm = ctx->ctfm;
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
 	int res = 0;
 	char iv[EXT4_CRYPTO_BLOCK_SIZE];
-	struct scatterlist sg[1];
-	int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
-	char *workbuf;
+	struct scatterlist src_sg, dst_sg;
+	int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+	char *workbuf, buf[32], *alloc_buf = NULL;
+	unsigned lim = max_name_len(inode);
 
-	if (iname->len <= 0 || iname->len > ctx->lim)
+	if (iname->len <= 0 || iname->len > lim)
 		return -EIO;
 
 	ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
 		EXT4_CRYPTO_BLOCK_SIZE : iname->len;
 	ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
-	ciphertext_len = (ciphertext_len > ctx->lim)
-			? ctx->lim : ciphertext_len;
+	ciphertext_len = (ciphertext_len > lim)
+			? lim : ciphertext_len;
+
+	if (ciphertext_len <= sizeof(buf)) {
+		workbuf = buf;
+	} else {
+		alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
+		if (!alloc_buf)
+			return -ENOMEM;
+		workbuf = alloc_buf;
+	}
 
 	/* Allocate request */
 	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
 	if (!req) {
 		printk_ratelimited(
 		    KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
+		kfree(alloc_buf);
 		return -ENOMEM;
 	}
 	ablkcipher_request_set_callback(req,
 		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 		ext4_dir_crypt_complete, &ecr);
 
-	/* Map the workpage */
-	workbuf = kmap(ctx->workpage);
-
 	/* Copy the input */
 	memcpy(workbuf, iname->name, iname->len);
 	if (iname->len < ciphertext_len)
@@ -101,21 +116,16 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
 	memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
 
 	/* Create encryption request */
-	sg_init_table(sg, 1);
-	sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
-	ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv);
+	sg_init_one(&src_sg, workbuf, ciphertext_len);
+	sg_init_one(&dst_sg, oname->name, ciphertext_len);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
 	res = crypto_ablkcipher_encrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
 		BUG_ON(req->base.data != &ecr);
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
-	if (res >= 0) {
-		/* Copy the result to output */
-		memcpy(oname->name, workbuf, ciphertext_len);
-		res = ciphertext_len;
-	}
-	kunmap(ctx->workpage);
+	kfree(alloc_buf);
 	ablkcipher_request_free(req);
 	if (res < 0) {
 		printk_ratelimited(
@@ -132,20 +142,21 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
  *	Errors are returned as negative numbers.
  *	We trust the caller to allocate sufficient memory to oname string.
  */
-static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx,
+static int ext4_fname_decrypt(struct inode *inode,
 			      const struct ext4_str *iname,
 			      struct ext4_str *oname)
 {
 	struct ext4_str tmp_in[2], tmp_out[1];
 	struct ablkcipher_request *req = NULL;
 	DECLARE_EXT4_COMPLETION_RESULT(ecr);
-	struct scatterlist sg[1];
-	struct crypto_ablkcipher *tfm = ctx->ctfm;
+	struct scatterlist src_sg, dst_sg;
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
 	int res = 0;
 	char iv[EXT4_CRYPTO_BLOCK_SIZE];
-	char *workbuf;
+	unsigned lim = max_name_len(inode);
 
-	if (iname->len <= 0 || iname->len > ctx->lim)
+	if (iname->len <= 0 || iname->len > lim)
 		return -EIO;
 
 	tmp_in[0].name = iname->name;
@@ -163,31 +174,19 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx,
 		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 		ext4_dir_crypt_complete, &ecr);
 
-	/* Map the workpage */
-	workbuf = kmap(ctx->workpage);
-
-	/* Copy the input */
-	memcpy(workbuf, iname->name, iname->len);
-
 	/* Initialize IV */
 	memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
 
 	/* Create encryption request */
-	sg_init_table(sg, 1);
-	sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
-	ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv);
+	sg_init_one(&src_sg, iname->name, iname->len);
+	sg_init_one(&dst_sg, oname->name, oname->len);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
 	res = crypto_ablkcipher_decrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
 		BUG_ON(req->base.data != &ecr);
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
-	if (res >= 0) {
-		/* Copy the result to output */
-		memcpy(oname->name, workbuf, iname->len);
-		res = iname->len;
-	}
-	kunmap(ctx->workpage);
 	ablkcipher_request_free(req);
 	if (res < 0) {
 		printk_ratelimited(
@@ -254,207 +253,6 @@ static int digest_decode(const char *src, int len, char *dst)
 }
 
 /**
- * ext4_free_fname_crypto_ctx() -
- *
- * Frees up a crypto context.
- */
-void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx)
-{
-	if (ctx == NULL || IS_ERR(ctx))
-		return;
-
-	if (ctx->ctfm && !IS_ERR(ctx->ctfm))
-		crypto_free_ablkcipher(ctx->ctfm);
-	if (ctx->htfm && !IS_ERR(ctx->htfm))
-		crypto_free_hash(ctx->htfm);
-	if (ctx->workpage && !IS_ERR(ctx->workpage))
-		__free_page(ctx->workpage);
-	kfree(ctx);
-}
-
-/**
- * ext4_put_fname_crypto_ctx() -
- *
- * Return: The crypto context onto free list. If the free list is above a
- * threshold, completely frees up the context, and returns the memory.
- *
- * TODO: Currently we directly free the crypto context. Eventually we should
- * add code it to return to free list. Such an approach will increase
- * efficiency of directory lookup.
- */
-void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx)
-{
-	if (*ctx == NULL || IS_ERR(*ctx))
-		return;
-	ext4_free_fname_crypto_ctx(*ctx);
-	*ctx = NULL;
-}
-
-/**
- * ext4_search_fname_crypto_ctx() -
- */
-static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx(
-		const struct ext4_encryption_key *key)
-{
-	return NULL;
-}
-
-/**
- * ext4_alloc_fname_crypto_ctx() -
- */
-struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx(
-	const struct ext4_encryption_key *key)
-{
-	struct ext4_fname_crypto_ctx *ctx;
-
-	ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS);
-	if (ctx == NULL)
-		return ERR_PTR(-ENOMEM);
-	if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) {
-		/* This will automatically set key mode to invalid
-		 * As enum for ENCRYPTION_MODE_INVALID is zero */
-		memset(&ctx->key, 0, sizeof(ctx->key));
-	} else {
-		memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key));
-	}
-	ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode)
-		? 0 : 1;
-	ctx->ctfm_key_is_ready = 0;
-	ctx->ctfm = NULL;
-	ctx->htfm = NULL;
-	ctx->workpage = NULL;
-	return ctx;
-}
-
-/**
- * ext4_get_fname_crypto_ctx() -
- *
- * Allocates a free crypto context and initializes it to hold
- * the crypto material for the inode.
- *
- * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise.
- */
-struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(
-	struct inode *inode, u32 max_ciphertext_len)
-{
-	struct ext4_fname_crypto_ctx *ctx;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	int res;
-
-	/* Check if the crypto policy is set on the inode */
-	res = ext4_encrypted_inode(inode);
-	if (res == 0)
-		return NULL;
-
-	if (!ext4_has_encryption_key(inode))
-		ext4_generate_encryption_key(inode);
-
-	/* Get a crypto context based on the key.
-	 * A new context is allocated if no context matches the requested key.
-	 */
-	ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key));
-	if (ctx == NULL)
-		ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key));
-	if (IS_ERR(ctx))
-		return ctx;
-
-	ctx->flags = ei->i_crypt_policy_flags;
-	if (ctx->has_valid_key) {
-		if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) {
-			printk_once(KERN_WARNING
-				    "ext4: unsupported key mode %d\n",
-				    ctx->key.mode);
-			return ERR_PTR(-ENOKEY);
-		}
-
-		/* As a first cut, we will allocate new tfm in every call.
-		 * later, we will keep the tfm around, in case the key gets
-		 * re-used */
-		if (ctx->ctfm == NULL) {
-			ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))",
-					0, 0);
-		}
-		if (IS_ERR(ctx->ctfm)) {
-			res = PTR_ERR(ctx->ctfm);
-			printk(
-			    KERN_DEBUG "%s: error (%d) allocating crypto tfm\n",
-			    __func__, res);
-			ctx->ctfm = NULL;
-			ext4_put_fname_crypto_ctx(&ctx);
-			return ERR_PTR(res);
-		}
-		if (ctx->ctfm == NULL) {
-			printk(
-			    KERN_DEBUG "%s: could not allocate crypto tfm\n",
-			    __func__);
-			ext4_put_fname_crypto_ctx(&ctx);
-			return ERR_PTR(-ENOMEM);
-		}
-		if (ctx->workpage == NULL)
-			ctx->workpage = alloc_page(GFP_NOFS);
-		if (IS_ERR(ctx->workpage)) {
-			res = PTR_ERR(ctx->workpage);
-			printk(
-			    KERN_DEBUG "%s: error (%d) allocating work page\n",
-			    __func__, res);
-			ctx->workpage = NULL;
-			ext4_put_fname_crypto_ctx(&ctx);
-			return ERR_PTR(res);
-		}
-		if (ctx->workpage == NULL) {
-			printk(
-			    KERN_DEBUG "%s: could not allocate work page\n",
-			    __func__);
-			ext4_put_fname_crypto_ctx(&ctx);
-			return ERR_PTR(-ENOMEM);
-		}
-		ctx->lim = max_ciphertext_len;
-		crypto_ablkcipher_clear_flags(ctx->ctfm, ~0);
-		crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm),
-			CRYPTO_TFM_REQ_WEAK_KEY);
-
-		/* If we are lucky, we will get a context that is already
-		 * set up with the right key. Else, we will have to
-		 * set the key */
-		if (!ctx->ctfm_key_is_ready) {
-			/* Since our crypto objectives for filename encryption
-			 * are pretty weak,
-			 * we directly use the inode master key */
-			res = crypto_ablkcipher_setkey(ctx->ctfm,
-					ctx->key.raw, ctx->key.size);
-			if (res) {
-				ext4_put_fname_crypto_ctx(&ctx);
-				return ERR_PTR(-EIO);
-			}
-			ctx->ctfm_key_is_ready = 1;
-		} else {
-			/* In the current implementation, key should never be
-			 * marked "ready" for a context that has just been
-			 * allocated. So we should never reach here */
-			 BUG();
-		}
-	}
-	if (ctx->htfm == NULL)
-		ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(ctx->htfm)) {
-		res = PTR_ERR(ctx->htfm);
-		printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n",
-			__func__, res);
-		ctx->htfm = NULL;
-		ext4_put_fname_crypto_ctx(&ctx);
-		return ERR_PTR(res);
-	}
-	if (ctx->htfm == NULL) {
-		printk(KERN_DEBUG "%s: could not allocate hash tfm\n",
-				__func__);
-		ext4_put_fname_crypto_ctx(&ctx);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	return ctx;
-}
-
-/**
  * ext4_fname_crypto_round_up() -
  *
  * Return: The next multiple of block size
@@ -464,44 +262,29 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
 	return ((size+blksize-1)/blksize)*blksize;
 }
 
-/**
- * ext4_fname_crypto_namelen_on_disk() -
- */
-int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
-				      u32 namelen)
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen)
 {
-	u32 ciphertext_len;
-	int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
-
-	if (ctx == NULL)
-		return -EIO;
-	if (!(ctx->has_valid_key))
-		return -EACCES;
-	ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ?
-		EXT4_CRYPTO_BLOCK_SIZE : namelen;
-	ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
-	ciphertext_len = (ciphertext_len > ctx->lim)
-			? ctx->lim : ciphertext_len;
-	return (int) ciphertext_len;
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+	int padding = 32;
+
+	if (ci)
+		padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+	if (ilen < EXT4_CRYPTO_BLOCK_SIZE)
+		ilen = EXT4_CRYPTO_BLOCK_SIZE;
+	return ext4_fname_crypto_round_up(ilen, padding);
 }
 
-/**
- * ext4_fname_crypto_alloc_obuff() -
+/*
+ * ext4_fname_crypto_alloc_buffer() -
  *
  * Allocates an output buffer that is sufficient for the crypto operation
  * specified by the context and the direction.
  */
-int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
+int ext4_fname_crypto_alloc_buffer(struct inode *inode,
 				   u32 ilen, struct ext4_str *crypto_str)
 {
-	unsigned int olen;
-	int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK);
+	unsigned int olen = ext4_fname_encrypted_size(inode, ilen);
 
-	if (!ctx)
-		return -EIO;
-	if (padding < EXT4_CRYPTO_BLOCK_SIZE)
-		padding = EXT4_CRYPTO_BLOCK_SIZE;
-	olen = ext4_fname_crypto_round_up(ilen, padding);
 	crypto_str->len = olen;
 	if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
 		olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
@@ -529,7 +312,7 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
 /**
  * ext4_fname_disk_to_usr() - converts a filename from disk space to user space
  */
-int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+int _ext4_fname_disk_to_usr(struct inode *inode,
 			    struct dx_hash_info *hinfo,
 			    const struct ext4_str *iname,
 			    struct ext4_str *oname)
@@ -537,8 +320,6 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
 	char buf[24];
 	int ret;
 
-	if (ctx == NULL)
-		return -EIO;
 	if (iname->len < 3) {
 		/*Check for . and .. */
 		if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
@@ -548,8 +329,8 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
 			return oname->len;
 		}
 	}
-	if (ctx->has_valid_key)
-		return ext4_fname_decrypt(ctx, iname, oname);
+	if (EXT4_I(inode)->i_crypt_info)
+		return ext4_fname_decrypt(inode, iname, oname);
 
 	if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
 		ret = digest_encode(iname->name, iname->len, oname->name);
@@ -568,7 +349,7 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
 	return ret + 1;
 }
 
-int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+int ext4_fname_disk_to_usr(struct inode *inode,
 			   struct dx_hash_info *hinfo,
 			   const struct ext4_dir_entry_2 *de,
 			   struct ext4_str *oname)
@@ -576,21 +357,20 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
 	struct ext4_str iname = {.name = (unsigned char *) de->name,
 				 .len = de->name_len };
 
-	return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname);
+	return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname);
 }
 
 
 /**
  * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
  */
-int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
+int ext4_fname_usr_to_disk(struct inode *inode,
 			   const struct qstr *iname,
 			   struct ext4_str *oname)
 {
 	int res;
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
 
-	if (ctx == NULL)
-		return -EIO;
 	if (iname->len < 3) {
 		/*Check for . and .. */
 		if (iname->name[0] == '.' &&
@@ -601,8 +381,8 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
 			return oname->len;
 		}
 	}
-	if (ctx->has_valid_key) {
-		res = ext4_fname_encrypt(ctx, iname, oname);
+	if (ci) {
+		res = ext4_fname_encrypt(inode, iname, oname);
 		return res;
 	}
 	/* Without a proper key, a user is not allowed to modify the filenames
@@ -611,109 +391,79 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
 	return -EACCES;
 }
 
-/*
- * Calculate the htree hash from a filename from user space
- */
-int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
-			    const struct qstr *iname,
-			    struct dx_hash_info *hinfo)
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+			      int lookup, struct ext4_filename *fname)
 {
-	struct ext4_str tmp;
-	int ret = 0;
-	char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1];
+	struct ext4_crypt_info *ci;
+	int ret = 0, bigname = 0;
+
+	memset(fname, 0, sizeof(struct ext4_filename));
+	fname->usr_fname = iname;
 
-	if (!ctx ||
+	if (!ext4_encrypted_inode(dir) ||
 	    ((iname->name[0] == '.') &&
 	     ((iname->len == 1) ||
 	      ((iname->name[1] == '.') && (iname->len == 2))))) {
-		ext4fs_dirhash(iname->name, iname->len, hinfo);
+		fname->disk_name.name = (unsigned char *) iname->name;
+		fname->disk_name.len = iname->len;
 		return 0;
 	}
-
-	if (!ctx->has_valid_key && iname->name[0] == '_') {
-		if (iname->len != 33)
-			return -ENOENT;
-		ret = digest_decode(iname->name+1, iname->len, buf);
-		if (ret != 24)
-			return -ENOENT;
-		memcpy(&hinfo->hash, buf, 4);
-		memcpy(&hinfo->minor_hash, buf + 4, 4);
+	ret = ext4_get_encryption_info(dir);
+	if (ret)
+		return ret;
+	ci = EXT4_I(dir)->i_crypt_info;
+	if (ci) {
+		ret = ext4_fname_crypto_alloc_buffer(dir, iname->len,
+						     &fname->crypto_buf);
+		if (ret < 0)
+			return ret;
+		ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf);
+		if (ret < 0)
+			goto errout;
+		fname->disk_name.name = fname->crypto_buf.name;
+		fname->disk_name.len = fname->crypto_buf.len;
 		return 0;
 	}
+	if (!lookup)
+		return -EACCES;
 
-	if (!ctx->has_valid_key && iname->name[0] != '_') {
-		if (iname->len > 43)
-			return -ENOENT;
-		ret = digest_decode(iname->name, iname->len, buf);
-		ext4fs_dirhash(buf, ret, hinfo);
-		return 0;
+	/* We don't have the key and we are doing a lookup; decode the
+	 * user-supplied name
+	 */
+	if (iname->name[0] == '_')
+		bigname = 1;
+	if ((bigname && (iname->len != 33)) ||
+	    (!bigname && (iname->len > 43)))
+		return -ENOENT;
+
+	fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+	if (fname->crypto_buf.name == NULL)
+		return -ENOMEM;
+	ret = digest_decode(iname->name + bigname, iname->len - bigname,
+			    fname->crypto_buf.name);
+	if (ret < 0) {
+		ret = -ENOENT;
+		goto errout;
 	}
-
-	/* First encrypt the plaintext name */
-	ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp);
-	if (ret < 0)
-		return ret;
-
-	ret = ext4_fname_encrypt(ctx, iname, &tmp);
-	if (ret >= 0) {
-		ext4fs_dirhash(tmp.name, tmp.len, hinfo);
-		ret = 0;
+	fname->crypto_buf.len = ret;
+	if (bigname) {
+		memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4);
+		memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4);
+	} else {
+		fname->disk_name.name = fname->crypto_buf.name;
+		fname->disk_name.len = fname->crypto_buf.len;
 	}
-
-	ext4_fname_crypto_free_buffer(&tmp);
+	return 0;
+errout:
+	kfree(fname->crypto_buf.name);
+	fname->crypto_buf.name = NULL;
 	return ret;
 }
 
-int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr,
-		     int len, const char * const name,
-		     struct ext4_dir_entry_2 *de)
+void ext4_fname_free_filename(struct ext4_filename *fname)
 {
-	int ret = -ENOENT;
-	int bigname = (*name == '_');
-
-	if (ctx->has_valid_key) {
-		if (cstr->name == NULL) {
-			struct qstr istr;
-
-			ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr);
-			if (ret < 0)
-				goto errout;
-			istr.name = name;
-			istr.len = len;
-			ret = ext4_fname_encrypt(ctx, &istr, cstr);
-			if (ret < 0)
-				goto errout;
-		}
-	} else {
-		if (cstr->name == NULL) {
-			cstr->name = kmalloc(32, GFP_KERNEL);
-			if (cstr->name == NULL)
-				return -ENOMEM;
-			if ((bigname && (len != 33)) ||
-			    (!bigname && (len > 43)))
-				goto errout;
-			ret = digest_decode(name+bigname, len-bigname,
-					    cstr->name);
-			if (ret < 0) {
-				ret = -ENOENT;
-				goto errout;
-			}
-			cstr->len = ret;
-		}
-		if (bigname) {
-			if (de->name_len < 16)
-				return 0;
-			ret = memcmp(de->name + de->name_len - 16,
-				     cstr->name + 8, 16);
-			return (ret == 0) ? 1 : 0;
-		}
-	}
-	if (de->name_len != cstr->len)
-		return 0;
-	ret = memcmp(de->name, cstr->name, cstr->len);
-	return (ret == 0) ? 1 : 0;
-errout:
-	kfree(cstr->name);
-	cstr->name = NULL;
-	return ret;
+	kfree(fname->crypto_buf.name);
+	fname->crypto_buf.name = NULL;
+	fname->usr_fname = NULL;
+	fname->disk_name.name = NULL;
 }
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 52170d0b7..442d24e8e 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -84,14 +84,38 @@ out:
 	return res;
 }
 
-/**
- * ext4_generate_encryption_key() - generates an encryption key
- * @inode: The inode to generate the encryption key for.
- */
-int ext4_generate_encryption_key(struct inode *inode)
+void ext4_free_crypt_info(struct ext4_crypt_info *ci)
+{
+	if (!ci)
+		return;
+
+	if (ci->ci_keyring_key)
+		key_put(ci->ci_keyring_key);
+	crypto_free_ablkcipher(ci->ci_ctfm);
+	kmem_cache_free(ext4_crypt_info_cachep, ci);
+}
+
+void ext4_free_encryption_info(struct inode *inode,
+			       struct ext4_crypt_info *ci)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_crypt_info *prev;
+
+	if (ci == NULL)
+		ci = ACCESS_ONCE(ei->i_crypt_info);
+	if (ci == NULL)
+		return;
+	prev = cmpxchg(&ei->i_crypt_info, ci, NULL);
+	if (prev != ci)
+		return;
+
+	ext4_free_crypt_info(ci);
+}
+
+int _ext4_get_encryption_info(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
+	struct ext4_crypt_info *crypt_info;
 	char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
 				 (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
 	struct key *keyring_key = NULL;
@@ -99,31 +123,76 @@ int ext4_generate_encryption_key(struct inode *inode)
 	struct ext4_encryption_context ctx;
 	struct user_key_payload *ukp;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
-				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
-				 &ctx, sizeof(ctx));
+	struct crypto_ablkcipher *ctfm;
+	const char *cipher_str;
+	char raw_key[EXT4_MAX_KEY_SIZE];
+	char mode;
+	int res;
 
-	if (res != sizeof(ctx)) {
-		if (res > 0)
-			res = -EINVAL;
-		goto out;
+	if (!ext4_read_workqueue) {
+		res = ext4_init_crypto();
+		if (res)
+			return res;
+	}
+
+retry:
+	crypt_info = ACCESS_ONCE(ei->i_crypt_info);
+	if (crypt_info) {
+		if (!crypt_info->ci_keyring_key ||
+		    key_validate(crypt_info->ci_keyring_key) == 0)
+			return 0;
+		ext4_free_encryption_info(inode, crypt_info);
+		goto retry;
 	}
+
+	res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+				 &ctx, sizeof(ctx));
+	if (res < 0) {
+		if (!DUMMY_ENCRYPTION_ENABLED(sbi))
+			return res;
+		ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+		ctx.filenames_encryption_mode =
+			EXT4_ENCRYPTION_MODE_AES_256_CTS;
+		ctx.flags = 0;
+	} else if (res != sizeof(ctx))
+		return -EINVAL;
 	res = 0;
 
-	ei->i_crypt_policy_flags = ctx.flags;
+	crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL);
+	if (!crypt_info)
+		return -ENOMEM;
+
+	crypt_info->ci_flags = ctx.flags;
+	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+	crypt_info->ci_ctfm = NULL;
+	crypt_info->ci_keyring_key = NULL;
+	memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+	       sizeof(crypt_info->ci_master_key));
 	if (S_ISREG(inode->i_mode))
-		crypt_key->mode = ctx.contents_encryption_mode;
+		mode = crypt_info->ci_data_mode;
 	else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		crypt_key->mode = ctx.filenames_encryption_mode;
-	else {
-		printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n");
+		mode = crypt_info->ci_filename_mode;
+	else
 		BUG();
+	switch (mode) {
+	case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+		cipher_str = "xts(aes)";
+		break;
+	case EXT4_ENCRYPTION_MODE_AES_256_CTS:
+		cipher_str = "cts(cbc(aes))";
+		break;
+	default:
+		printk_once(KERN_WARNING
+			    "ext4: unsupported key mode %d (ino %u)\n",
+			    mode, (unsigned) inode->i_ino);
+		res = -ENOKEY;
+		goto out;
 	}
-	crypt_key->size = ext4_encryption_key_size(crypt_key->mode);
-	BUG_ON(!crypt_key->size);
 	if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
-		memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
-		goto out;
+		memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
+		goto got_key;
 	}
 	memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
 	       EXT4_KEY_DESC_PREFIX_SIZE);
@@ -138,6 +207,7 @@ int ext4_generate_encryption_key(struct inode *inode)
 		keyring_key = NULL;
 		goto out;
 	}
+	crypt_info->ci_keyring_key = keyring_key;
 	BUG_ON(keyring_key->type != &key_type_logon);
 	ukp = ((struct user_key_payload *)keyring_key->payload.data);
 	if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
@@ -148,19 +218,43 @@ int ext4_generate_encryption_key(struct inode *inode)
 	BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
 		     EXT4_KEY_DERIVATION_NONCE_SIZE);
 	BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
-	res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw);
+	res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
+				  raw_key);
+got_key:
+	ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+	if (!ctfm || IS_ERR(ctfm)) {
+		res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+		printk(KERN_DEBUG
+		       "%s: error %d (inode %u) allocating crypto tfm\n",
+		       __func__, res, (unsigned) inode->i_ino);
+		goto out;
+	}
+	crypt_info->ci_ctfm = ctfm;
+	crypto_ablkcipher_clear_flags(ctfm, ~0);
+	crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+			     CRYPTO_TFM_REQ_WEAK_KEY);
+	res = crypto_ablkcipher_setkey(ctfm, raw_key,
+				       ext4_encryption_key_size(mode));
+	if (res)
+		goto out;
+	memzero_explicit(raw_key, sizeof(raw_key));
+	if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) {
+		ext4_free_crypt_info(crypt_info);
+		goto retry;
+	}
+	return 0;
+
 out:
-	if (keyring_key)
-		key_put(keyring_key);
-	if (res < 0)
-		crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID;
+	if (res == -ENOKEY)
+		res = 0;
+	ext4_free_crypt_info(crypt_info);
+	memzero_explicit(raw_key, sizeof(raw_key));
 	return res;
 }
 
 int ext4_has_encryption_key(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
 
-	return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID);
+	return (ei->i_crypt_info != NULL);
 }
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index a6d6291ae..02c4e5df7 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -51,6 +51,10 @@ static int ext4_create_encryption_context_from_policy(
 	struct ext4_encryption_context ctx;
 	int res = 0;
 
+	res = ext4_convert_inline_data(inode);
+	if (res)
+		return res;
+
 	ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
 	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
 	       EXT4_KEY_DESCRIPTOR_SIZE);
@@ -89,6 +93,8 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy,
 		return -EINVAL;
 
 	if (!ext4_inode_has_encryption_context(inode)) {
+		if (!S_ISDIR(inode->i_mode))
+			return -EINVAL;
 		if (!ext4_empty_dir(inode))
 			return -ENOTEMPTY;
 		return ext4_create_encryption_context_from_policy(inode,
@@ -126,7 +132,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
 int ext4_is_child_context_consistent_with_parent(struct inode *parent,
 						 struct inode *child)
 {
-	struct ext4_encryption_context parent_ctx, child_ctx;
+	struct ext4_crypt_info *parent_ci, *child_ci;
 	int res;
 
 	if ((parent == NULL) || (child == NULL)) {
@@ -136,26 +142,28 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
 	/* no restrictions if the parent directory is not encrypted */
 	if (!ext4_encrypted_inode(parent))
 		return 1;
-	res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
-			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
-			     &parent_ctx, sizeof(parent_ctx));
-	if (res != sizeof(parent_ctx))
-		return 0;
 	/* if the child directory is not encrypted, this is always a problem */
 	if (!ext4_encrypted_inode(child))
 		return 0;
-	res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION,
-			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
-			     &child_ctx, sizeof(child_ctx));
-	if (res != sizeof(child_ctx))
+	res = ext4_get_encryption_info(parent);
+	if (res)
 		return 0;
-	return (memcmp(parent_ctx.master_key_descriptor,
-		       child_ctx.master_key_descriptor,
+	res = ext4_get_encryption_info(child);
+	if (res)
+		return 0;
+	parent_ci = EXT4_I(parent)->i_crypt_info;
+	child_ci = EXT4_I(child)->i_crypt_info;
+	if (!parent_ci && !child_ci)
+		return 1;
+	if (!parent_ci || !child_ci)
+		return 0;
+
+	return (memcmp(parent_ci->ci_master_key,
+		       child_ci->ci_master_key,
 		       EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
-		(parent_ctx.contents_encryption_mode ==
-		 child_ctx.contents_encryption_mode) &&
-		(parent_ctx.filenames_encryption_mode ==
-		 child_ctx.filenames_encryption_mode));
+		(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+		(parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+		(parent_ci->ci_flags == child_ci->ci_flags));
 }
 
 /**
@@ -168,31 +176,40 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
 int ext4_inherit_context(struct inode *parent, struct inode *child)
 {
 	struct ext4_encryption_context ctx;
-	int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
-				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
-				 &ctx, sizeof(ctx));
+	struct ext4_crypt_info *ci;
+	int res;
+
+	res = ext4_get_encryption_info(parent);
+	if (res < 0)
+		return res;
+	ci = EXT4_I(parent)->i_crypt_info;
+	if (ci == NULL)
+		return -ENOKEY;
 
-	if (res != sizeof(ctx)) {
-		if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
-			ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
-			ctx.contents_encryption_mode =
-				EXT4_ENCRYPTION_MODE_AES_256_XTS;
-			ctx.filenames_encryption_mode =
-				EXT4_ENCRYPTION_MODE_AES_256_CTS;
-			ctx.flags = 0;
-			memset(ctx.master_key_descriptor, 0x42,
-			       EXT4_KEY_DESCRIPTOR_SIZE);
-			res = 0;
-		} else {
-			goto out;
-		}
+	ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+	if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
+		ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+		ctx.filenames_encryption_mode =
+			EXT4_ENCRYPTION_MODE_AES_256_CTS;
+		ctx.flags = 0;
+		memset(ctx.master_key_descriptor, 0x42,
+		       EXT4_KEY_DESCRIPTOR_SIZE);
+		res = 0;
+	} else {
+		ctx.contents_encryption_mode = ci->ci_data_mode;
+		ctx.filenames_encryption_mode = ci->ci_filename_mode;
+		ctx.flags = ci->ci_flags;
+		memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+		       EXT4_KEY_DESCRIPTOR_SIZE);
 	}
 	get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
 	res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
 			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
 			     sizeof(ctx), 0);
-out:
-	if (!res)
+	if (!res) {
 		ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
+		ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA);
+		res = ext4_get_encryption_info(child);
+	}
 	return res;
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 5665d82d2..f9e149119 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -110,7 +110,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bh = NULL;
 	int dir_has_error = 0;
-	struct ext4_fname_crypto_ctx *enc_ctx = NULL;
 	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
 
 	if (is_dx_dir(inode)) {
@@ -134,16 +133,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 			return err;
 	}
 
-	enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN);
-	if (IS_ERR(enc_ctx))
-		return PTR_ERR(enc_ctx);
-	if (enc_ctx) {
-		err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN,
+	if (ext4_encrypted_inode(inode)) {
+		err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN,
 						     &fname_crypto_str);
-		if (err < 0) {
-			ext4_put_fname_crypto_ctx(&enc_ctx);
+		if (err < 0)
 			return err;
-		}
 	}
 
 	offset = ctx->pos & (sb->s_blocksize - 1);
@@ -239,17 +233,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
-				if (enc_ctx == NULL) {
-					/* Directory is not encrypted */
+				if (!ext4_encrypted_inode(inode)) {
 					if (!dir_emit(ctx, de->name,
 					    de->name_len,
 					    le32_to_cpu(de->inode),
 					    get_dtype(sb, de->file_type)))
 						goto done;
 				} else {
+					int save_len = fname_crypto_str.len;
+
 					/* Directory is encrypted */
-					err = ext4_fname_disk_to_usr(enc_ctx,
+					err = ext4_fname_disk_to_usr(inode,
 						NULL, de, &fname_crypto_str);
+					fname_crypto_str.len = save_len;
 					if (err < 0)
 						goto errout;
 					if (!dir_emit(ctx,
@@ -272,7 +268,6 @@ done:
 	err = 0;
 errout:
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-	ext4_put_fname_crypto_ctx(&enc_ctx);
 	ext4_fname_crypto_free_buffer(&fname_crypto_str);
 #endif
 	brelse(bh);
@@ -598,6 +593,13 @@ finished:
 	return 0;
 }
 
+static int ext4_dir_open(struct inode * inode, struct file * filp)
+{
+	if (ext4_encrypted_inode(inode))
+		return ext4_get_encryption_info(inode) ? -EACCES : 0;
+	return 0;
+}
+
 static int ext4_release_dir(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
@@ -640,5 +642,6 @@ const struct file_operations ext4_dir_operations = {
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
 	.fsync		= ext4_sync_file,
+	.open		= ext4_dir_open,
 	.release	= ext4_release_dir,
 };
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9a83f149a..f5e9f0422 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -69,15 +69,6 @@
 #define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
-#define EXT4_ERROR_INODE(inode, fmt, a...) \
-	ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
-
-#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)			\
-	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
-
-#define EXT4_ERROR_FILE(file, block, fmt, a...)				\
-	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
-
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -90,6 +81,11 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
+enum SHIFT_DIRECTION {
+	SHIFT_LEFT = 0,
+	SHIFT_RIGHT,
+};
+
 /*
  * Flags used in mballoc's allocation_context flags field.
  *
@@ -911,7 +907,6 @@ struct ext4_inode_info {
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
-	char i_crypt_policy_flags;
 
 	/* Indicate the inline data space. */
 	u16 i_inline_off;
@@ -955,7 +950,7 @@ struct ext4_inode_info {
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	/* Encryption params */
-	struct ext4_encryption_key i_encryption_key;
+	struct ext4_crypt_info *i_crypt_info;
 #endif
 };
 
@@ -1374,12 +1369,6 @@ struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
-
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	/* Encryption */
-	uint32_t s_file_encryption_mode;
-	uint32_t s_dir_encryption_mode;
-#endif
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1838,6 +1827,17 @@ struct dx_hash_info
  */
 #define HASH_NB_ALWAYS		1
 
+struct ext4_filename {
+	const struct qstr *usr_fname;
+	struct ext4_str disk_name;
+	struct dx_hash_info hinfo;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	struct ext4_str crypto_buf;
+#endif
+};
+
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p)  ((p)->disk_name.len)
 
 /*
  * Describe an inode's exact location on disk and in memory
@@ -2054,6 +2054,7 @@ int ext4_get_policy(struct inode *inode,
 		    struct ext4_encryption_policy *policy);
 
 /* crypto.c */
+extern struct kmem_cache *ext4_crypt_info_cachep;
 bool ext4_valid_contents_enc_mode(uint32_t mode);
 uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
 extern struct workqueue_struct *ext4_read_workqueue;
@@ -2085,57 +2086,84 @@ static inline int ext4_sb_has_crypto(struct super_block *sb)
 /* crypto_fname.c */
 bool ext4_valid_filenames_enc_mode(uint32_t mode);
 u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
-int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen);
+int ext4_fname_crypto_alloc_buffer(struct inode *inode,
 				   u32 ilen, struct ext4_str *crypto_str);
-int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+int _ext4_fname_disk_to_usr(struct inode *inode,
 			    struct dx_hash_info *hinfo,
 			    const struct ext4_str *iname,
 			    struct ext4_str *oname);
-int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
+int ext4_fname_disk_to_usr(struct inode *inode,
 			   struct dx_hash_info *hinfo,
 			   const struct ext4_dir_entry_2 *de,
 			   struct ext4_str *oname);
-int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
+int ext4_fname_usr_to_disk(struct inode *inode,
 			   const struct qstr *iname,
 			   struct ext4_str *oname);
-int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
-			   const struct qstr *iname,
-			   struct dx_hash_info *hinfo);
-int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
-				      u32 namelen);
-int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr,
-		     int len, const char * const name,
-		     struct ext4_dir_entry_2 *de);
-
-
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx);
-struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
-							u32 max_len);
 void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+			      int lookup, struct ext4_filename *fname);
+void ext4_fname_free_filename(struct ext4_filename *fname);
 #else
 static inline
-void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { }
-static inline
-struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
-							u32 max_len)
+int ext4_setup_fname_crypto(struct inode *inode)
 {
-	return NULL;
+	return 0;
 }
 static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
+static inline int ext4_fname_setup_filename(struct inode *dir,
+				     const struct qstr *iname,
+				     int lookup, struct ext4_filename *fname)
+{
+	fname->usr_fname = iname;
+	fname->disk_name.name = (unsigned char *) iname->name;
+	fname->disk_name.len = iname->len;
+	return 0;
+}
+static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
 #endif
 
 
 /* crypto_key.c */
-int ext4_generate_encryption_key(struct inode *inode);
+void ext4_free_crypt_info(struct ext4_crypt_info *ci);
+void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci);
+int _ext4_get_encryption_info(struct inode *inode);
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 int ext4_has_encryption_key(struct inode *inode);
+
+static inline int ext4_get_encryption_info(struct inode *inode)
+{
+	struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+
+	if (!ci ||
+	    (ci->ci_keyring_key &&
+	     (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+					   (1 << KEY_FLAG_REVOKED) |
+					   (1 << KEY_FLAG_DEAD)))))
+		return _ext4_get_encryption_info(inode);
+	return 0;
+}
+
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+	return EXT4_I(inode)->i_crypt_info;
+}
+
 #else
 static inline int ext4_has_encryption_key(struct inode *inode)
 {
 	return 0;
 }
+static inline int ext4_get_encryption_info(struct inode *inode)
+{
+	return 0;
+}
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+	return NULL;
+}
 #endif
 
 
@@ -2156,14 +2184,13 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
 			     struct buffer_head *bh,
 			     void *buf, int buf_size,
-			     const char *name, int namelen,
+			     struct ext4_filename *fname,
 			     struct ext4_dir_entry_2 **dest_de);
 int ext4_insert_dentry(struct inode *dir,
-			struct inode *inode,
-			struct ext4_dir_entry_2 *de,
-			int buf_size,
-		       const struct qstr *iname,
-			const char *name, int namelen);
+		       struct inode *inode,
+		       struct ext4_dir_entry_2 *de,
+		       int buf_size,
+		       struct ext4_filename *fname);
 static inline void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
@@ -2317,13 +2344,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 				__u32 start_minor_hash, __u32 *next_hash);
-extern int search_dir(struct buffer_head *bh,
-		      char *search_buf,
-		      int buf_size,
-		      struct inode *dir,
-		      const struct qstr *d_name,
-		      unsigned int offset,
-		      struct ext4_dir_entry_2 **res_dir);
+extern int ext4_search_dir(struct buffer_head *bh,
+			   char *search_buf,
+			   int buf_size,
+			   struct inode *dir,
+			   struct ext4_filename *fname,
+			   const struct qstr *d_name,
+			   unsigned int offset,
+			   struct ext4_dir_entry_2 **res_dir);
 extern int ext4_generic_delete_entry(handle_t *handle,
 				     struct inode *dir,
 				     struct ext4_dir_entry_2 *de_del,
@@ -2368,6 +2396,9 @@ void __ext4_abort(struct super_block *, const char *, unsigned int,
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
+extern __printf(4, 5)
+void __ext4_warning_inode(const struct inode *inode, const char *function,
+			  unsigned int line, const char *fmt, ...);
 extern __printf(3, 4)
 void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
@@ -2378,6 +2409,15 @@ void __ext4_grp_locked_error(const char *, unsigned int,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)			\
+	ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+
+#define EXT4_ERROR_FILE(file, block, fmt, a...)				\
+	ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+
 #ifdef CONFIG_PRINTK
 
 #define ext4_error_inode(inode, func, line, block, fmt, ...)		\
@@ -2390,6 +2430,8 @@ void __ext4_grp_locked_error(const char *, unsigned int,
 	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
 #define ext4_warning(sb, fmt, ...)					\
 	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning_inode(inode, fmt, ...)				\
+	__ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
 #define ext4_msg(sb, level, fmt, ...)				\
 	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
 #define dump_mmp_msg(sb, mmp, msg)					\
@@ -2425,6 +2467,11 @@ do {									\
 	no_printk(fmt, ##__VA_ARGS__);					\
 	__ext4_warning(sb, "", 0, " ");					\
 } while (0)
+#define ext4_warning_inode(inode, fmt, ...)				\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning_inode(inode, "", 0, " ");			\
+} while (0)
 #define ext4_msg(sb, level, fmt, ...)					\
 do {									\
 	no_printk(fmt, ##__VA_ARGS__);					\
@@ -2768,7 +2815,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
 extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 					 unsigned len, unsigned copied,
 					 struct page *page);
-extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+extern int ext4_try_add_inline_entry(handle_t *handle,
+				     struct ext4_filename *fname,
+				     struct dentry *dentry,
 				     struct inode *inode);
 extern int ext4_try_create_inline_dir(handle_t *handle,
 				      struct inode *parent,
@@ -2782,6 +2831,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file,
 				   __u32 start_hash, __u32 start_minor_hash,
 				   int *has_inline_data);
 extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+					struct ext4_filename *fname,
 					const struct qstr *d_name,
 					struct ext4_dir_entry_2 **res_dir,
 					int *has_inline_data);
@@ -2847,6 +2897,7 @@ extern int ext4_mpage_readpages(struct address_space *mapping,
 				unsigned nr_pages);
 
 /* symlink.c */
+extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
 
@@ -2912,6 +2963,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
 extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
 				struct inode *inode2, ext4_lblk_t lblk1,
 			     ext4_lblk_t lblk2,  ext4_lblk_t count,
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
index d75159c10..ac7d4e813 100644
--- a/fs/ext4/ext4_crypto.h
+++ b/fs/ext4/ext4_crypto.h
@@ -66,24 +66,39 @@ struct ext4_encryption_context {
 #define EXT4_KEY_DESC_PREFIX "ext4:"
 #define EXT4_KEY_DESC_PREFIX_SIZE 5
 
+/* This is passed in from userspace into the kernel keyring */
 struct ext4_encryption_key {
-	uint32_t mode;
-	char raw[EXT4_MAX_KEY_SIZE];
-	uint32_t size;
+        __u32 mode;
+        char raw[EXT4_MAX_KEY_SIZE];
+        __u32 size;
+} __attribute__((__packed__));
+
+struct ext4_crypt_info {
+	char		ci_data_mode;
+	char		ci_filename_mode;
+	char		ci_flags;
+	struct crypto_ablkcipher *ci_ctfm;
+	struct key	*ci_keyring_key;
+	char		ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
 };
 
 #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL             0x00000001
-#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL     0x00000002
+#define EXT4_WRITE_PATH_FL			      0x00000002
 
 struct ext4_crypto_ctx {
-	struct crypto_tfm *tfm;         /* Crypto API context */
-	struct page *bounce_page;       /* Ciphertext page on write path */
-	struct page *control_page;      /* Original page on write path */
-	struct bio *bio;                /* The bio for this context */
-	struct work_struct work;        /* Work queue for read complete path */
-	struct list_head free_list;     /* Free list */
-	int flags;                      /* Flags */
-	int mode;                       /* Encryption mode for tfm */
+	union {
+		struct {
+			struct page *bounce_page;       /* Ciphertext page */
+			struct page *control_page;      /* Original page  */
+		} w;
+		struct {
+			struct bio *bio;
+			struct work_struct work;
+		} r;
+		struct list_head free_list;     /* Free list */
+	};
+	char flags;                      /* Flags */
+	char mode;                       /* Encryption mode for tfm */
 };
 
 struct ext4_completion_result {
@@ -121,18 +136,6 @@ struct ext4_str {
 	u32 len;
 };
 
-struct ext4_fname_crypto_ctx {
-	u32 lim;
-	char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE];
-	struct crypto_ablkcipher *ctfm;
-	struct crypto_hash *htfm;
-	struct page *workpage;
-	struct ext4_encryption_key key;
-	unsigned flags : 8;
-	unsigned has_valid_key : 1;
-	unsigned ctfm_key_is_ready : 1;
-};
-
 /**
  * For encrypted symlinks, the ciphertext length is stored at the beginning
  * of the string in little-endian format.
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 87ba10d1d..2553aa8b6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
+#include <linux/backing-dev.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
@@ -4456,6 +4457,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		ar.flags |= EXT4_MB_USE_RESERVED;
 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 	if (!newblock)
 		goto out2;
@@ -4663,6 +4666,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
+	int depth = 0;
 	struct ext4_map_blocks map;
 	unsigned int credits;
 	loff_t epos;
@@ -4677,13 +4681,32 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	if (len <= EXT_UNWRITTEN_MAX_LEN)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
 	credits = ext4_chunk_trans_blocks(inode, len);
+	/*
+	 * We can only call ext_depth() on extent based inodes
+	 */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		depth = ext_depth(inode);
+	else
+		depth = -1;
 
 retry:
 	while (ret >= 0 && len) {
+		/*
+		 * Recalculate credits when extent tree depth changes.
+		 */
+		if (depth >= 0 && depth != ext_depth(inode)) {
+			credits = ext4_chunk_trans_blocks(inode, len);
+			depth = ext_depth(inode);
+		}
+
 		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
 					    credits);
 		if (IS_ERR(handle)) {
@@ -4725,6 +4748,8 @@ retry:
 		goto retry;
 	}
 
+	ext4_inode_resume_unlocked_dio(inode);
+
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4912,12 +4937,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	 * bug we should fix....
 	 */
 	if (ext4_encrypted_inode(inode) &&
-	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
+	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
+		     FALLOC_FL_ZERO_RANGE)))
 		return -EOPNOTSUPP;
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+		     FALLOC_FL_INSERT_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4930,6 +4957,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (mode & FALLOC_FL_COLLAPSE_RANGE)
 		return ext4_collapse_range(inode, offset, len);
 
+	if (mode & FALLOC_FL_INSERT_RANGE)
+		return ext4_insert_range(inode, offset, len);
+
 	if (mode & FALLOC_FL_ZERO_RANGE)
 		return ext4_zero_range(file, offset, len, mode);
 
@@ -5224,13 +5254,13 @@ ext4_access_path(handle_t *handle, struct inode *inode,
 /*
  * ext4_ext_shift_path_extents:
  * Shift the extents of a path structure lying between path[depth].p_ext
- * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
- * from starting block for each extent.
+ * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
+ * if it is right shift or left shift operation.
  */
 static int
 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 			    struct inode *inode, handle_t *handle,
-			    ext4_lblk_t *start)
+			    enum SHIFT_DIRECTION SHIFT)
 {
 	int depth, err = 0;
 	struct ext4_extent *ex_start, *ex_last;
@@ -5252,19 +5282,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
 				update = 1;
 
-			*start = le32_to_cpu(ex_last->ee_block) +
-				ext4_ext_get_actual_len(ex_last);
-
 			while (ex_start <= ex_last) {
-				le32_add_cpu(&ex_start->ee_block, -shift);
-				/* Try to merge to the left. */
-				if ((ex_start >
-				     EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
-				    ext4_ext_try_to_merge_right(inode,
-							path, ex_start - 1))
+				if (SHIFT == SHIFT_LEFT) {
+					le32_add_cpu(&ex_start->ee_block,
+						-shift);
+					/* Try to merge to the left. */
+					if ((ex_start >
+					    EXT_FIRST_EXTENT(path[depth].p_hdr))
+					    &&
+					    ext4_ext_try_to_merge_right(inode,
+					    path, ex_start - 1))
+						ex_last--;
+					else
+						ex_start++;
+				} else {
+					le32_add_cpu(&ex_last->ee_block, shift);
+					ext4_ext_try_to_merge_right(inode, path,
+						ex_last);
 					ex_last--;
-				else
-					ex_start++;
+				}
 			}
 			err = ext4_ext_dirty(handle, inode, path + depth);
 			if (err)
@@ -5279,7 +5315,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 		if (err)
 			goto out;
 
-		le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+		if (SHIFT == SHIFT_LEFT)
+			le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+		else
+			le32_add_cpu(&path[depth].p_idx->ei_block, shift);
 		err = ext4_ext_dirty(handle, inode, path + depth);
 		if (err)
 			goto out;
@@ -5297,19 +5336,20 @@ out:
 
 /*
  * ext4_ext_shift_extents:
- * All the extents which lies in the range from start to the last allocated
- * block for the file are shifted downwards by shift blocks.
+ * All the extents which lies in the range from @start to the last allocated
+ * block for the @inode are shifted either towards left or right (depending
+ * upon @SHIFT) by @shift blocks.
  * On success, 0 is returned, error otherwise.
  */
 static int
 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
-		       ext4_lblk_t start, ext4_lblk_t shift)
+		       ext4_lblk_t start, ext4_lblk_t shift,
+		       enum SHIFT_DIRECTION SHIFT)
 {
 	struct ext4_ext_path *path;
 	int ret = 0, depth;
 	struct ext4_extent *extent;
-	ext4_lblk_t stop_block;
-	ext4_lblk_t ex_start, ex_end;
+	ext4_lblk_t stop, *iterator, ex_start, ex_end;
 
 	/* Let path point to the last extent */
 	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
@@ -5321,58 +5361,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	if (!extent)
 		goto out;
 
-	stop_block = le32_to_cpu(extent->ee_block) +
+	stop = le32_to_cpu(extent->ee_block) +
 			ext4_ext_get_actual_len(extent);
 
-	/* Nothing to shift, if hole is at the end of file */
-	if (start >= stop_block)
-		goto out;
+       /*
+	 * In case of left shift, Don't start shifting extents until we make
+	 * sure the hole is big enough to accommodate the shift.
+	*/
+	if (SHIFT == SHIFT_LEFT) {
+		path = ext4_find_extent(inode, start - 1, &path, 0);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+		depth = path->p_depth;
+		extent =  path[depth].p_ext;
+		if (extent) {
+			ex_start = le32_to_cpu(extent->ee_block);
+			ex_end = le32_to_cpu(extent->ee_block) +
+				ext4_ext_get_actual_len(extent);
+		} else {
+			ex_start = 0;
+			ex_end = 0;
+		}
 
-	/*
-	 * Don't start shifting extents until we make sure the hole is big
-	 * enough to accomodate the shift.
-	 */
-	path = ext4_find_extent(inode, start - 1, &path, 0);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-	depth = path->p_depth;
-	extent =  path[depth].p_ext;
-	if (extent) {
-		ex_start = le32_to_cpu(extent->ee_block);
-		ex_end = le32_to_cpu(extent->ee_block) +
-			ext4_ext_get_actual_len(extent);
-	} else {
-		ex_start = 0;
-		ex_end = 0;
+		if ((start == ex_start && shift > ex_start) ||
+		    (shift > start - ex_end)) {
+			ext4_ext_drop_refs(path);
+			kfree(path);
+			return -EINVAL;
+		}
 	}
 
-	if ((start == ex_start && shift > ex_start) ||
-	    (shift > start - ex_end))
-		return -EINVAL;
+	/*
+	 * In case of left shift, iterator points to start and it is increased
+	 * till we reach stop. In case of right shift, iterator points to stop
+	 * and it is decreased till we reach start.
+	 */
+	if (SHIFT == SHIFT_LEFT)
+		iterator = &start;
+	else
+		iterator = &stop;
 
 	/* Its safe to start updating extents */
-	while (start < stop_block) {
-		path = ext4_find_extent(inode, start, &path, 0);
+	while (start < stop) {
+		path = ext4_find_extent(inode, *iterator, &path, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
 		extent = path[depth].p_ext;
 		if (!extent) {
 			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
-					 (unsigned long) start);
+					 (unsigned long) *iterator);
 			return -EIO;
 		}
-		if (start > le32_to_cpu(extent->ee_block)) {
+		if (SHIFT == SHIFT_LEFT && *iterator >
+		    le32_to_cpu(extent->ee_block)) {
 			/* Hole, move to the next extent */
 			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
 				path[depth].p_ext++;
 			} else {
-				start = ext4_ext_next_allocated_block(path);
+				*iterator = ext4_ext_next_allocated_block(path);
 				continue;
 			}
 		}
+
+		if (SHIFT == SHIFT_LEFT) {
+			extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+			*iterator = le32_to_cpu(extent->ee_block) +
+					ext4_ext_get_actual_len(extent);
+		} else {
+			extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
+			*iterator =  le32_to_cpu(extent->ee_block) > 0 ?
+				le32_to_cpu(extent->ee_block) - 1 : 0;
+			/* Update path extent in case we need to stop */
+			while (le32_to_cpu(extent->ee_block) < start)
+				extent++;
+			path[depth].p_ext = extent;
+		}
 		ret = ext4_ext_shift_path_extents(path, shift, inode,
-				handle, &start);
+				handle, SHIFT);
 		if (ret)
 			break;
 	}
@@ -5485,7 +5551,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	ext4_discard_preallocations(inode);
 
 	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
-				     punch_stop - punch_start);
+				     punch_stop - punch_start, SHIFT_LEFT);
 	if (ret) {
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto out_stop;
@@ -5510,6 +5576,174 @@ out_mutex:
 	return ret;
 }
 
+/*
+ * ext4_insert_range:
+ * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
+ * The data blocks starting from @offset to the EOF are shifted by @len
+ * towards right to create a hole in the @inode. Inode size is increased
+ * by len bytes.
+ * Returns 0 on success, error otherwise.
+ */
+int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct super_block *sb = inode->i_sb;
+	handle_t *handle;
+	struct ext4_ext_path *path;
+	struct ext4_extent *extent;
+	ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+	unsigned int credits, ee_len;
+	int ret = 0, depth, split_flag = 0;
+	loff_t ioffset;
+
+	/*
+	 * We need to test this early because xfstests assumes that an
+	 * insert range of (0, 1) will return EOPNOTSUPP if the file
+	 * system does not support insert range.
+	 */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return -EOPNOTSUPP;
+
+	/* Insert range works only on fs block size aligned offsets. */
+	if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
+			len & (EXT4_CLUSTER_SIZE(sb) - 1))
+		return -EINVAL;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	trace_ext4_insert_range(inode, offset, len);
+
+	offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+	len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	/* Call ext4_force_commit to flush all data in case of data=journal */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Need to round down to align start offset to page size boundary
+	 * for page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+			LLONG_MAX);
+	if (ret)
+		return ret;
+
+	/* Take mutex lock */
+	mutex_lock(&inode->i_mutex);
+
+	/* Currently just for extent based files */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	/* Check for wrap through zero */
+	if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+		ret = -EFBIG;
+		goto out_mutex;
+	}
+
+	/* Offset should be less than i_size */
+	if (offset >= i_size_read(inode)) {
+		ret = -EINVAL;
+		goto out_mutex;
+	}
+
+	truncate_pagecache(inode, ioffset);
+
+	/* Wait for existing dio to complete */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	credits = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_dio;
+	}
+
+	/* Expand file to avoid data loss if there is error while shifting */
+	inode->i_size += len;
+	EXT4_I(inode)->i_disksize += len;
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ret = ext4_mark_inode_dirty(handle, inode);
+	if (ret)
+		goto out_stop;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+	if (IS_ERR(path)) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	depth = ext_depth(inode);
+	extent = path[depth].p_ext;
+	if (extent) {
+		ee_start_lblk = le32_to_cpu(extent->ee_block);
+		ee_len = ext4_ext_get_actual_len(extent);
+
+		/*
+		 * If offset_lblk is not the starting block of extent, split
+		 * the extent @offset_lblk
+		 */
+		if ((offset_lblk > ee_start_lblk) &&
+				(offset_lblk < (ee_start_lblk + ee_len))) {
+			if (ext4_ext_is_unwritten(extent))
+				split_flag = EXT4_EXT_MARK_UNWRIT1 |
+					EXT4_EXT_MARK_UNWRIT2;
+			ret = ext4_split_extent_at(handle, inode, &path,
+					offset_lblk, split_flag,
+					EXT4_EX_NOCACHE |
+					EXT4_GET_BLOCKS_PRE_IO |
+					EXT4_GET_BLOCKS_METADATA_NOFAIL);
+		}
+
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		if (ret < 0) {
+			up_write(&EXT4_I(inode)->i_data_sem);
+			goto out_stop;
+		}
+	}
+
+	ret = ext4_es_remove_extent(inode, offset_lblk,
+			EXT_MAX_BLOCKS - offset_lblk);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
+	}
+
+	/*
+	 * if offset_lblk lies in a hole which is at start of file, use
+	 * ee_start_lblk to shift extents
+	 */
+	ret = ext4_ext_shift_extents(inode, handle,
+		ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
+		len_lblk, SHIFT_RIGHT);
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+out_stop:
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 /**
  * ext4_swap_extents - Swap extents between two inodes
  *
@@ -5542,7 +5776,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
 	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
-	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+	BUG_ON(!mutex_is_locked(&inode2->i_mutex));
 
 	*erp = ext4_es_remove_extent(inode1, lblk1, count);
 	if (unlikely(*erp))
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0613c256c..bc313ac5d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block);
+	return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 					/* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block);
+	return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
@@ -223,9 +235,11 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct inode *inode = file->f_mapping->host;
 
 	if (ext4_encrypted_inode(inode)) {
-		int err = ext4_generate_encryption_key(inode);
+		int err = ext4_get_encryption_info(inode);
 		if (err)
 			return 0;
+		if (ext4_encryption_info(inode) == NULL)
+			return -ENOKEY;
 	}
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
@@ -278,6 +292,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 			ext4_journal_stop(handle);
 		}
 	}
+	if (ext4_encrypted_inode(inode)) {
+		ret = ext4_get_encryption_info(inode);
+		if (ret)
+			return -EACCES;
+		if (ext4_encryption_info(inode) == NULL)
+			return -ENOKEY;
+	}
 	/*
 	 * Set up the jbd2_inode if we are opening the inode for
 	 * writing and the journal is present
@@ -287,13 +308,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		if (ret < 0)
 			return ret;
 	}
-	ret = dquot_file_open(inode, filp);
-	if (!ret && ext4_encrypted_inode(inode)) {
-		ret = ext4_generate_encryption_key(inode);
-		if (ret)
-			ret = -EACCES;
-	}
-	return ret;
+	return dquot_file_open(inode, filp);
 }
 
 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1eaa6cb96..173c1ae21 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -726,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	ext4_group_t i;
 	ext4_group_t flex_group;
 	struct ext4_group_info *grp;
+	int encrypt = 0;
 
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
 		return ERR_PTR(-EPERM);
 
+	if ((ext4_encrypted_inode(dir) ||
+	     DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
+	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+		err = ext4_get_encryption_info(dir);
+		if (err)
+			return ERR_PTR(err);
+		if (ext4_encryption_info(dir) == NULL)
+			return ERR_PTR(-EPERM);
+		if (!handle)
+			nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
+		encrypt = 1;
+	}
+
 	sb = dir->i_sb;
 	ngroups = ext4_get_groups_count(sb);
 	trace_ext4_request_inode(dir, mode);
@@ -996,12 +1010,6 @@ got:
 	ei->i_block_group = group;
 	ei->i_last_alloc_group = ~0;
 
-	/* If the directory encrypted, then we should encrypt the inode. */
-	if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
-	    (ext4_encrypted_inode(dir) ||
-	     DUMMY_ENCRYPTION_ENABLED(sbi)))
-		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
-
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
 		ext4_handle_sync(handle);
@@ -1034,28 +1042,9 @@ got:
 	ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) &&
-	    (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) {
-		ei->i_inline_off = 0;
-		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-			EXT4_FEATURE_INCOMPAT_INLINE_DATA))
-			ext4_set_inode_state(inode,
-			EXT4_STATE_MAY_INLINE_DATA);
-	} else {
-		/* Inline data and encryption are incompatible
-		 * We turn off inline data since encryption is enabled */
-		ei->i_inline_off = 1;
-		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-			EXT4_FEATURE_INCOMPAT_INLINE_DATA))
-			ext4_clear_inode_state(inode,
-			EXT4_STATE_MAY_INLINE_DATA);
-	}
-#else
 	ei->i_inline_off = 0;
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
 		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-#endif
 	ret = inode;
 	err = dquot_alloc_inode(inode);
 	if (err)
@@ -1082,6 +1071,12 @@ got:
 		ei->i_datasync_tid = handle->h_transaction->t_tid;
 	}
 
+	if (encrypt) {
+		err = ext4_inherit_context(dir, inode);
+		if (err)
+			goto fail_free_drop;
+	}
+
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 94ae6874c..4f6ac499f 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -576,6 +576,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = EXT4_MB_HINT_DATA;
 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		ar.flags |= EXT4_MB_USE_RESERVED;
 
 	ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 095c7a258..cd944a7a9 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -995,20 +995,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
  * and -EEXIST if directory entry already exists.
  */
 static int ext4_add_dirent_to_inline(handle_t *handle,
+				     struct ext4_filename *fname,
 				     struct dentry *dentry,
 				     struct inode *inode,
 				     struct ext4_iloc *iloc,
 				     void *inline_start, int inline_size)
 {
 	struct inode	*dir = d_inode(dentry->d_parent);
-	const char	*name = dentry->d_name.name;
-	int		namelen = dentry->d_name.len;
 	int		err;
 	struct ext4_dir_entry_2 *de;
 
-	err = ext4_find_dest_de(dir, inode, iloc->bh,
-				inline_start, inline_size,
-				name, namelen, &de);
+	err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+				inline_size, fname, &de);
 	if (err)
 		return err;
 
@@ -1016,8 +1014,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	err = ext4_journal_get_write_access(handle, iloc->bh);
 	if (err)
 		return err;
-	ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name,
-			   name, namelen);
+	ext4_insert_dentry(dir, inode, de, inline_size, fname);
 
 	ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
 
@@ -1248,8 +1245,8 @@ out:
  * If succeeds, return 0. If not, extended the inline dir and copied data to
  * the new created block.
  */
-int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
-			      struct inode *inode)
+int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
+			      struct dentry *dentry, struct inode *inode)
 {
 	int ret, inline_size;
 	void *inline_start;
@@ -1268,7 +1265,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
 						 EXT4_INLINE_DOTDOT_SIZE;
 	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
 
-	ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+	ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
 					inline_start, inline_size);
 	if (ret != -ENOSPC)
 		goto out;
@@ -1289,8 +1286,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
 	if (inline_size) {
 		inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
 
-		ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
-						inline_start, inline_size);
+		ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+						inode, &iloc, inline_start,
+						inline_size);
 
 		if (ret != -ENOSPC)
 			goto out;
@@ -1611,6 +1609,7 @@ out:
 }
 
 struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+					struct ext4_filename *fname,
 					const struct qstr *d_name,
 					struct ext4_dir_entry_2 **res_dir,
 					int *has_inline_data)
@@ -1632,8 +1631,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
 	inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
 						EXT4_INLINE_DOTDOT_SIZE;
 	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
-	ret = search_dir(iloc.bh, inline_start, inline_size,
-			 dir, d_name, 0, res_dir);
+	ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+			      dir, fname, d_name, 0, res_dir);
 	if (ret == 1)
 		goto out_find;
 	if (ret < 0)
@@ -1645,8 +1644,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
 	inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
 	inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
 
-	ret = search_dir(iloc.bh, inline_start, inline_size,
-			 dir, d_name, 0, res_dir);
+	ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+			      dir, fname, d_name, 0, res_dir);
 	if (ret == 1)
 		goto out_find;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 966c61482..cecf9aa10 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+		if (IS_DAX(inode) && buffer_unwritten(bh)) {
+			/*
+			 * dgc: I suspect unwritten conversion on ext4+DAX is
+			 * fundamentally broken here when there are concurrent
+			 * read/write in progress on this inode.
+			 */
+			WARN_ON_ONCE(io_end);
 			bh->b_assoc_map = inode->i_mapping;
 			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
 		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
@@ -731,18 +724,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
-				ext4_lblk_t block, int create)
+				ext4_lblk_t block, int map_flags)
 {
 	struct ext4_map_blocks map;
 	struct buffer_head *bh;
+	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
 	int err;
 
 	J_ASSERT(handle != NULL || create == 0);
 
 	map.m_lblk = block;
 	map.m_len = 1;
-	err = ext4_map_blocks(handle, inode, &map,
-			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+	err = ext4_map_blocks(handle, inode, &map, map_flags);
 
 	if (err == 0)
 		return create ? ERR_PTR(-ENOSPC) : NULL;
@@ -788,11 +781,11 @@ errout:
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t block, int create)
+			       ext4_lblk_t block, int map_flags)
 {
 	struct buffer_head *bh;
 
-	bh = ext4_getblk(handle, inode, block, create);
+	bh = ext4_getblk(handle, inode, block, map_flags);
 	if (IS_ERR(bh))
 		return bh;
 	if (!bh || buffer_uptodate(bh))
@@ -1261,13 +1254,12 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
- * Reserve a single cluster located at lblock
+ * Reserve space for a single cluster
  */
-static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+static int ext4_da_reserve_space(struct inode *inode)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned int md_needed;
 	int ret;
 
 	/*
@@ -1279,25 +1271,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	if (ret)
 		return ret;
 
-	/*
-	 * recalculate the amount of metadata blocks to reserve
-	 * in order to allocate nrblocks
-	 * worse case is one extent per block
-	 */
 	spin_lock(&ei->i_block_reservation_lock);
-	/*
-	 * ext4_calc_metadata_amount() has side effects, which we have
-	 * to be prepared undo if we fail to claim space.
-	 */
-	md_needed = 0;
-	trace_ext4_da_reserve_space(inode, 0);
-
 	if (ext4_claim_free_clusters(sbi, 1, 0)) {
 		spin_unlock(&ei->i_block_reservation_lock);
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		return -ENOSPC;
 	}
 	ei->i_reserved_data_blocks++;
+	trace_ext4_da_reserve_space(inode);
 	spin_unlock(&ei->i_block_reservation_lock);
 
 	return 0;       /* success */
@@ -1575,9 +1556,9 @@ add_delayed:
 		 * then we don't need to reserve it again. However we still need
 		 * to reserve metadata for every block we're going to write.
 		 */
-		if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
+		if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
 		    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
-			ret = ext4_da_reserve_space(inode, iblock);
+			ret = ext4_da_reserve_space(inode);
 			if (ret) {
 				/* not enough space to reserve */
 				retval = ret;
@@ -4237,8 +4218,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext4_dir_inode_operations;
 		inode->i_fop = &ext4_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext4_inode_is_fast_symlink(inode) &&
-		    !ext4_encrypted_inode(inode)) {
+		if (ext4_encrypted_inode(inode)) {
+			inode->i_op = &ext4_encrypted_symlink_inode_operations;
+			ext4_set_aops(inode);
+		} else if (ext4_inode_is_fast_symlink(inode)) {
+			inode->i_link = (char *)ei->i_data;
 			inode->i_op = &ext4_fast_symlink_inode_operations;
 			nd_terminate_link(ei->i_data, inode->i_size,
 				sizeof(ei->i_data) - 1);
@@ -4707,8 +4691,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		ext4_journal_stop(handle);
 	}
 
-	if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+	if (attr->ia_valid & ATTR_SIZE) {
 		handle_t *handle;
+		loff_t oldsize = inode->i_size;
+		int shrink = (attr->ia_size <= inode->i_size);
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4716,24 +4702,26 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
 				return -EFBIG;
 		}
+		if (!S_ISREG(inode->i_mode))
+			return -EINVAL;
 
 		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
 			inode_inc_iversion(inode);
 
-		if (S_ISREG(inode->i_mode) &&
+		if (ext4_should_order_data(inode) &&
 		    (attr->ia_size < inode->i_size)) {
-			if (ext4_should_order_data(inode)) {
-				error = ext4_begin_ordered_truncate(inode,
+			error = ext4_begin_ordered_truncate(inode,
 							    attr->ia_size);
-				if (error)
-					goto err_out;
-			}
+			if (error)
+				goto err_out;
+		}
+		if (attr->ia_size != inode->i_size) {
 			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 			if (IS_ERR(handle)) {
 				error = PTR_ERR(handle);
 				goto err_out;
 			}
-			if (ext4_handle_valid(handle)) {
+			if (ext4_handle_valid(handle) && shrink) {
 				error = ext4_orphan_add(handle, inode);
 				orphan = 1;
 			}
@@ -4752,15 +4740,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			up_write(&EXT4_I(inode)->i_data_sem);
 			ext4_journal_stop(handle);
 			if (error) {
-				ext4_orphan_del(NULL, inode);
+				if (orphan)
+					ext4_orphan_del(NULL, inode);
 				goto err_out;
 			}
-		} else {
-			loff_t oldsize = inode->i_size;
-
-			i_size_write(inode, attr->ia_size);
-			pagecache_isize_extended(inode, oldsize, inode->i_size);
 		}
+		if (!shrink)
+			pagecache_isize_extended(inode, oldsize, inode->i_size);
 
 		/*
 		 * Blocks are going to be removed from the inode. Wait
@@ -4780,13 +4766,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		 * in data=journal mode to make pages freeable.
 		 */
 		truncate_pagecache(inode, inode->i_size);
+		if (shrink)
+			ext4_truncate(inode);
 	}
-	/*
-	 * We want to call ext4_truncate() even if attr->ia_size ==
-	 * inode->i_size for cases like truncation of fallocated space
-	 */
-	if (attr->ia_valid & ATTR_SIZE)
-		ext4_truncate(inode);
 
 	if (!rc) {
 		setattr_copy(inode, attr);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2cb9e178d..1346cfa35 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -31,14 +31,11 @@
 static void memswap(void *a, void *b, size_t len)
 {
 	unsigned char *ap, *bp;
-	unsigned char tmp;
 
 	ap = (unsigned char *)a;
 	bp = (unsigned char *)b;
 	while (len-- > 0) {
-		tmp = *ap;
-		*ap = *bp;
-		*bp = tmp;
+		swap(*ap, *bp);
 		ap++;
 		bp++;
 	}
@@ -675,8 +672,8 @@ encryption_policy_out:
 			if (err)
 				return err;
 		}
-		if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt,
-				 16))
+		if (copy_to_user((void __user *) arg,
+				 sbi->s_es->s_encrypt_pw_salt, 16))
 			return -EFAULT;
 		return 0;
 	}
@@ -690,7 +687,7 @@ encryption_policy_out:
 		err = ext4_get_policy(inode, &policy);
 		if (err)
 			return err;
-		if (copy_to_user((void *)arg, &policy, sizeof(policy)))
+		if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
 			return -EFAULT;
 		return 0;
 #else
@@ -758,7 +755,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return err;
 	}
 	case EXT4_IOC_MOVE_EXT:
-	case FITRIM:
 	case EXT4_IOC_RESIZE_FS:
 	case EXT4_IOC_PRECACHE_EXTENTS:
 	case EXT4_IOC_SET_ENCRYPTION_POLICY:
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 41260489d..34b610ea5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -26,6 +26,7 @@
 #include <linux/log2.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <trace/events/ext4.h>
 
 #ifdef CONFIG_EXT4_DEBUG
@@ -882,10 +883,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	/* wait for I/O completion */
 	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
-		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
 			err = -EIO;
-			goto out;
-		}
 	}
 
 	first_block = page->index * blocks_per_page;
@@ -898,6 +897,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			/* skip initialized uptodate buddy */
 			continue;
 
+		if (!buffer_verified(bh[group - first_group]))
+			/* Skip faulty bitmaps */
+			continue;
+		err = 0;
+
 		/*
 		 * data carry information regarding this
 		 * particular group in the format specified
@@ -2008,7 +2012,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }
 
-/* This is now called BEFORE we load the buddy bitmap. */
+/*
+ * This is now called BEFORE we load the buddy bitmap.
+ * Returns either 1 or 0 indicating that the group is either suitable
+ * for the allocation or not. In addition it can also return negative
+ * error code when something goes wrong.
+ */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
@@ -2031,7 +2040,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
 		int ret = ext4_mb_init_group(ac->ac_sb, group);
 		if (ret)
-			return 0;
+			return ret;
 	}
 
 	fragments = grp->bb_fragments;
@@ -2078,7 +2087,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
 	ext4_group_t ngroups, group, i;
 	int cr;
-	int err = 0;
+	int err = 0, first_err = 0;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
@@ -2145,6 +2154,7 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
+			int ret = 0;
 			cond_resched();
 			/*
 			 * Artificially restricted ngroups for non-extent
@@ -2154,8 +2164,12 @@ repeat:
 				group = 0;
 
 			/* This now checks without needing the buddy page */
-			if (!ext4_mb_good_group(ac, group, cr))
+			ret = ext4_mb_good_group(ac, group, cr);
+			if (ret <= 0) {
+				if (!first_err)
+					first_err = ret;
 				continue;
+			}
 
 			err = ext4_mb_load_buddy(sb, group, &e4b);
 			if (err)
@@ -2167,9 +2181,12 @@ repeat:
 			 * We need to check again after locking the
 			 * block group
 			 */
-			if (!ext4_mb_good_group(ac, group, cr)) {
+			ret = ext4_mb_good_group(ac, group, cr);
+			if (ret <= 0) {
 				ext4_unlock_group(sb, group);
 				ext4_mb_unload_buddy(&e4b);
+				if (!first_err)
+					first_err = ret;
 				continue;
 			}
 
@@ -2216,6 +2233,8 @@ repeat:
 		}
 	}
 out:
+	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
+		err = first_err;
 	return err;
 }
 
@@ -2257,12 +2276,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 
 	group--;
 	if (group == 0)
-		seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
-				"[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
-				  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
-			   "group", "free", "frags", "first",
-			   "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
-			   "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+		seq_puts(seq, "#group: free  frags first ["
+			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
+			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]");
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
 		sizeof(struct ext4_group_info);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 370420bfa..fb6f11709 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -166,12 +166,9 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 	 */
 	wait_on_page_writeback(page[0]);
 	wait_on_page_writeback(page[1]);
-	if (inode1 > inode2) {
-		struct page *tmp;
-		tmp = page[0];
-		page[0] = page[1];
-		page[1] = tmp;
-	}
+	if (inode1 > inode2)
+		swap(page[0], page[1]);
+
 	return 0;
 }
 
@@ -574,12 +571,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-	/* TODO: This is non obvious task to swap blocks for inodes with full
-	   jornaling enabled */
+
+	/* TODO: it's not obvious how to swap blocks for inodes with full
+	   journaling enabled */
 	if (ext4_should_journal_data(orig_inode) ||
 	    ext4_should_journal_data(donor_inode)) {
-		return -EINVAL;
+		ext4_msg(orig_inode->i_sb, KERN_ERR,
+			 "Online defrag not supported with data journaling");
+		return -EOPNOTSUPP;
 	}
+
 	/* Protect orig and donor inodes against a truncate */
 	lock_two_nondirectories(orig_inode, donor_inode);
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 814f3beb4..011dcfb5c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -61,7 +61,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
 
 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
 
-	bh = ext4_bread(handle, inode, *block, 1);
+	bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
 	if (IS_ERR(bh))
 		return bh;
 	inode->i_size += inode->i_sb->s_blocksize;
@@ -84,12 +84,13 @@ typedef enum {
 } dirblock_type_t;
 
 #define ext4_read_dirblock(inode, block, type) \
-	__ext4_read_dirblock((inode), (block), (type), __LINE__)
+	__ext4_read_dirblock((inode), (block), (type), __func__, __LINE__)
 
 static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
-					      ext4_lblk_t block,
-					      dirblock_type_t type,
-					      unsigned int line)
+						ext4_lblk_t block,
+						dirblock_type_t type,
+						const char *func,
+						unsigned int line)
 {
 	struct buffer_head *bh;
 	struct ext4_dir_entry *dirent;
@@ -97,15 +98,17 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 
 	bh = ext4_bread(NULL, inode, block, 0);
 	if (IS_ERR(bh)) {
-		__ext4_warning(inode->i_sb, __func__, line,
-			       "error %ld reading directory block "
-			       "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino,
-			       (unsigned long) block);
+		__ext4_warning(inode->i_sb, func, line,
+			       "inode #%lu: lblock %lu: comm %s: "
+			       "error %ld reading directory block",
+			       inode->i_ino, (unsigned long)block,
+			       current->comm, PTR_ERR(bh));
 
 		return bh;
 	}
 	if (!bh) {
-		ext4_error_inode(inode, __func__, line, block, "Directory hole found");
+		ext4_error_inode(inode, func, line, block,
+				 "Directory hole found");
 		return ERR_PTR(-EIO);
 	}
 	dirent = (struct ext4_dir_entry *) bh->b_data;
@@ -119,7 +122,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 			is_dx_block = 1;
 	}
 	if (!is_dx_block && type == INDEX) {
-		ext4_error_inode(inode, __func__, line, block,
+		ext4_error_inode(inode, func, line, block,
 		       "directory leaf block found instead of index block");
 		return ERR_PTR(-EIO);
 	}
@@ -136,8 +139,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		if (ext4_dx_csum_verify(inode, dirent))
 			set_buffer_verified(bh);
 		else {
-			ext4_error_inode(inode, __func__, line, block,
-				"Directory index failed checksum");
+			ext4_error_inode(inode, func, line, block,
+					 "Directory index failed checksum");
 			brelse(bh);
 			return ERR_PTR(-EIO);
 		}
@@ -146,8 +149,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		if (ext4_dirent_csum_verify(inode, dirent))
 			set_buffer_verified(bh);
 		else {
-			ext4_error_inode(inode, __func__, line, block,
-				"Directory block failed checksum");
+			ext4_error_inode(inode, func, line, block,
+					 "Directory block failed checksum");
 			brelse(bh);
 			return ERR_PTR(-EIO);
 		}
@@ -248,7 +251,7 @@ static void dx_set_count(struct dx_entry *entries, unsigned value);
 static void dx_set_limit(struct dx_entry *entries, unsigned value);
 static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
 static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(const struct qstr *d_name,
+static struct dx_frame *dx_probe(struct ext4_filename *fname,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame);
@@ -267,10 +270,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frames,
 				 __u32 *start_hash);
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
-		const struct qstr *d_name,
+		struct ext4_filename *fname,
 		struct ext4_dir_entry_2 **res_dir);
-static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
-			     struct inode *inode);
+static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+			     struct dentry *dentry, struct inode *inode);
 
 /* checksumming functions */
 void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -327,10 +330,14 @@ static __le32 ext4_dirent_csum(struct inode *inode,
 	return cpu_to_le32(csum);
 }
 
-static void warn_no_space_for_csum(struct inode *inode)
+#define warn_no_space_for_csum(inode)					\
+	__warn_no_space_for_csum((inode), __func__, __LINE__)
+
+static void __warn_no_space_for_csum(struct inode *inode, const char *func,
+				     unsigned int line)
 {
-	ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
-		     "checksum.  Please run e2fsck -D.", inode->i_ino);
+	__ext4_warning_inode(inode, func, line,
+		"No space for directory leaf checksum. Please run e2fsck -D.");
 }
 
 int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
@@ -607,17 +614,15 @@ static struct stats dx_show_leaf(struct inode *dir,
 				char *name;
 				struct ext4_str fname_crypto_str
 					= {.name = NULL, .len = 0};
-				struct ext4_fname_crypto_ctx *ctx = NULL;
-				int res;
+				int res = 0;
 
 				name  = de->name;
 				len = de->name_len;
-				ctx = ext4_get_fname_crypto_ctx(dir,
-								EXT4_NAME_LEN);
-				if (IS_ERR(ctx)) {
-					printk(KERN_WARNING "Error acquiring"
-					" crypto ctxt--skipping crypto\n");
-					ctx = NULL;
+				if (ext4_encrypted_inode(inode))
+					res = ext4_get_encryption_info(dir);
+				if (res) {
+					printk(KERN_WARNING "Error setting up"
+					       " fname crypto: %d\n", res);
 				}
 				if (ctx == NULL) {
 					/* Directory is not encrypted */
@@ -637,7 +642,6 @@ static struct stats dx_show_leaf(struct inode *dir,
 							"allocating crypto "
 							"buffer--skipping "
 							"crypto\n");
-						ext4_put_fname_crypto_ctx(&ctx);
 						ctx = NULL;
 					}
 					res = ext4_fname_disk_to_usr(ctx, NULL, de,
@@ -658,7 +662,6 @@ static struct stats dx_show_leaf(struct inode *dir,
 					printk("%*.s:(E)%x.%u ", len, name,
 					       h.hash, (unsigned) ((char *) de
 								   - base));
-					ext4_put_fname_crypto_ctx(&ctx);
 					ext4_fname_crypto_free_buffer(
 						&fname_crypto_str);
 				}
@@ -724,7 +727,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
  * back to userspace.
  */
 static struct dx_frame *
-dx_probe(const struct qstr *d_name, struct inode *dir,
+dx_probe(struct ext4_filename *fname, struct inode *dir,
 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in)
 {
 	unsigned count, indirect;
@@ -742,56 +745,41 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	if (root->info.hash_version != DX_HASH_TEA &&
 	    root->info.hash_version != DX_HASH_HALF_MD4 &&
 	    root->info.hash_version != DX_HASH_LEGACY) {
-		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
-			     root->info.hash_version);
+		ext4_warning_inode(dir, "Unrecognised inode hash code %u",
+				   root->info.hash_version);
 		goto fail;
 	}
+	if (fname)
+		hinfo = &fname->hinfo;
 	hinfo->hash_version = root->info.hash_version;
 	if (hinfo->hash_version <= DX_HASH_TEA)
 		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	if (d_name) {
-		struct ext4_fname_crypto_ctx *ctx = NULL;
-		int res;
-
-		/* Check if the directory is encrypted */
-		ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-		if (IS_ERR(ctx)) {
-			ret_err = ERR_PTR(PTR_ERR(ctx));
-			goto fail;
-		}
-		res = ext4_fname_usr_to_hash(ctx, d_name, hinfo);
-		if (res < 0) {
-			ret_err = ERR_PTR(res);
-			goto fail;
-		}
-		ext4_put_fname_crypto_ctx(&ctx);
-	}
-#else
-	if (d_name)
-		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
-#endif
+	if (fname && fname_name(fname))
+		ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
-		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
-			     root->info.unused_flags);
+		ext4_warning_inode(dir, "Unimplemented hash flags: %#06x",
+				   root->info.unused_flags);
 		goto fail;
 	}
 
-	if ((indirect = root->info.indirect_levels) > 1) {
-		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
-			     root->info.indirect_levels);
+	indirect = root->info.indirect_levels;
+	if (indirect > 1) {
+		ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
+				   root->info.indirect_levels);
 		goto fail;
 	}
 
-	entries = (struct dx_entry *) (((char *)&root->info) +
-				       root->info.info_length);
+	entries = (struct dx_entry *)(((char *)&root->info) +
+				      root->info.info_length);
 
 	if (dx_get_limit(entries) != dx_root_limit(dir,
 						   root->info.info_length)) {
-		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
+		ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
+				   dx_get_limit(entries),
+				   dx_root_limit(dir, root->info.info_length));
 		goto fail;
 	}
 
@@ -799,15 +787,16 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	while (1) {
 		count = dx_get_count(entries);
 		if (!count || count > dx_get_limit(entries)) {
-			ext4_warning(dir->i_sb,
-				     "dx entry: no count or count > limit");
+			ext4_warning_inode(dir,
+					   "dx entry: count %u beyond limit %u",
+					   count, dx_get_limit(entries));
 			goto fail;
 		}
 
 		p = entries + 1;
 		q = entries + count - 1;
 		while (p <= q) {
-			m = p + (q - p)/2;
+			m = p + (q - p) / 2;
 			dxtrace(printk("."));
 			if (dx_get_hash(m) > hash)
 				q = m - 1;
@@ -831,7 +820,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 		}
 
 		at = p - 1;
-		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+		dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at),
+			       dx_get_block(at)));
 		frame->entries = entries;
 		frame->at = at;
 		if (!indirect--)
@@ -845,9 +835,10 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 		}
 		entries = ((struct dx_node *) frame->bh->b_data)->entries;
 
-		if (dx_get_limit(entries) != dx_node_limit (dir)) {
-			ext4_warning(dir->i_sb,
-				     "dx entry: limit != node limit");
+		if (dx_get_limit(entries) != dx_node_limit(dir)) {
+			ext4_warning_inode(dir,
+				"dx entry: limit %u != node limit %u",
+				dx_get_limit(entries), dx_node_limit(dir));
 			goto fail;
 		}
 	}
@@ -858,18 +849,17 @@ fail:
 	}
 
 	if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
-		ext4_warning(dir->i_sb,
-			     "Corrupt dir inode %lu, running e2fsck is "
-			     "recommended.", dir->i_ino);
+		ext4_warning_inode(dir,
+			"Corrupt directory, running e2fsck is recommended");
 	return ret_err;
 }
 
-static void dx_release (struct dx_frame *frames)
+static void dx_release(struct dx_frame *frames)
 {
 	if (frames[0].bh == NULL)
 		return;
 
-	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+	if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
 		brelse(frames[1].bh);
 	brelse(frames[0].bh);
 }
@@ -962,7 +952,6 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de, *top;
 	int err = 0, count = 0;
-	struct ext4_fname_crypto_ctx *ctx = NULL;
 	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
 
 	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
@@ -977,17 +966,15 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 					   EXT4_DIR_REC_LEN(0));
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	/* Check if the directory is encrypted */
-	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
-		brelse(bh);
-		return err;
-	}
-	if (ctx != NULL) {
-		err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
+	if (ext4_encrypted_inode(dir)) {
+		err = ext4_get_encryption_info(dir);
+		if (err < 0) {
+			brelse(bh);
+			return err;
+		}
+		err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN,
 						     &fname_crypto_str);
 		if (err < 0) {
-			ext4_put_fname_crypto_ctx(&ctx);
 			brelse(bh);
 			return err;
 		}
@@ -1008,16 +995,17 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 			continue;
 		if (de->inode == 0)
 			continue;
-		if (ctx == NULL) {
-			/* Directory is not encrypted */
+		if (!ext4_encrypted_inode(dir)) {
 			tmp_str.name = de->name;
 			tmp_str.len = de->name_len;
 			err = ext4_htree_store_dirent(dir_file,
 				   hinfo->hash, hinfo->minor_hash, de,
 				   &tmp_str);
 		} else {
+			int save_len = fname_crypto_str.len;
+
 			/* Directory is encrypted */
-			err = ext4_fname_disk_to_usr(ctx, hinfo, de,
+			err = ext4_fname_disk_to_usr(dir, hinfo, de,
 						     &fname_crypto_str);
 			if (err < 0) {
 				count = err;
@@ -1026,6 +1014,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 			err = ext4_htree_store_dirent(dir_file,
 				   hinfo->hash, hinfo->minor_hash, de,
 					&fname_crypto_str);
+			fname_crypto_str.len = save_len;
 		}
 		if (err != 0) {
 			count = err;
@@ -1036,7 +1025,6 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 errout:
 	brelse(bh);
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-	ext4_put_fname_crypto_ctx(&ctx);
 	ext4_fname_crypto_free_buffer(&fname_crypto_str);
 #endif
 	return count;
@@ -1155,12 +1143,13 @@ errout:
 
 static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
+				  struct ext4_filename *fname,
 				  const struct qstr *d_name,
 				  unsigned int offset,
 				  struct ext4_dir_entry_2 **res_dir)
 {
-	return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
-			  d_name, offset, res_dir);
+	return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+			       fname, d_name, offset, res_dir);
 }
 
 /*
@@ -1242,54 +1231,54 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
  * `len <= EXT4_NAME_LEN' is guaranteed by caller.
  * `de != NULL' is guaranteed by caller.
  */
-static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx,
-			     struct ext4_str *fname_crypto_str,
-			     int len, const char * const name,
+static inline int ext4_match(struct ext4_filename *fname,
 			     struct ext4_dir_entry_2 *de)
 {
-	int res;
+	const void *name = fname_name(fname);
+	u32 len = fname_len(fname);
 
 	if (!de->inode)
 		return 0;
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-	if (ctx)
-		return ext4_fname_match(ctx, fname_crypto_str, len, name, de);
+	if (unlikely(!name)) {
+		if (fname->usr_fname->name[0] == '_') {
+			int ret;
+			if (de->name_len < 16)
+				return 0;
+			ret = memcmp(de->name + de->name_len - 16,
+				     fname->crypto_buf.name + 8, 16);
+			return (ret == 0) ? 1 : 0;
+		}
+		name = fname->crypto_buf.name;
+		len = fname->crypto_buf.len;
+	}
 #endif
-	if (len != de->name_len)
+	if (de->name_len != len)
 		return 0;
-	res = memcmp(name, de->name, len);
-	return (res == 0) ? 1 : 0;
+	return (memcmp(de->name, name, len) == 0) ? 1 : 0;
 }
 
 /*
  * Returns 0 if not found, -1 on failure, and 1 on success
  */
-int search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
-	       struct inode *dir, const struct qstr *d_name,
-	       unsigned int offset, struct ext4_dir_entry_2 **res_dir)
+int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
+		    struct inode *dir, struct ext4_filename *fname,
+		    const struct qstr *d_name,
+		    unsigned int offset, struct ext4_dir_entry_2 **res_dir)
 {
 	struct ext4_dir_entry_2 * de;
 	char * dlimit;
 	int de_len;
-	const char *name = d_name->name;
-	int namelen = d_name->len;
-	struct ext4_fname_crypto_ctx *ctx = NULL;
-	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
 	int res;
 
-	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-	if (IS_ERR(ctx))
-		return -1;
-
 	de = (struct ext4_dir_entry_2 *)search_buf;
 	dlimit = search_buf + buf_size;
 	while ((char *) de < dlimit) {
 		/* this code is executed quadratically often */
 		/* do minimal checking `by hand' */
 		if ((char *) de + de->name_len <= dlimit) {
-			res = ext4_match(ctx, &fname_crypto_str, namelen,
-					 name, de);
+			res = ext4_match(fname, de);
 			if (res < 0) {
 				res = -1;
 				goto return_result;
@@ -1322,8 +1311,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
 
 	res = 0;
 return_result:
-	ext4_put_fname_crypto_ctx(&ctx);
-	ext4_fname_crypto_free_buffer(&fname_crypto_str);
 	return res;
 }
 
@@ -1370,7 +1357,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 				   buffer */
 	int num = 0;
 	ext4_lblk_t  nblocks;
-	int i, namelen;
+	int i, namelen, retval;
+	struct ext4_filename fname;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
@@ -1378,14 +1366,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 
+	retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
+	if (retval)
+		return ERR_PTR(retval);
+
 	if (ext4_has_inline_data(dir)) {
 		int has_inline_data = 1;
-		ret = ext4_find_inline_entry(dir, d_name, res_dir,
+		ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir,
 					     &has_inline_data);
 		if (has_inline_data) {
 			if (inlined)
 				*inlined = 1;
-			return ret;
+			goto cleanup_and_exit;
 		}
 	}
 
@@ -1400,14 +1392,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 		goto restart;
 	}
 	if (is_dx(dir)) {
-		bh = ext4_dx_find_entry(dir, d_name, res_dir);
+		ret = ext4_dx_find_entry(dir, &fname, res_dir);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
 		 * old fashioned way.
 		 */
-		if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
-			return bh;
+		if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+			goto cleanup_and_exit;
 		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
 			       "falling back\n"));
 	}
@@ -1438,8 +1430,10 @@ restart:
 				num++;
 				bh = ext4_getblk(NULL, dir, b++, 0);
 				if (unlikely(IS_ERR(bh))) {
-					if (ra_max == 0)
-						return bh;
+					if (ra_max == 0) {
+						ret = bh;
+						goto cleanup_and_exit;
+					}
 					break;
 				}
 				bh_use[ra_max] = bh;
@@ -1469,7 +1463,7 @@ restart:
 			goto next;
 		}
 		set_buffer_verified(bh);
-		i = search_dirblock(bh, dir, d_name,
+		i = search_dirblock(bh, dir, &fname, d_name,
 			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT4_I(dir)->i_dir_start_lookup = block;
@@ -1500,15 +1494,17 @@ cleanup_and_exit:
 	/* Clean up the read-ahead blocks */
 	for (; ra_ptr < ra_max; ra_ptr++)
 		brelse(bh_use[ra_ptr]);
+	ext4_fname_free_filename(&fname);
 	return ret;
 }
 
-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
-		       struct ext4_dir_entry_2 **res_dir)
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+			struct ext4_filename *fname,
+			struct ext4_dir_entry_2 **res_dir)
 {
 	struct super_block * sb = dir->i_sb;
-	struct dx_hash_info	hinfo;
 	struct dx_frame frames[2], *frame;
+	const struct qstr *d_name = fname->usr_fname;
 	struct buffer_head *bh;
 	ext4_lblk_t block;
 	int retval;
@@ -1516,7 +1512,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	*res_dir = NULL;
 #endif
-	frame = dx_probe(d_name, dir, &hinfo, frames);
+	frame = dx_probe(fname, dir, NULL, frames);
 	if (IS_ERR(frame))
 		return (struct buffer_head *) frame;
 	do {
@@ -1525,7 +1521,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		if (IS_ERR(bh))
 			goto errout;
 
-		retval = search_dirblock(bh, dir, d_name,
+		retval = search_dirblock(bh, dir, fname, d_name,
 					 block << EXT4_BLOCK_SIZE_BITS(sb),
 					 res_dir);
 		if (retval == 1)
@@ -1537,12 +1533,12 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		}
 
 		/* Check to see if we should continue to search */
-		retval = ext4_htree_next_block(dir, hinfo.hash, frame,
+		retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
 					       frames, NULL);
 		if (retval < 0) {
-			ext4_warning(sb,
-			     "error %d reading index page in directory #%lu",
-			     retval, dir->i_ino);
+			ext4_warning_inode(dir,
+				"error %d reading directory index block",
+				retval);
 			bh = ERR_PTR(retval);
 			goto errout;
 		}
@@ -1796,32 +1792,16 @@ journal_error:
 int ext4_find_dest_de(struct inode *dir, struct inode *inode,
 		      struct buffer_head *bh,
 		      void *buf, int buf_size,
-		      const char *name, int namelen,
+		      struct ext4_filename *fname,
 		      struct ext4_dir_entry_2 **dest_de)
 {
 	struct ext4_dir_entry_2 *de;
-	unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+	unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
 	int nlen, rlen;
 	unsigned int offset = 0;
 	char *top;
-	struct ext4_fname_crypto_ctx *ctx = NULL;
-	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
 	int res;
 
-	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-	if (IS_ERR(ctx))
-		return -1;
-
-	if (ctx != NULL) {
-		/* Calculate record length needed to store the entry */
-		res = ext4_fname_crypto_namelen_on_disk(ctx, namelen);
-		if (res < 0) {
-			ext4_put_fname_crypto_ctx(&ctx);
-			return res;
-		}
-		reclen = EXT4_DIR_REC_LEN(res);
-	}
-
 	de = (struct ext4_dir_entry_2 *)buf;
 	top = buf + buf_size - reclen;
 	while ((char *) de <= top) {
@@ -1831,7 +1811,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
 			goto return_result;
 		}
 		/* Provide crypto context and crypto buffer to ext4 match */
-		res = ext4_match(ctx, &fname_crypto_str, namelen, name, de);
+		res = ext4_match(fname, de);
 		if (res < 0)
 			goto return_result;
 		if (res > 0) {
@@ -1853,8 +1833,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
 		res = 0;
 	}
 return_result:
-	ext4_put_fname_crypto_ctx(&ctx);
-	ext4_fname_crypto_free_buffer(&fname_crypto_str);
 	return res;
 }
 
@@ -1862,39 +1840,10 @@ int ext4_insert_dentry(struct inode *dir,
 		       struct inode *inode,
 		       struct ext4_dir_entry_2 *de,
 		       int buf_size,
-		       const struct qstr *iname,
-		       const char *name, int namelen)
+		       struct ext4_filename *fname)
 {
 
 	int nlen, rlen;
-	struct ext4_fname_crypto_ctx *ctx = NULL;
-	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
-	struct ext4_str tmp_str;
-	int res;
-
-	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-	if (IS_ERR(ctx))
-		return -EIO;
-	/* By default, the input name would be written to the disk */
-	tmp_str.name = (unsigned char *)name;
-	tmp_str.len = namelen;
-	if (ctx != NULL) {
-		/* Directory is encrypted */
-		res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
-						     &fname_crypto_str);
-		if (res < 0) {
-			ext4_put_fname_crypto_ctx(&ctx);
-			return -ENOMEM;
-		}
-		res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str);
-		if (res < 0) {
-			ext4_put_fname_crypto_ctx(&ctx);
-			ext4_fname_crypto_free_buffer(&fname_crypto_str);
-			return res;
-		}
-		tmp_str.name = fname_crypto_str.name;
-		tmp_str.len = fname_crypto_str.len;
-	}
 
 	nlen = EXT4_DIR_REC_LEN(de->name_len);
 	rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
@@ -1908,11 +1857,8 @@ int ext4_insert_dentry(struct inode *dir,
 	de->file_type = EXT4_FT_UNKNOWN;
 	de->inode = cpu_to_le32(inode->i_ino);
 	ext4_set_de_type(inode->i_sb, de, inode->i_mode);
-	de->name_len = tmp_str.len;
-
-	memcpy(de->name, tmp_str.name, tmp_str.len);
-	ext4_put_fname_crypto_ctx(&ctx);
-	ext4_fname_crypto_free_buffer(&fname_crypto_str);
+	de->name_len = fname_len(fname);
+	memcpy(de->name, fname_name(fname), fname_len(fname));
 	return 0;
 }
 
@@ -1924,13 +1870,11 @@ int ext4_insert_dentry(struct inode *dir,
  * space.  It will return -ENOSPC if no space is available, and -EIO
  * and -EEXIST if directory entry already exists.
  */
-static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+			     struct inode *dir,
 			     struct inode *inode, struct ext4_dir_entry_2 *de,
 			     struct buffer_head *bh)
 {
-	struct inode	*dir = d_inode(dentry->d_parent);
-	const char	*name = dentry->d_name.name;
-	int		namelen = dentry->d_name.len;
 	unsigned int	blocksize = dir->i_sb->s_blocksize;
 	int		csum_size = 0;
 	int		err;
@@ -1939,9 +1883,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
 	if (!de) {
-		err = ext4_find_dest_de(dir, inode,
-					bh, bh->b_data, blocksize - csum_size,
-					name, namelen, &de);
+		err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+					blocksize - csum_size, fname, &de);
 		if (err)
 			return err;
 	}
@@ -1954,8 +1897,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
 	/* By now the buffer is marked for journaling. Due to crypto operations,
 	 * the following function call may fail */
-	err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name,
-				 name, namelen);
+	err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
 	if (err < 0)
 		return err;
 
@@ -1985,17 +1927,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
  * This converts a one block unindexed directory to a 3 block indexed
  * directory, and adds the dentry to the indexed directory.
  */
-static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+			    struct dentry *dentry,
 			    struct inode *inode, struct buffer_head *bh)
 {
 	struct inode	*dir = d_inode(dentry->d_parent);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	struct ext4_fname_crypto_ctx *ctx = NULL;
-	int res;
-#else
-	const char	*name = dentry->d_name.name;
-	int		namelen = dentry->d_name.len;
-#endif
 	struct buffer_head *bh2;
 	struct dx_root	*root;
 	struct dx_frame	frames[2], *frame;
@@ -2006,17 +1942,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	unsigned	len;
 	int		retval;
 	unsigned	blocksize;
-	struct dx_hash_info hinfo;
 	ext4_lblk_t  block;
 	struct fake_dirent *fde;
 	int csum_size = 0;
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-#endif
-
 	if (ext4_has_metadata_csum(inode->i_sb))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
@@ -2078,22 +2007,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
 
 	/* Initialize as for dx_probe */
-	hinfo.hash_version = root->info.hash_version;
-	if (hinfo.hash_version <= DX_HASH_TEA)
-		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
-	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo);
-	if (res < 0) {
-		ext4_put_fname_crypto_ctx(&ctx);
-		ext4_mark_inode_dirty(handle, dir);
-		brelse(bh);
-		return res;
-	}
-	ext4_put_fname_crypto_ctx(&ctx);
-#else
-	ext4fs_dirhash(name, namelen, &hinfo);
-#endif
+	fname->hinfo.hash_version = root->info.hash_version;
+	if (fname->hinfo.hash_version <= DX_HASH_TEA)
+		fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+	fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+	ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo);
+
 	memset(frames, 0, sizeof(frames));
 	frame = frames;
 	frame->entries = entries;
@@ -2108,14 +2027,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	if (retval)
 		goto out_frames;	
 
-	de = do_split(handle,dir, &bh, frame, &hinfo);
+	de = do_split(handle,dir, &bh, frame, &fname->hinfo);
 	if (IS_ERR(de)) {
 		retval = PTR_ERR(de);
 		goto out_frames;
 	}
 	dx_release(frames);
 
-	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
 	brelse(bh);
 	return retval;
 out_frames:
@@ -2147,6 +2066,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	struct ext4_dir_entry_2 *de;
 	struct ext4_dir_entry_tail *t;
 	struct super_block *sb;
+	struct ext4_filename fname;
 	int	retval;
 	int	dx_fallback=0;
 	unsigned blocksize;
@@ -2161,10 +2081,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	if (!dentry->d_name.len)
 		return -EINVAL;
 
+	retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
+	if (retval)
+		return retval;
+
 	if (ext4_has_inline_data(dir)) {
-		retval = ext4_try_add_inline_entry(handle, dentry, inode);
+		retval = ext4_try_add_inline_entry(handle, &fname,
+						   dentry, inode);
 		if (retval < 0)
-			return retval;
+			goto out;
 		if (retval == 1) {
 			retval = 0;
 			goto out;
@@ -2172,7 +2097,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 
 	if (is_dx(dir)) {
-		retval = ext4_dx_add_entry(handle, dentry, inode);
+		retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			goto out;
 		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
@@ -2182,24 +2107,31 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	blocks = dir->i_size >> sb->s_blocksize_bits;
 	for (block = 0; block < blocks; block++) {
 		bh = ext4_read_dirblock(dir, block, DIRENT);
-		if (IS_ERR(bh))
-			return PTR_ERR(bh);
-
-		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+		if (IS_ERR(bh)) {
+			retval = PTR_ERR(bh);
+			bh = NULL;
+			goto out;
+		}
+		retval = add_dirent_to_buf(handle, &fname, dir, inode,
+					   NULL, bh);
 		if (retval != -ENOSPC)
 			goto out;
 
 		if (blocks == 1 && !dx_fallback &&
 		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
-			retval = make_indexed_dir(handle, dentry, inode, bh);
+			retval = make_indexed_dir(handle, &fname, dentry,
+						  inode, bh);
 			bh = NULL; /* make_indexed_dir releases bh */
 			goto out;
 		}
 		brelse(bh);
 	}
 	bh = ext4_append(handle, dir, &block);
-	if (IS_ERR(bh))
-		return PTR_ERR(bh);
+	if (IS_ERR(bh)) {
+		retval = PTR_ERR(bh);
+		bh = NULL;
+		goto out;
+	}
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
@@ -2209,8 +2141,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		initialize_dirent_tail(t, blocksize);
 	}
 
-	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
 out:
+	ext4_fname_free_filename(&fname);
 	brelse(bh);
 	if (retval == 0)
 		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -2220,19 +2153,18 @@ out:
 /*
  * Returns 0 for success, or a negative error value
  */
-static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
-			     struct inode *inode)
+static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+			     struct dentry *dentry, struct inode *inode)
 {
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
-	struct dx_hash_info hinfo;
 	struct buffer_head *bh;
 	struct inode *dir = d_inode(dentry->d_parent);
 	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
+	frame = dx_probe(fname, dir, NULL, frames);
 	if (IS_ERR(frame))
 		return PTR_ERR(frame);
 	entries = frame->entries;
@@ -2249,7 +2181,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	if (err)
 		goto journal_error;
 
-	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+	err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
 	if (err != -ENOSPC)
 		goto cleanup;
 
@@ -2267,7 +2199,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 
 		if (levels && (dx_get_count(frames->entries) ==
 			       dx_get_limit(frames->entries))) {
-			ext4_warning(sb, "Directory index full!");
+			ext4_warning_inode(dir, "Directory index full!");
 			err = -ENOSPC;
 			goto cleanup;
 		}
@@ -2345,12 +2277,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		}
 	}
-	de = do_split(handle, dir, &bh, frame, &hinfo);
+	de = do_split(handle, dir, &bh, frame, &fname->hinfo);
 	if (IS_ERR(de)) {
 		err = PTR_ERR(de);
 		goto cleanup;
 	}
-	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
 	goto cleanup;
 
 journal_error:
@@ -2517,20 +2449,7 @@ retry:
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
-		err = 0;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-		if (!err && (ext4_encrypted_inode(dir) ||
-			     DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) {
-			err = ext4_inherit_context(dir, inode);
-			if (err) {
-				clear_nlink(inode);
-				unlock_new_inode(inode);
-				iput(inode);
-			}
-		}
-#endif
-		if (!err)
-			err = ext4_add_nondir(handle, dentry, inode);
+		err = ext4_add_nondir(handle, dentry, inode);
 		if (!err && IS_DIRSYNC(dir))
 			ext4_handle_sync(handle);
 	}
@@ -2711,14 +2630,6 @@ retry:
 	err = ext4_init_new_dir(handle, dir, inode);
 	if (err)
 		goto out_clear_inode;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	if (ext4_encrypted_inode(dir) ||
-	    DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) {
-		err = ext4_inherit_context(dir, inode);
-		if (err)
-			goto out_clear_inode;
-	}
-#endif
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (!err)
 		err = ext4_add_entry(handle, dentry, inode);
@@ -2779,12 +2690,9 @@ int ext4_empty_dir(struct inode *inode)
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de1 = ext4_next_entry(de, sb->s_blocksize);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
-			!le32_to_cpu(de1->inode) ||
-			strcmp(".", de->name) ||
-			strcmp("..", de1->name)) {
-		ext4_warning(inode->i_sb,
-			     "bad directory (dir #%lu) - no `.' or `..'",
-			     inode->i_ino);
+			le32_to_cpu(de1->inode) == 0 ||
+			strcmp(".", de->name) || strcmp("..", de1->name)) {
+		ext4_warning_inode(inode, "directory missing '.' and/or '..'");
 		brelse(bh);
 		return 1;
 	}
@@ -3037,8 +2945,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning(inode->i_sb,
-			     "empty directory has too many links (%d)",
+		ext4_warning_inode(inode,
+			     "empty directory '%.*s' has too many links (%u)",
+			     dentry->d_name.len, dentry->d_name.name,
 			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
@@ -3098,10 +3007,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	if (!inode->i_nlink) {
-		ext4_warning(inode->i_sb,
-			     "Deleting nonexistent file (%lu), %d",
-			     inode->i_ino, inode->i_nlink);
+	if (inode->i_nlink == 0) {
+		ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+				   dentry->d_name.len, dentry->d_name.name);
 		set_nlink(inode, 1);
 	}
 	retval = ext4_delete_entry(handle, dir, de, bh);
@@ -3140,10 +3048,23 @@ static int ext4_symlink(struct inode *dir,
 
 	encryption_required = (ext4_encrypted_inode(dir) ||
 			       DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
-	if (encryption_required)
-		disk_link.len = encrypted_symlink_data_len(len) + 1;
-	if (disk_link.len > dir->i_sb->s_blocksize)
-		return -ENAMETOOLONG;
+	if (encryption_required) {
+		err = ext4_get_encryption_info(dir);
+		if (err)
+			return err;
+		if (ext4_encryption_info(dir) == NULL)
+			return -EPERM;
+		disk_link.len = (ext4_fname_encrypted_size(dir, len) +
+				 sizeof(struct ext4_encrypted_symlink_data));
+		sd = kzalloc(disk_link.len, GFP_KERNEL);
+		if (!sd)
+			return -ENOMEM;
+	}
+
+	if (disk_link.len > dir->i_sb->s_blocksize) {
+		err = -ENAMETOOLONG;
+		goto err_free_sd;
+	}
 
 	dquot_initialize(dir);
 
@@ -3174,42 +3095,29 @@ static int ext4_symlink(struct inode *dir,
 	if (IS_ERR(inode)) {
 		if (handle)
 			ext4_journal_stop(handle);
-		return PTR_ERR(inode);
+		err = PTR_ERR(inode);
+		goto err_free_sd;
 	}
 
 	if (encryption_required) {
-		struct ext4_fname_crypto_ctx *ctx = NULL;
 		struct qstr istr;
 		struct ext4_str ostr;
 
-		sd = kzalloc(disk_link.len, GFP_NOFS);
-		if (!sd) {
-			err = -ENOMEM;
-			goto err_drop_inode;
-		}
-		err = ext4_inherit_context(dir, inode);
-		if (err)
-			goto err_drop_inode;
-		ctx = ext4_get_fname_crypto_ctx(inode,
-						inode->i_sb->s_blocksize);
-		if (IS_ERR_OR_NULL(ctx)) {
-			/* We just set the policy, so ctx should not be NULL */
-			err = (ctx == NULL) ? -EIO : PTR_ERR(ctx);
-			goto err_drop_inode;
-		}
 		istr.name = (const unsigned char *) symname;
 		istr.len = len;
 		ostr.name = sd->encrypted_path;
-		err = ext4_fname_usr_to_disk(ctx, &istr, &ostr);
-		ext4_put_fname_crypto_ctx(&ctx);
+		ostr.len = disk_link.len;
+		err = ext4_fname_usr_to_disk(inode, &istr, &ostr);
 		if (err < 0)
 			goto err_drop_inode;
 		sd->len = cpu_to_le16(ostr.len);
 		disk_link.name = (char *) sd;
+		inode->i_op = &ext4_encrypted_symlink_inode_operations;
 	}
 
 	if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
-		inode->i_op = &ext4_symlink_inode_operations;
+		if (!encryption_required)
+			inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
 		 * We cannot call page_symlink() with transaction started
@@ -3249,9 +3157,10 @@ static int ext4_symlink(struct inode *dir,
 	} else {
 		/* clear the extent format for fast symlink */
 		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
-		inode->i_op = encryption_required ?
-			&ext4_symlink_inode_operations :
-			&ext4_fast_symlink_inode_operations;
+		if (!encryption_required) {
+			inode->i_op = &ext4_fast_symlink_inode_operations;
+			inode->i_link = (char *)&EXT4_I(inode)->i_data;
+		}
 		memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
 		       disk_link.len);
 		inode->i_size = disk_link.len - 1;
@@ -3268,10 +3177,11 @@ static int ext4_symlink(struct inode *dir,
 err_drop_inode:
 	if (handle)
 		ext4_journal_stop(handle);
-	kfree(sd);
 	clear_nlink(inode);
 	unlock_new_inode(inode);
 	iput(inode);
+err_free_sd:
+	kfree(sd);
 	return err;
 }
 
@@ -3487,9 +3397,9 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
 	}
 
 	if (retval) {
-		ext4_warning(ent->dir->i_sb,
-				"Deleting old file (%lu), %d, error=%d",
-				ent->dir->i_ino, ent->dir->i_nlink, retval);
+		ext4_warning_inode(ent->dir,
+				   "Deleting old file: nlink %d, error=%d",
+				   ent->dir->i_nlink, retval);
 	}
 }
 
@@ -3759,6 +3669,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	u8 new_file_type;
 	int retval;
 
+	if ((ext4_encrypted_inode(old_dir) ||
+	     ext4_encrypted_inode(new_dir)) &&
+	    (old_dir != new_dir) &&
+	    (!ext4_is_child_context_consistent_with_parent(new_dir,
+							   old.inode) ||
+	     !ext4_is_child_context_consistent_with_parent(old_dir,
+							   new.inode)))
+		return -EPERM;
+
 	dquot_initialize(old.dir);
 	dquot_initialize(new.dir);
 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5765f88b3..5602450f0 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -84,7 +84,7 @@ static void ext4_finish_bio(struct bio *bio)
 			/* The bounce data pages are unmapped. */
 			data_page = page;
 			ctx = (struct ext4_crypto_ctx *)page_private(data_page);
-			page = ctx->control_page;
+			page = ctx->w.control_page;
 		}
 #endif
 
@@ -359,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io)
 	if (bio) {
 		bio_get(io->io_bio);
 		submit_bio(io->io_op, io->io_bio);
-		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
 		bio_put(io->io_bio);
 	}
 	io->io_bio = NULL;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 171b9ac4b..ec3ef93a5 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -54,8 +54,8 @@ static void completion_pages(struct work_struct *work)
 {
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	struct ext4_crypto_ctx *ctx =
-		container_of(work, struct ext4_crypto_ctx, work);
-	struct bio	*bio	= ctx->bio;
+		container_of(work, struct ext4_crypto_ctx, r.work);
+	struct bio	*bio	= ctx->r.bio;
 	struct bio_vec	*bv;
 	int		i;
 
@@ -109,9 +109,9 @@ static void mpage_end_io(struct bio *bio, int err)
 		if (err) {
 			ext4_release_crypto_ctx(ctx);
 		} else {
-			INIT_WORK(&ctx->work, completion_pages);
-			ctx->bio = bio;
-			queue_work(ext4_read_workqueue, &ctx->work);
+			INIT_WORK(&ctx->r.work, completion_pages);
+			ctx->r.bio = bio;
+			queue_work(ext4_read_workqueue, &ctx->r.work);
 			return;
 		}
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ca12affdb..58987b5c5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
@@ -451,7 +452,7 @@ void __ext4_error_file(struct file *file, const char *function,
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	if (ext4_error_ratelimit(inode->i_sb)) {
-		path = d_path(&(file->f_path), pathname, sizeof(pathname));
+		path = file_path(file, pathname, sizeof(pathname));
 		if (IS_ERR(path))
 			path = "(unknown)";
 		va_start(args, fmt);
@@ -591,14 +592,17 @@ void __ext4_msg(struct super_block *sb,
 	va_end(args);
 }
 
+#define ext4_warning_ratelimit(sb)					\
+		___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),	\
+			     "EXT4-fs warning")
+
 void __ext4_warning(struct super_block *sb, const char *function,
 		    unsigned int line, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
 
-	if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
-			  "EXT4-fs warning"))
+	if (!ext4_warning_ratelimit(sb))
 		return;
 
 	va_start(args, fmt);
@@ -609,6 +613,24 @@ void __ext4_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
+void __ext4_warning_inode(const struct inode *inode, const char *function,
+			  unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (!ext4_warning_ratelimit(inode->i_sb))
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
+	       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
+	       function, line, inode->i_ino, current->comm, &vaf);
+	va_end(args);
+}
+
 void __ext4_grp_locked_error(const char *function, unsigned int line,
 			     struct super_block *sb, ext4_group_t grp,
 			     unsigned long ino, ext4_fsblk_t block,
@@ -880,9 +902,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	atomic_set(&ei->i_unwritten, 0);
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-	ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
+	ei->i_crypt_info = NULL;
 #endif
-
 	return &ei->vfs_inode;
 }
 
@@ -959,6 +980,10 @@ void ext4_clear_inode(struct inode *inode)
 		jbd2_free_inode(EXT4_I(inode)->jinode);
 		EXT4_I(inode)->jinode = NULL;
 	}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+	if (EXT4_I(inode)->i_crypt_info)
+		ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
+#endif
 }
 
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -3421,7 +3446,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
-	char *cp;
 	const char *descr;
 	int ret = -ENOMEM;
 	int blocksize, clustersize;
@@ -3450,15 +3474,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (sb->s_bdev->bd_part)
 		sbi->s_sectors_written_start =
 			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	/* Modes of operations for file and directory encryption. */
-	sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
-	sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
-#endif
 
 	/* Cleanup superblock name */
-	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
-		*cp = '!';
+	strreplace(sb->s_id, '/', '!');
 
 	/* -EINVAL is default */
 	ret = -EINVAL;
@@ -4068,7 +4086,15 @@ no_journal:
 		}
 	}
 
-	if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
+	if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
+	     EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+	    (blocksize != PAGE_CACHE_SIZE)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Unsupported blocksize for fs encryption");
+		goto failed_mount_wq;
+	}
+
+	if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
 	    !(sb->s_flags & MS_RDONLY) &&
 	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
@@ -5414,6 +5440,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	struct inode *inode = sb_dqopt(sb)->files[type];
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err, offset = off & (sb->s_blocksize - 1);
+	int retries = 0;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -5434,7 +5461,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 		return -EIO;
 	}
 
-	bh = ext4_bread(handle, inode, blk, 1);
+	do {
+		bh = ext4_bread(handle, inode, blk,
+				EXT4_GET_BLOCKS_CREATE |
+				EXT4_GET_BLOCKS_METADATA_NOFAIL);
+	} while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
+		 ext4_should_retry_alloc(inode->i_sb, &retries));
 	if (IS_ERR(bh))
 		return PTR_ERR(bh);
 	if (!bh)
@@ -5651,6 +5683,7 @@ out7:
 
 static void __exit ext4_exit_fs(void)
 {
+	ext4_exit_crypto();
 	ext4_destroy_lazyinit_thread();
 	unregister_as_ext2();
 	unregister_as_ext3();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 187b78920..c677f2c10 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,34 +23,28 @@
 #include "xattr.h"
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct page *cpage = NULL;
 	char *caddr, *paddr = NULL;
 	struct ext4_str cstr, pstr;
 	struct inode *inode = d_inode(dentry);
-	struct ext4_fname_crypto_ctx *ctx = NULL;
 	struct ext4_encrypted_symlink_data *sd;
 	loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
 	int res;
 	u32 plen, max_size = inode->i_sb->s_blocksize;
 
-	if (!ext4_encrypted_inode(inode))
-		return page_follow_link_light(dentry, nd);
-
-	ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize);
-	if (IS_ERR(ctx))
-		return ctx;
+	res = ext4_get_encryption_info(inode);
+	if (res)
+		return ERR_PTR(res);
 
 	if (ext4_inode_is_fast_symlink(inode)) {
 		caddr = (char *) EXT4_I(inode)->i_data;
 		max_size = sizeof(EXT4_I(inode)->i_data);
 	} else {
 		cpage = read_mapping_page(inode->i_mapping, 0, NULL);
-		if (IS_ERR(cpage)) {
-			ext4_put_fname_crypto_ctx(&ctx);
-			return cpage;
-		}
+		if (IS_ERR(cpage))
+			return ERR_CAST(cpage);
 		caddr = kmap(cpage);
 		caddr[size] = 0;
 	}
@@ -74,21 +68,19 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		goto errout;
 	}
 	pstr.name = paddr;
-	res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr);
+	pstr.len = plen;
+	res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
 	if (res < 0)
 		goto errout;
 	/* Null-terminate the name */
 	if (res <= plen)
 		paddr[res] = '\0';
-	nd_set_link(nd, paddr);
-	ext4_put_fname_crypto_ctx(&ctx);
 	if (cpage) {
 		kunmap(cpage);
 		page_cache_release(cpage);
 	}
-	return NULL;
+	return *cookie = paddr;
 errout:
-	ext4_put_fname_crypto_ctx(&ctx);
 	if (cpage) {
 		kunmap(cpage);
 		page_cache_release(cpage);
@@ -97,36 +89,22 @@ errout:
 	return ERR_PTR(res);
 }
 
-static void ext4_put_link(struct dentry *dentry, struct nameidata *nd,
-			  void *cookie)
-{
-	struct page *page = cookie;
-
-	if (!page) {
-		kfree(nd_get_link(nd));
-	} else {
-		kunmap(page);
-		page_cache_release(page);
-	}
-}
+const struct inode_operations ext4_encrypted_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link    = ext4_encrypted_follow_link,
+	.put_link       = kfree_put_link,
+	.setattr	= ext4_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+};
 #endif
 
-static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct ext4_inode_info *ei = EXT4_I(d_inode(dentry));
-	nd_set_link(nd, (char *) ei->i_data);
-	return NULL;
-}
-
 const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	.follow_link    = ext4_follow_link,
-	.put_link       = ext4_put_link,
-#else
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
-#endif
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -136,7 +114,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link    = ext4_follow_fast_link,
+	.follow_link    = simple_follow_link,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 05f0f663f..c62976200 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -72,6 +72,25 @@ config F2FS_CHECK_FS
 
 	  If you want to improve the performance, say N.
 
+config F2FS_FS_ENCRYPTION
+	bool "F2FS Encryption"
+	depends on F2FS_FS
+	depends on F2FS_FS_XATTR
+	select CRYPTO_AES
+	select CRYPTO_CBC
+	select CRYPTO_ECB
+	select CRYPTO_XTS
+	select CRYPTO_CTS
+	select CRYPTO_CTR
+	select CRYPTO_SHA256
+	select KEYS
+	select ENCRYPTED_KEYS
+	help
+	  Enable encryption of f2fs files and directories.  This
+	  feature is similar to ecryptfs, but it is more memory
+	  efficient since it avoids caching the encrypted and
+	  decrypted pages in the page cache.
+
 config F2FS_IO_TRACE
 	bool "F2FS IO tracer"
 	depends on F2FS_FS
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index d92397731..396be1a39 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -6,3 +6,5 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
 f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
+f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
+		crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 4320ffab3..c8f25f724 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -334,51 +334,45 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
 		struct page *dpage)
 {
 	struct posix_acl *p;
+	struct posix_acl *clone;
 	int ret;
 
+	*acl = NULL;
+	*default_acl = NULL;
+
 	if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
-		goto no_acl;
+		return 0;
 
 	p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage);
-	if (IS_ERR(p)) {
-		if (p == ERR_PTR(-EOPNOTSUPP))
-			goto apply_umask;
-		return PTR_ERR(p);
+	if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+		*mode &= ~current_umask();
+		return 0;
 	}
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 
-	if (!p)
-		goto apply_umask;
-
-	*acl = f2fs_acl_clone(p, GFP_NOFS);
-	if (!*acl)
+	clone = f2fs_acl_clone(p, GFP_NOFS);
+	if (!clone)
 		goto no_mem;
 
-	ret = f2fs_acl_create_masq(*acl, mode);
+	ret = f2fs_acl_create_masq(clone, mode);
 	if (ret < 0)
 		goto no_mem_clone;
 
-	if (ret == 0) {
-		posix_acl_release(*acl);
-		*acl = NULL;
-	}
+	if (ret == 0)
+		posix_acl_release(clone);
+	else
+		*acl = clone;
 
-	if (!S_ISDIR(*mode)) {
+	if (!S_ISDIR(*mode))
 		posix_acl_release(p);
-		*default_acl = NULL;
-	} else {
+	else
 		*default_acl = p;
-	}
-	return 0;
 
-apply_umask:
-	*mode &= ~current_umask();
-no_acl:
-	*default_acl = NULL;
-	*acl = NULL;
 	return 0;
 
 no_mem_clone:
-	posix_acl_release(*acl);
+	posix_acl_release(clone);
 no_mem:
 	posix_acl_release(p);
 	return -ENOMEM;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index a5e17a2a0..b70bbe1a6 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -52,9 +52,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page;
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = META,
 		.rw = READ_SYNC | REQ_META | REQ_PRIO,
 		.blk_addr = index,
+		.encrypted_page = NULL,
 	};
 repeat:
 	page = grab_cache_page(mapping, index);
@@ -65,7 +67,9 @@ repeat:
 	if (PageUptodate(page))
 		goto out;
 
-	if (f2fs_submit_page_bio(sbi, page, &fio))
+	fio.page = page;
+
+	if (f2fs_submit_page_bio(&fio))
 		goto repeat;
 
 	lock_page(page);
@@ -77,8 +81,7 @@ out:
 	return page;
 }
 
-static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi,
-						block_t blkaddr, int type)
+bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
 {
 	switch (type) {
 	case META_NAT:
@@ -118,8 +121,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 	struct page *page;
 	block_t blkno = start;
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = META,
-		.rw = READ_SYNC | REQ_META | REQ_PRIO
+		.rw = READ_SYNC | REQ_META | REQ_PRIO,
+		.encrypted_page = NULL,
 	};
 
 	for (; nrpages-- > 0; blkno++) {
@@ -161,7 +166,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 			continue;
 		}
 
-		f2fs_submit_page_mbio(sbi, page, &fio);
+		fio.page = page;
+		f2fs_submit_page_mbio(&fio);
 		f2fs_put_page(page, 0);
 	}
 out:
@@ -510,7 +516,12 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 		grab_meta_page(sbi, start_blk + index);
 
 	index = 1;
-	spin_lock(&im->ino_lock);
+
+	/*
+	 * we don't need to do spin_lock(&im->ino_lock) here, since all the
+	 * orphan inode operations are covered under f2fs_lock_op().
+	 * And, spin_lock should be avoided due to page operations below.
+	 */
 	head = &im->ino_list;
 
 	/* loop for each orphan inode entry and write them in Jornal block */
@@ -550,8 +561,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 		set_page_dirty(page);
 		f2fs_put_page(page, 1);
 	}
-
-	spin_unlock(&im->ino_lock);
 }
 
 static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -879,10 +888,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
 	nid_t last_nid = nm_i->next_scan_nid;
 	block_t start_blk;
-	struct page *cp_page;
 	unsigned int data_sum_blocks, orphan_blocks;
 	__u32 crc32 = 0;
-	void *kaddr;
 	int i;
 	int cp_payload_blks = __cp_payload(sbi);
 
@@ -979,19 +986,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	start_blk = __start_cp_addr(sbi);
 
 	/* write out checkpoint buffer at block 0 */
-	cp_page = grab_meta_page(sbi, start_blk++);
-	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
-	set_page_dirty(cp_page);
-	f2fs_put_page(cp_page, 1);
-
-	for (i = 1; i < 1 + cp_payload_blks; i++) {
-		cp_page = grab_meta_page(sbi, start_blk++);
-		kaddr = page_address(cp_page);
-		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
-		set_page_dirty(cp_page);
-		f2fs_put_page(cp_page, 1);
-	}
+	update_meta_page(sbi, ckpt, start_blk++);
+
+	for (i = 1; i < 1 + cp_payload_blks; i++)
+		update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
+							start_blk++);
 
 	if (orphan_num) {
 		write_orphan_inodes(sbi, start_blk);
@@ -1006,11 +1005,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	}
 
 	/* writeout checkpoint block */
-	cp_page = grab_meta_page(sbi, start_blk);
-	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
-	set_page_dirty(cp_page);
-	f2fs_put_page(cp_page, 1);
+	update_meta_page(sbi, ckpt, start_blk);
 
 	/* wait for previous submitted node/meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
@@ -1036,7 +1031,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	if (unlikely(f2fs_cp_error(sbi)))
 		return;
 
-	clear_prefree_segments(sbi);
+	clear_prefree_segments(sbi, cpc);
 	clear_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
@@ -1051,7 +1046,8 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	mutex_lock(&sbi->cp_mutex);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
-		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC))
+		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
+		(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
new file mode 100644
index 000000000..4a62ef14e
--- /dev/null
+++ b/fs/f2fs/crypto.c
@@ -0,0 +1,491 @@
+/*
+ * linux/fs/f2fs/crypto.c
+ *
+ * Copied from linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * This contains encryption functions for f2fs
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ *	Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ *	Ildar Muslukhov, 2014
+ * Remove ext4_encrypted_zeroout(),
+ *   add f2fs_restore_and_release_control_page()
+ *	Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
+#include <linux/crypto.h>
+#include <linux/ecryptfs.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+#include <linux/f2fs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/bio.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+/* Encryption added and removed here! (L: */
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+		"Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+		"Number of crypto contexts to preallocate");
+
+static mempool_t *f2fs_bounce_page_pool;
+
+static LIST_HEAD(f2fs_free_crypto_ctxs);
+static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
+
+static struct workqueue_struct *f2fs_read_workqueue;
+static DEFINE_MUTEX(crypto_init);
+
+static struct kmem_cache *f2fs_crypto_ctx_cachep;
+struct kmem_cache *f2fs_crypt_info_cachep;
+
+/**
+ * f2fs_release_crypto_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
+{
+	unsigned long flags;
+
+	if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+		mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
+		ctx->w.bounce_page = NULL;
+	}
+	ctx->w.control_page = NULL;
+	if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+		kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
+	} else {
+		spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
+		list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
+		spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
+	}
+}
+
+/**
+ * f2fs_get_crypto_ctx() - Gets an encryption context
+ * @inode:       The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
+{
+	struct f2fs_crypto_ctx *ctx = NULL;
+	unsigned long flags;
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+	if (ci == NULL)
+		return ERR_PTR(-ENOKEY);
+
+	/*
+	 * We first try getting the ctx from a free list because in
+	 * the common case the ctx will have an allocated and
+	 * initialized crypto tfm, so it's probably a worthwhile
+	 * optimization. For the bounce page, we first try getting it
+	 * from the kernel allocator because that's just about as fast
+	 * as getting it from a list and because a cache of free pages
+	 * should generally be a "last resort" option for a filesystem
+	 * to be able to do its job.
+	 */
+	spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
+	ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
+					struct f2fs_crypto_ctx, free_list);
+	if (ctx)
+		list_del(&ctx->free_list);
+	spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
+	if (!ctx) {
+		ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
+		if (!ctx)
+			return ERR_PTR(-ENOMEM);
+		ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+	} else {
+		ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+	}
+	ctx->flags &= ~F2FS_WRITE_PATH_FL;
+	return ctx;
+}
+
+/*
+ * Call f2fs_decrypt on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+	struct f2fs_crypto_ctx *ctx =
+		container_of(work, struct f2fs_crypto_ctx, r.work);
+	struct bio *bio = ctx->r.bio;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+		int ret = f2fs_decrypt(ctx, page);
+
+		if (ret) {
+			WARN_ON_ONCE(1);
+			SetPageError(page);
+		} else
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	f2fs_release_crypto_ctx(ctx);
+	bio_put(bio);
+}
+
+void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
+{
+	INIT_WORK(&ctx->r.work, completion_pages);
+	ctx->r.bio = bio;
+	queue_work(f2fs_read_workqueue, &ctx->r.work);
+}
+
+static void f2fs_crypto_destroy(void)
+{
+	struct f2fs_crypto_ctx *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
+		kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
+	INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
+	if (f2fs_bounce_page_pool)
+		mempool_destroy(f2fs_bounce_page_pool);
+	f2fs_bounce_page_pool = NULL;
+}
+
+/**
+ * f2fs_crypto_initialize() - Set up for f2fs encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int f2fs_crypto_initialize(void)
+{
+	int i, res = -ENOMEM;
+
+	if (f2fs_bounce_page_pool)
+		return 0;
+
+	mutex_lock(&crypto_init);
+	if (f2fs_bounce_page_pool)
+		goto already_initialized;
+
+	for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+		struct f2fs_crypto_ctx *ctx;
+
+		ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
+		if (!ctx)
+			goto fail;
+		list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
+	}
+
+	/* must be allocated at the last step to avoid race condition above */
+	f2fs_bounce_page_pool =
+		mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+	if (!f2fs_bounce_page_pool)
+		goto fail;
+
+already_initialized:
+	mutex_unlock(&crypto_init);
+	return 0;
+fail:
+	f2fs_crypto_destroy();
+	mutex_unlock(&crypto_init);
+	return res;
+}
+
+/**
+ * f2fs_exit_crypto() - Shutdown the f2fs encryption system
+ */
+void f2fs_exit_crypto(void)
+{
+	f2fs_crypto_destroy();
+
+	if (f2fs_read_workqueue)
+		destroy_workqueue(f2fs_read_workqueue);
+	if (f2fs_crypto_ctx_cachep)
+		kmem_cache_destroy(f2fs_crypto_ctx_cachep);
+	if (f2fs_crypt_info_cachep)
+		kmem_cache_destroy(f2fs_crypt_info_cachep);
+}
+
+int __init f2fs_init_crypto(void)
+{
+	int res = -ENOMEM;
+
+	f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
+	if (!f2fs_read_workqueue)
+		goto fail;
+
+	f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
+						SLAB_RECLAIM_ACCOUNT);
+	if (!f2fs_crypto_ctx_cachep)
+		goto fail;
+
+	f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
+						SLAB_RECLAIM_ACCOUNT);
+	if (!f2fs_crypt_info_cachep)
+		goto fail;
+
+	return 0;
+fail:
+	f2fs_exit_crypto();
+	return res;
+}
+
+void f2fs_restore_and_release_control_page(struct page **page)
+{
+	struct f2fs_crypto_ctx *ctx;
+	struct page *bounce_page;
+
+	/* The bounce data pages are unmapped. */
+	if ((*page)->mapping)
+		return;
+
+	/* The bounce data page is unmapped. */
+	bounce_page = *page;
+	ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
+
+	/* restore control page */
+	*page = ctx->w.control_page;
+
+	f2fs_restore_control_page(bounce_page);
+}
+
+void f2fs_restore_control_page(struct page *data_page)
+{
+	struct f2fs_crypto_ctx *ctx =
+		(struct f2fs_crypto_ctx *)page_private(data_page);
+
+	set_page_private(data_page, (unsigned long)NULL);
+	ClearPagePrivate(data_page);
+	unlock_page(data_page);
+	f2fs_release_crypto_ctx(ctx);
+}
+
+/**
+ * f2fs_crypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
+{
+	struct f2fs_completion_result *ecr = req->data;
+
+	if (res == -EINPROGRESS)
+		return;
+	ecr->res = res;
+	complete(&ecr->completion);
+}
+
+typedef enum {
+	F2FS_DECRYPT = 0,
+	F2FS_ENCRYPT,
+} f2fs_direction_t;
+
+static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx,
+				struct inode *inode,
+				f2fs_direction_t rw,
+				pgoff_t index,
+				struct page *src_page,
+				struct page *dest_page)
+{
+	u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
+	struct ablkcipher_request *req = NULL;
+	DECLARE_F2FS_COMPLETION_RESULT(ecr);
+	struct scatterlist dst, src;
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+	int res = 0;
+
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+				"%s: crypto_request_alloc() failed\n",
+				__func__);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(
+		req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		f2fs_crypt_complete, &ecr);
+
+	BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
+	memcpy(xts_tweak, &index, sizeof(index));
+	memset(&xts_tweak[sizeof(index)], 0,
+			F2FS_XTS_TWEAK_SIZE - sizeof(index));
+
+	sg_init_table(&dst, 1);
+	sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+	sg_init_table(&src, 1);
+	sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+	ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+					xts_tweak);
+	if (rw == F2FS_DECRYPT)
+		res = crypto_ablkcipher_decrypt(req);
+	else
+		res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	ablkcipher_request_free(req);
+	if (res) {
+		printk_ratelimited(KERN_ERR
+			"%s: crypto_ablkcipher_encrypt() returned %d\n",
+			__func__, res);
+		return res;
+	}
+	return 0;
+}
+
+static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
+{
+	ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
+	if (ctx->w.bounce_page == NULL)
+		return ERR_PTR(-ENOMEM);
+	ctx->flags |= F2FS_WRITE_PATH_FL;
+	return ctx->w.bounce_page;
+}
+
+/**
+ * f2fs_encrypt() - Encrypts a page
+ * @inode:          The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path.  The caller must call
+ * f2fs_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *f2fs_encrypt(struct inode *inode,
+			  struct page *plaintext_page)
+{
+	struct f2fs_crypto_ctx *ctx;
+	struct page *ciphertext_page = NULL;
+	int err;
+
+	BUG_ON(!PageLocked(plaintext_page));
+
+	ctx = f2fs_get_crypto_ctx(inode);
+	if (IS_ERR(ctx))
+		return (struct page *)ctx;
+
+	/* The encryption operation will require a bounce page. */
+	ciphertext_page = alloc_bounce_page(ctx);
+	if (IS_ERR(ciphertext_page))
+		goto err_out;
+
+	ctx->w.control_page = plaintext_page;
+	err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
+					plaintext_page, ciphertext_page);
+	if (err) {
+		ciphertext_page = ERR_PTR(err);
+		goto err_out;
+	}
+
+	SetPagePrivate(ciphertext_page);
+	set_page_private(ciphertext_page, (unsigned long)ctx);
+	lock_page(ciphertext_page);
+	return ciphertext_page;
+
+err_out:
+	f2fs_release_crypto_ctx(ctx);
+	return ciphertext_page;
+}
+
+/**
+ * f2fs_decrypt() - Decrypts a page in-place
+ * @ctx:  The encryption context.
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	return f2fs_page_crypto(ctx, page->mapping->host,
+				F2FS_DECRYPT, page->index, page, page);
+}
+
+/*
+ * Convenience function which takes care of allocating and
+ * deallocating the encryption context
+ */
+int f2fs_decrypt_one(struct inode *inode, struct page *page)
+{
+	struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);
+	int ret;
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	ret = f2fs_decrypt(ctx, page);
+	f2fs_release_crypto_ctx(ctx);
+	return ret;
+}
+
+bool f2fs_valid_contents_enc_mode(uint32_t mode)
+{
+	return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+/**
+ * f2fs_validate_encryption_key_size() - Validate the encryption key size
+ * @mode: The key mode.
+ * @size: The key size to validate.
+ *
+ * Return: The validated key size for @mode. Zero if invalid.
+ */
+uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
+{
+	if (size == f2fs_encryption_key_size(mode))
+		return size;
+	return 0;
+}
diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c
new file mode 100644
index 000000000..ab377d496
--- /dev/null
+++ b/fs/f2fs/crypto_fname.c
@@ -0,0 +1,440 @@
+/*
+ * linux/fs/f2fs/crypto_fname.c
+ *
+ * Copied from linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * This contains functions for filename crypto management in f2fs
+ *
+ * Written by Uday Savagaonkar, 2014.
+ *
+ * Adjust f2fs dentry structure
+ *	Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ */
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/crypto.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+#include <linux/f2fs_fs.h>
+#include <linux/ratelimit.h>
+
+#include "f2fs.h"
+#include "f2fs_crypto.h"
+#include "xattr.h"
+
+/**
+ * f2fs_dir_crypt_complete() -
+ */
+static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
+{
+	struct f2fs_completion_result *ecr = req->data;
+
+	if (res == -EINPROGRESS)
+		return;
+	ecr->res = res;
+	complete(&ecr->completion);
+}
+
+bool f2fs_valid_filenames_enc_mode(uint32_t mode)
+{
+	return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static unsigned max_name_len(struct inode *inode)
+{
+	return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
+					F2FS_NAME_LEN;
+}
+
+/**
+ * f2fs_fname_encrypt() -
+ *
+ * This function encrypts the input filename, and returns the length of the
+ * ciphertext. Errors are returned as negative numbers.  We trust the caller to
+ * allocate sufficient memory to oname string.
+ */
+static int f2fs_fname_encrypt(struct inode *inode,
+			const struct qstr *iname, struct f2fs_str *oname)
+{
+	u32 ciphertext_len;
+	struct ablkcipher_request *req = NULL;
+	DECLARE_F2FS_COMPLETION_RESULT(ecr);
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+	int res = 0;
+	char iv[F2FS_CRYPTO_BLOCK_SIZE];
+	struct scatterlist src_sg, dst_sg;
+	int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
+	char *workbuf, buf[32], *alloc_buf = NULL;
+	unsigned lim = max_name_len(inode);
+
+	if (iname->len <= 0 || iname->len > lim)
+		return -EIO;
+
+	ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ?
+		F2FS_CRYPTO_BLOCK_SIZE : iname->len;
+	ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding);
+	ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
+
+	if (ciphertext_len <= sizeof(buf)) {
+		workbuf = buf;
+	} else {
+		alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
+		if (!alloc_buf)
+			return -ENOMEM;
+		workbuf = alloc_buf;
+	}
+
+	/* Allocate request */
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+			"%s: crypto_request_alloc() failed\n", __func__);
+		kfree(alloc_buf);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			f2fs_dir_crypt_complete, &ecr);
+
+	/* Copy the input */
+	memcpy(workbuf, iname->name, iname->len);
+	if (iname->len < ciphertext_len)
+		memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
+
+	/* Initialize IV */
+	memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+
+	/* Create encryption request */
+	sg_init_one(&src_sg, workbuf, ciphertext_len);
+	sg_init_one(&dst_sg, oname->name, ciphertext_len);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+	res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	kfree(alloc_buf);
+	ablkcipher_request_free(req);
+	if (res < 0) {
+		printk_ratelimited(KERN_ERR
+				"%s: Error (error code %d)\n", __func__, res);
+	}
+	oname->len = ciphertext_len;
+	return res;
+}
+
+/*
+ * f2fs_fname_decrypt()
+ *	This function decrypts the input filename, and returns
+ *	the length of the plaintext.
+ *	Errors are returned as negative numbers.
+ *	We trust the caller to allocate sufficient memory to oname string.
+ */
+static int f2fs_fname_decrypt(struct inode *inode,
+			const struct f2fs_str *iname, struct f2fs_str *oname)
+{
+	struct ablkcipher_request *req = NULL;
+	DECLARE_F2FS_COMPLETION_RESULT(ecr);
+	struct scatterlist src_sg, dst_sg;
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+	int res = 0;
+	char iv[F2FS_CRYPTO_BLOCK_SIZE];
+	unsigned lim = max_name_len(inode);
+
+	if (iname->len <= 0 || iname->len > lim)
+		return -EIO;
+
+	/* Allocate request */
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+			"%s: crypto_request_alloc() failed\n",  __func__);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(req,
+		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		f2fs_dir_crypt_complete, &ecr);
+
+	/* Initialize IV */
+	memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+
+	/* Create decryption request */
+	sg_init_one(&src_sg, iname->name, iname->len);
+	sg_init_one(&dst_sg, oname->name, oname->len);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+	res = crypto_ablkcipher_decrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	ablkcipher_request_free(req);
+	if (res < 0) {
+		printk_ratelimited(KERN_ERR
+			"%s: Error in f2fs_fname_decrypt (error code %d)\n",
+			__func__, res);
+		return res;
+	}
+
+	oname->len = strnlen(oname->name, iname->len);
+	return oname->len;
+}
+
+static const char *lookup_table =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+/**
+ * f2fs_fname_encode_digest() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+	int i = 0, bits = 0, ac = 0;
+	char *cp = dst;
+
+	while (i < len) {
+		ac += (((unsigned char) src[i]) << bits);
+		bits += 8;
+		do {
+			*cp++ = lookup_table[ac & 0x3f];
+			ac >>= 6;
+			bits -= 6;
+		} while (bits >= 6);
+		i++;
+	}
+	if (bits)
+		*cp++ = lookup_table[ac & 0x3f];
+	return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+	int i = 0, bits = 0, ac = 0;
+	const char *p;
+	char *cp = dst;
+
+	while (i < len) {
+		p = strchr(lookup_table, src[i]);
+		if (p == NULL || src[i] == 0)
+			return -2;
+		ac += (p - lookup_table) << bits;
+		bits += 6;
+		if (bits >= 8) {
+			*cp++ = ac & 0xff;
+			ac >>= 8;
+			bits -= 8;
+		}
+		i++;
+	}
+	if (ac)
+		return -1;
+	return cp - dst;
+}
+
+/**
+ * f2fs_fname_crypto_round_up() -
+ *
+ * Return: The next multiple of block size
+ */
+u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
+{
+	return ((size + blksize - 1) / blksize) * blksize;
+}
+
+/**
+ * f2fs_fname_crypto_alloc_obuff() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int f2fs_fname_crypto_alloc_buffer(struct inode *inode,
+				   u32 ilen, struct f2fs_str *crypto_str)
+{
+	unsigned int olen;
+	int padding = 16;
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+	if (ci)
+		padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
+	if (padding < F2FS_CRYPTO_BLOCK_SIZE)
+		padding = F2FS_CRYPTO_BLOCK_SIZE;
+	olen = f2fs_fname_crypto_round_up(ilen, padding);
+	crypto_str->len = olen;
+	if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
+		olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+	/* Allocated buffer can hold one more character to null-terminate the
+	 * string */
+	crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
+	if (!(crypto_str->name))
+		return -ENOMEM;
+	return 0;
+}
+
+/**
+ * f2fs_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str)
+{
+	if (!crypto_str)
+		return;
+	kfree(crypto_str->name);
+	crypto_str->name = NULL;
+}
+
+/**
+ * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space
+ */
+int f2fs_fname_disk_to_usr(struct inode *inode,
+			f2fs_hash_t *hash,
+			const struct f2fs_str *iname,
+			struct f2fs_str *oname)
+{
+	const struct qstr qname = FSTR_TO_QSTR(iname);
+	char buf[24];
+	int ret;
+
+	if (is_dot_dotdot(&qname)) {
+		oname->name[0] = '.';
+		oname->name[iname->len - 1] = '.';
+		oname->len = iname->len;
+		return oname->len;
+	}
+
+	if (F2FS_I(inode)->i_crypt_info)
+		return f2fs_fname_decrypt(inode, iname, oname);
+
+	if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) {
+		ret = digest_encode(iname->name, iname->len, oname->name);
+		oname->len = ret;
+		return ret;
+	}
+	if (hash) {
+		memcpy(buf, hash, 4);
+		memset(buf + 4, 0, 4);
+	} else
+		memset(buf, 0, 8);
+	memcpy(buf + 8, iname->name + iname->len - 16, 16);
+	oname->name[0] = '_';
+	ret = digest_encode(buf, 24, oname->name + 1);
+	oname->len = ret + 1;
+	return ret + 1;
+}
+
+/**
+ * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space
+ */
+int f2fs_fname_usr_to_disk(struct inode *inode,
+			const struct qstr *iname,
+			struct f2fs_str *oname)
+{
+	int res;
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+	if (is_dot_dotdot(iname)) {
+		oname->name[0] = '.';
+		oname->name[iname->len - 1] = '.';
+		oname->len = iname->len;
+		return oname->len;
+	}
+
+	if (ci) {
+		res = f2fs_fname_encrypt(inode, iname, oname);
+		return res;
+	}
+	/* Without a proper key, a user is not allowed to modify the filenames
+	 * in a directory. Consequently, a user space name cannot be mapped to
+	 * a disk-space name */
+	return -EACCES;
+}
+
+int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+			      int lookup, struct f2fs_filename *fname)
+{
+	struct f2fs_crypt_info *ci;
+	int ret = 0, bigname = 0;
+
+	memset(fname, 0, sizeof(struct f2fs_filename));
+	fname->usr_fname = iname;
+
+	if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) {
+		fname->disk_name.name = (unsigned char *)iname->name;
+		fname->disk_name.len = iname->len;
+		return 0;
+	}
+	ret = f2fs_get_encryption_info(dir);
+	if (ret)
+		return ret;
+	ci = F2FS_I(dir)->i_crypt_info;
+	if (ci) {
+		ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len,
+						     &fname->crypto_buf);
+		if (ret < 0)
+			return ret;
+		ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf);
+		if (ret < 0)
+			goto errout;
+		fname->disk_name.name = fname->crypto_buf.name;
+		fname->disk_name.len = fname->crypto_buf.len;
+		return 0;
+	}
+	if (!lookup)
+		return -EACCES;
+
+	/* We don't have the key and we are doing a lookup; decode the
+	 * user-supplied name
+	 */
+	if (iname->name[0] == '_')
+		bigname = 1;
+	if ((bigname && (iname->len != 33)) ||
+	    (!bigname && (iname->len > 43)))
+		return -ENOENT;
+
+	fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+	if (fname->crypto_buf.name == NULL)
+		return -ENOMEM;
+	ret = digest_decode(iname->name + bigname, iname->len - bigname,
+				fname->crypto_buf.name);
+	if (ret < 0) {
+		ret = -ENOENT;
+		goto errout;
+	}
+	fname->crypto_buf.len = ret;
+	if (bigname) {
+		memcpy(&fname->hash, fname->crypto_buf.name, 4);
+	} else {
+		fname->disk_name.name = fname->crypto_buf.name;
+		fname->disk_name.len = fname->crypto_buf.len;
+	}
+	return 0;
+errout:
+	f2fs_fname_crypto_free_buffer(&fname->crypto_buf);
+	return ret;
+}
+
+void f2fs_fname_free_filename(struct f2fs_filename *fname)
+{
+	kfree(fname->crypto_buf.name);
+	fname->crypto_buf.name = NULL;
+	fname->usr_fname = NULL;
+	fname->disk_name.name = NULL;
+}
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
new file mode 100644
index 000000000..95b8f936f
--- /dev/null
+++ b/fs/f2fs/crypto_key.c
@@ -0,0 +1,255 @@
+/*
+ * linux/fs/f2fs/crypto_key.c
+ *
+ * Copied from linux/fs/f2fs/crypto_key.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions for f2fs
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+#include <crypto/hash.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+	struct f2fs_completion_result *ecr = req->data;
+
+	if (rc == -EINPROGRESS)
+		return;
+
+	ecr->res = rc;
+	complete(&ecr->completion);
+}
+
+/**
+ * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivatio.
+ * @source_key:   Source key to which to apply derivation.
+ * @derived_key:  Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
+				char source_key[F2FS_AES_256_XTS_KEY_SIZE],
+				char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
+{
+	int res = 0;
+	struct ablkcipher_request *req = NULL;
+	DECLARE_F2FS_COMPLETION_RESULT(ecr);
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
+								0);
+
+	if (IS_ERR(tfm)) {
+		res = PTR_ERR(tfm);
+		tfm = NULL;
+		goto out;
+	}
+	crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		res = -ENOMEM;
+		goto out;
+	}
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			derive_crypt_complete, &ecr);
+	res = crypto_ablkcipher_setkey(tfm, deriving_key,
+				F2FS_AES_128_ECB_KEY_SIZE);
+	if (res < 0)
+		goto out;
+
+	sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
+	sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+					F2FS_AES_256_XTS_KEY_SIZE, NULL);
+	res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+out:
+	if (req)
+		ablkcipher_request_free(req);
+	if (tfm)
+		crypto_free_ablkcipher(tfm);
+	return res;
+}
+
+static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
+{
+	if (!ci)
+		return;
+
+	if (ci->ci_keyring_key)
+		key_put(ci->ci_keyring_key);
+	crypto_free_ablkcipher(ci->ci_ctfm);
+	kmem_cache_free(f2fs_crypt_info_cachep, ci);
+}
+
+void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_crypt_info *prev;
+
+	if (ci == NULL)
+		ci = ACCESS_ONCE(fi->i_crypt_info);
+	if (ci == NULL)
+		return;
+	prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
+	if (prev != ci)
+		return;
+
+	f2fs_free_crypt_info(ci);
+}
+
+int _f2fs_get_encryption_info(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_crypt_info *crypt_info;
+	char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
+				(F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
+	struct key *keyring_key = NULL;
+	struct f2fs_encryption_key *master_key;
+	struct f2fs_encryption_context ctx;
+	struct user_key_payload *ukp;
+	struct crypto_ablkcipher *ctfm;
+	const char *cipher_str;
+	char raw_key[F2FS_MAX_KEY_SIZE];
+	char mode;
+	int res;
+
+	res = f2fs_crypto_initialize();
+	if (res)
+		return res;
+retry:
+	crypt_info = ACCESS_ONCE(fi->i_crypt_info);
+	if (crypt_info) {
+		if (!crypt_info->ci_keyring_key ||
+				key_validate(crypt_info->ci_keyring_key) == 0)
+			return 0;
+		f2fs_free_encryption_info(inode, crypt_info);
+		goto retry;
+	}
+
+	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+				&ctx, sizeof(ctx), NULL);
+	if (res < 0)
+		return res;
+	else if (res != sizeof(ctx))
+		return -EINVAL;
+	res = 0;
+
+	crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
+	if (!crypt_info)
+		return -ENOMEM;
+
+	crypt_info->ci_flags = ctx.flags;
+	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+	crypt_info->ci_ctfm = NULL;
+	crypt_info->ci_keyring_key = NULL;
+	memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+				sizeof(crypt_info->ci_master_key));
+	if (S_ISREG(inode->i_mode))
+		mode = crypt_info->ci_data_mode;
+	else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		mode = crypt_info->ci_filename_mode;
+	else
+		BUG();
+
+	switch (mode) {
+	case F2FS_ENCRYPTION_MODE_AES_256_XTS:
+		cipher_str = "xts(aes)";
+		break;
+	case F2FS_ENCRYPTION_MODE_AES_256_CTS:
+		cipher_str = "cts(cbc(aes))";
+		break;
+	default:
+		printk_once(KERN_WARNING
+			    "f2fs: unsupported key mode %d (ino %u)\n",
+			    mode, (unsigned) inode->i_ino);
+		res = -ENOKEY;
+		goto out;
+	}
+
+	memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
+					F2FS_KEY_DESC_PREFIX_SIZE);
+	sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
+					"%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
+					ctx.master_key_descriptor);
+	full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
+					(2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
+	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+	if (IS_ERR(keyring_key)) {
+		res = PTR_ERR(keyring_key);
+		keyring_key = NULL;
+		goto out;
+	}
+	crypt_info->ci_keyring_key = keyring_key;
+	BUG_ON(keyring_key->type != &key_type_logon);
+	ukp = ((struct user_key_payload *)keyring_key->payload.data);
+	if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
+		res = -EINVAL;
+		goto out;
+	}
+	master_key = (struct f2fs_encryption_key *)ukp->data;
+	BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
+				F2FS_KEY_DERIVATION_NONCE_SIZE);
+	BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE);
+	res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
+				  raw_key);
+	if (res)
+		goto out;
+
+	ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+	if (!ctfm || IS_ERR(ctfm)) {
+		res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+		printk(KERN_DEBUG
+		       "%s: error %d (inode %u) allocating crypto tfm\n",
+		       __func__, res, (unsigned) inode->i_ino);
+		goto out;
+	}
+	crypt_info->ci_ctfm = ctfm;
+	crypto_ablkcipher_clear_flags(ctfm, ~0);
+	crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+			     CRYPTO_TFM_REQ_WEAK_KEY);
+	res = crypto_ablkcipher_setkey(ctfm, raw_key,
+					f2fs_encryption_key_size(mode));
+	if (res)
+		goto out;
+
+	memzero_explicit(raw_key, sizeof(raw_key));
+	if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
+		f2fs_free_crypt_info(crypt_info);
+		goto retry;
+	}
+	return 0;
+
+out:
+	if (res == -ENOKEY && !S_ISREG(inode->i_mode))
+		res = 0;
+
+	f2fs_free_crypt_info(crypt_info);
+	memzero_explicit(raw_key, sizeof(raw_key));
+	return res;
+}
+
+int f2fs_has_encryption_key(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	return (fi->i_crypt_info != NULL);
+}
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
new file mode 100644
index 000000000..d4a96af51
--- /dev/null
+++ b/fs/f2fs/crypto_policy.c
@@ -0,0 +1,209 @@
+/*
+ * copied from linux/fs/ext4/crypto_policy.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility.
+ *
+ * This contains encryption policy functions for f2fs with some modifications
+ * to support f2fs-specific xattr APIs.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+static int f2fs_inode_has_encryption_context(struct inode *inode)
+{
+	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
+	return (res > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int f2fs_is_encryption_context_consistent_with_policy(
+	struct inode *inode, const struct f2fs_encryption_policy *policy)
+{
+	struct f2fs_encryption_context ctx;
+	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+				sizeof(ctx), NULL);
+
+	if (res != sizeof(ctx))
+		return 0;
+
+	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+				F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+			(ctx.flags == policy->flags) &&
+			(ctx.contents_encryption_mode ==
+			 policy->contents_encryption_mode) &&
+			(ctx.filenames_encryption_mode ==
+			 policy->filenames_encryption_mode));
+}
+
+static int f2fs_create_encryption_context_from_policy(
+	struct inode *inode, const struct f2fs_encryption_policy *policy)
+{
+	struct f2fs_encryption_context ctx;
+
+	ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+			F2FS_KEY_DESCRIPTOR_SIZE);
+
+	if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
+		printk(KERN_WARNING
+		       "%s: Invalid contents encryption mode %d\n", __func__,
+			policy->contents_encryption_mode);
+		return -EINVAL;
+	}
+
+	if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
+		printk(KERN_WARNING
+		       "%s: Invalid filenames encryption mode %d\n", __func__,
+			policy->filenames_encryption_mode);
+		return -EINVAL;
+	}
+
+	if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
+		return -EINVAL;
+
+	ctx.contents_encryption_mode = policy->contents_encryption_mode;
+	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+	ctx.flags = policy->flags;
+	BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
+	get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
+
+	return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+			sizeof(ctx), NULL, XATTR_CREATE);
+}
+
+int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
+			struct inode *inode)
+{
+	if (policy->version != 0)
+		return -EINVAL;
+
+	if (!S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	if (!f2fs_inode_has_encryption_context(inode)) {
+		if (!f2fs_empty_dir(inode))
+			return -ENOTEMPTY;
+		return f2fs_create_encryption_context_from_policy(inode,
+								  policy);
+	}
+
+	if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
+		return 0;
+
+	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+	       __func__);
+	return -EINVAL;
+}
+
+int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
+{
+	struct f2fs_encryption_context ctx;
+	int res;
+
+	if (!f2fs_encrypted_inode(inode))
+		return -ENODATA;
+
+	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+				&ctx, sizeof(ctx), NULL);
+	if (res != sizeof(ctx))
+		return -ENODATA;
+	if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+		return -EINVAL;
+
+	policy->version = 0;
+	policy->contents_encryption_mode = ctx.contents_encryption_mode;
+	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+	policy->flags = ctx.flags;
+	memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+			F2FS_KEY_DESCRIPTOR_SIZE);
+	return 0;
+}
+
+int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
+						struct inode *child)
+{
+	struct f2fs_crypt_info *parent_ci, *child_ci;
+	int res;
+
+	if ((parent == NULL) || (child == NULL)) {
+		pr_err("parent %p child %p\n", parent, child);
+		BUG_ON(1);
+	}
+
+	/* no restrictions if the parent directory is not encrypted */
+	if (!f2fs_encrypted_inode(parent))
+		return 1;
+	/* if the child directory is not encrypted, this is always a problem */
+	if (!f2fs_encrypted_inode(child))
+		return 0;
+	res = f2fs_get_encryption_info(parent);
+	if (res)
+		return 0;
+	res = f2fs_get_encryption_info(child);
+	if (res)
+		return 0;
+	parent_ci = F2FS_I(parent)->i_crypt_info;
+	child_ci = F2FS_I(child)->i_crypt_info;
+	if (!parent_ci && !child_ci)
+		return 1;
+	if (!parent_ci || !child_ci)
+		return 0;
+
+	return (memcmp(parent_ci->ci_master_key,
+			child_ci->ci_master_key,
+			F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+		(parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+		(parent_ci->ci_flags == child_ci->ci_flags));
+}
+
+/**
+ * f2fs_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child:  Child inode that inherits the context from @parent.
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int f2fs_inherit_context(struct inode *parent, struct inode *child,
+						struct page *ipage)
+{
+	struct f2fs_encryption_context ctx;
+	struct f2fs_crypt_info *ci;
+	int res;
+
+	res = f2fs_get_encryption_info(parent);
+	if (res < 0)
+		return res;
+
+	ci = F2FS_I(parent)->i_crypt_info;
+	BUG_ON(ci == NULL);
+
+	ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+
+	ctx.contents_encryption_mode = ci->ci_data_mode;
+	ctx.filenames_encryption_mode = ci->ci_filename_mode;
+	ctx.flags = ci->ci_flags;
+	memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+			F2FS_KEY_DESCRIPTOR_SIZE);
+
+	get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
+	return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
+				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+				sizeof(ctx), ipage, XATTR_CREATE);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1e1aae669..f71e19a9d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/prefetch.h>
 #include <linux/uio.h>
+#include <linux/cleancache.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -33,6 +34,15 @@ static void f2fs_read_end_io(struct bio *bio, int err)
 	struct bio_vec *bvec;
 	int i;
 
+	if (f2fs_bio_encrypted(bio)) {
+		if (err) {
+			f2fs_release_crypto_ctx(bio->bi_private);
+		} else {
+			f2fs_end_io_crypto_work(bio->bi_private, bio);
+			return;
+		}
+	}
+
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 
@@ -56,6 +66,8 @@ static void f2fs_write_end_io(struct bio *bio, int err)
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 
+		f2fs_restore_and_release_control_page(&page);
+
 		if (unlikely(err)) {
 			set_page_dirty(page);
 			set_bit(AS_EIO, &page->mapping->flags);
@@ -86,7 +98,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 	bio->bi_bdev = sbi->sb->s_bdev;
 	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
-	bio->bi_private = sbi;
+	bio->bi_private = is_read ? NULL : sbi;
 
 	return bio;
 }
@@ -133,16 +145,16 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
  * Fill the locked page with data located in the block address.
  * Return unlocked page.
  */
-int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
-					struct f2fs_io_info *fio)
+int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 {
 	struct bio *bio;
+	struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 
 	trace_f2fs_submit_page_bio(page, fio);
-	f2fs_trace_ios(page, fio, 0);
+	f2fs_trace_ios(fio, 0);
 
 	/* Allocate a new bio */
-	bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
+	bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw));
 
 	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 		bio_put(bio);
@@ -154,12 +166,13 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
 	return 0;
 }
 
-void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
-					struct f2fs_io_info *fio)
+void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
 {
+	struct f2fs_sb_info *sbi = fio->sbi;
 	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 	struct f2fs_bio_info *io;
 	bool is_read = is_read_io(fio->rw);
+	struct page *bio_page;
 
 	io = is_read ? &sbi->read_io : &sbi->write_io[btype];
 
@@ -181,17 +194,19 @@ alloc_new:
 		io->fio = *fio;
 	}
 
-	if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
+	bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+
+	if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) <
 							PAGE_CACHE_SIZE) {
 		__submit_merged_bio(io);
 		goto alloc_new;
 	}
 
 	io->last_block_in_bio = fio->blk_addr;
-	f2fs_trace_ios(page, fio, 0);
+	f2fs_trace_ios(fio, 0);
 
 	up_write(&io->io_rwsem);
-	trace_f2fs_submit_page_mbio(page, fio);
+	trace_f2fs_submit_page_mbio(fio->page, fio);
 }
 
 /*
@@ -251,19 +266,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
-static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs,
-			struct extent_info *ei, struct buffer_head *bh_result)
-{
-	unsigned int blkbits = sb->s_blocksize_bits;
-	size_t max_size = bh_result->b_size;
-	size_t mapped_size;
-
-	clear_buffer_new(bh_result);
-	map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs);
-	mapped_size = (ei->fofs + ei->len - pgofs) << blkbits;
-	bh_result->b_size = min(max_size, mapped_size);
-}
-
 static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
 							struct extent_info *ei)
 {
@@ -905,7 +907,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
 		sync_inode_page(dn);
 }
 
-struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
+struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
@@ -913,83 +915,15 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 	struct extent_info ei;
 	int err;
 	struct f2fs_io_info fio = {
+		.sbi = F2FS_I_SB(inode),
 		.type = DATA,
-		.rw = sync ? READ_SYNC : READA,
+		.rw = rw,
+		.encrypted_page = NULL,
 	};
 
-	/*
-	 * If sync is false, it needs to check its block allocation.
-	 * This is need and triggered by two flows:
-	 *   gc and truncate_partial_data_page.
-	 */
-	if (!sync)
-		goto search;
-
-	page = find_get_page(mapping, index);
-	if (page && PageUptodate(page))
-		return page;
-	f2fs_put_page(page, 0);
-search:
-	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
-		dn.data_blkaddr = ei.blk + index - ei.fofs;
-		goto got_it;
-	}
-
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
-	if (err)
-		return ERR_PTR(err);
-	f2fs_put_dnode(&dn);
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		return read_mapping_page(mapping, index, NULL);
 
-	if (dn.data_blkaddr == NULL_ADDR)
-		return ERR_PTR(-ENOENT);
-
-	/* By fallocate(), there is no cached page, but with NEW_ADDR */
-	if (unlikely(dn.data_blkaddr == NEW_ADDR))
-		return ERR_PTR(-EINVAL);
-
-got_it:
-	page = grab_cache_page(mapping, index);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
-
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		return page;
-	}
-
-	fio.blk_addr = dn.data_blkaddr;
-	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
-	if (err)
-		return ERR_PTR(err);
-
-	if (sync) {
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			f2fs_put_page(page, 0);
-			return ERR_PTR(-EIO);
-		}
-	}
-	return page;
-}
-
-/*
- * If it tries to access a hole, return an error.
- * Because, the callers, functions in dir.c and GC, should be able to know
- * whether this page exists or not.
- */
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
-{
-	struct address_space *mapping = inode->i_mapping;
-	struct dnode_of_data dn;
-	struct page *page;
-	struct extent_info ei;
-	int err;
-	struct f2fs_io_info fio = {
-		.type = DATA,
-		.rw = READ_SYNC,
-	};
-repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
@@ -1011,10 +945,11 @@ repeat:
 		f2fs_put_page(page, 1);
 		return ERR_PTR(-ENOENT);
 	}
-
 got_it:
-	if (PageUptodate(page))
+	if (PageUptodate(page)) {
+		unlock_page(page);
 		return page;
+	}
 
 	/*
 	 * A new dentry page is allocated but not able to be written, since its
@@ -1025,14 +960,58 @@ got_it:
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 		SetPageUptodate(page);
+		unlock_page(page);
 		return page;
 	}
 
 	fio.blk_addr = dn.data_blkaddr;
-	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
+	fio.page = page;
+	err = f2fs_submit_page_bio(&fio);
 	if (err)
 		return ERR_PTR(err);
+	return page;
+}
+
+struct page *find_data_page(struct inode *inode, pgoff_t index)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	page = find_get_page(mapping, index);
+	if (page && PageUptodate(page))
+		return page;
+	f2fs_put_page(page, 0);
 
+	page = get_read_data_page(inode, index, READ_SYNC);
+	if (IS_ERR(page))
+		return page;
+
+	if (PageUptodate(page))
+		return page;
+
+	wait_on_page_locked(page);
+	if (unlikely(!PageUptodate(page))) {
+		f2fs_put_page(page, 0);
+		return ERR_PTR(-EIO);
+	}
+	return page;
+}
+
+/*
+ * If it tries to access a hole, return an error.
+ * Because, the callers, functions in dir.c and GC, should be able to know
+ * whether this page exists or not.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+repeat:
+	page = get_read_data_page(inode, index, READ_SYNC);
+	if (IS_ERR(page))
+		return page;
+
+	/* wait for read completion */
 	lock_page(page);
 	if (unlikely(!PageUptodate(page))) {
 		f2fs_put_page(page, 1);
@@ -1060,46 +1039,37 @@ struct page *get_new_data_page(struct inode *inode,
 	struct page *page;
 	struct dnode_of_data dn;
 	int err;
+repeat:
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
 
 	set_new_dnode(&dn, inode, ipage, NULL, 0);
 	err = f2fs_reserve_block(&dn, index);
-	if (err)
+	if (err) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(err);
-repeat:
-	page = grab_cache_page(mapping, index);
-	if (!page) {
-		err = -ENOMEM;
-		goto put_err;
 	}
+	if (!ipage)
+		f2fs_put_dnode(&dn);
 
 	if (PageUptodate(page))
-		return page;
+		goto got_it;
 
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 		SetPageUptodate(page);
 	} else {
-		struct f2fs_io_info fio = {
-			.type = DATA,
-			.rw = READ_SYNC,
-			.blk_addr = dn.data_blkaddr,
-		};
-		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
-		if (err)
-			goto put_err;
+		f2fs_put_page(page, 1);
 
-		lock_page(page);
-		if (unlikely(!PageUptodate(page))) {
-			f2fs_put_page(page, 1);
-			err = -EIO;
-			goto put_err;
-		}
-		if (unlikely(page->mapping != mapping)) {
-			f2fs_put_page(page, 1);
+		page = get_read_data_page(inode, index, READ_SYNC);
+		if (IS_ERR(page))
 			goto repeat;
-		}
-	}
 
+		/* wait for read completion */
+		lock_page(page);
+	}
+got_it:
 	if (new_i_size &&
 		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
 		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
@@ -1107,10 +1077,6 @@ repeat:
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 	}
 	return page;
-
-put_err:
-	f2fs_put_dnode(&dn);
-	return ERR_PTR(err);
 }
 
 static int __allocate_data_block(struct dnode_of_data *dn)
@@ -1208,18 +1174,18 @@ out:
 }
 
 /*
- * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
+ * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
+ * f2fs_map_blocks structure.
  * If original data blocks are allocated, then give them to blockdev.
  * Otherwise,
  *     a. preallocate requested block addresses
  *     b. do not use extent cache for better performance
  *     c. give the block addresses to blockdev
  */
-static int __get_data_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create, bool fiemap)
+static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+			int create, bool fiemap)
 {
-	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
-	unsigned maxblocks = bh_result->b_size >> blkbits;
+	unsigned int maxblocks = map->m_len;
 	struct dnode_of_data dn;
 	int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
 	pgoff_t pgofs, end_offset;
@@ -1227,11 +1193,16 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	struct extent_info ei;
 	bool allocated = false;
 
-	/* Get the page offset from the block offset(iblock) */
-	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+	map->m_len = 0;
+	map->m_flags = 0;
+
+	/* it only supports block size == page size */
+	pgofs =	(pgoff_t)map->m_lblk;
 
 	if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
-		f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result);
+		map->m_pblk = ei.blk + pgofs - ei.fofs;
+		map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
+		map->m_flags = F2FS_MAP_MAPPED;
 		goto out;
 	}
 
@@ -1250,21 +1221,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
-		clear_buffer_new(bh_result);
-		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+		map->m_flags = F2FS_MAP_MAPPED;
+		map->m_pblk = dn.data_blkaddr;
+		if (dn.data_blkaddr == NEW_ADDR)
+			map->m_flags |= F2FS_MAP_UNWRITTEN;
 	} else if (create) {
 		err = __allocate_data_block(&dn);
 		if (err)
 			goto put_out;
 		allocated = true;
-		set_buffer_new(bh_result);
-		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+		map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
+		map->m_pblk = dn.data_blkaddr;
 	} else {
 		goto put_out;
 	}
 
 	end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
-	bh_result->b_size = (((size_t)1) << blkbits);
+	map->m_len = 1;
 	dn.ofs_in_node++;
 	pgofs++;
 
@@ -1288,22 +1261,25 @@ get_next:
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
 	}
 
-	if (maxblocks > (bh_result->b_size >> blkbits)) {
+	if (maxblocks > map->m_len) {
 		block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 		if (blkaddr == NULL_ADDR && create) {
 			err = __allocate_data_block(&dn);
 			if (err)
 				goto sync_out;
 			allocated = true;
-			set_buffer_new(bh_result);
+			map->m_flags |= F2FS_MAP_NEW;
 			blkaddr = dn.data_blkaddr;
 		}
 		/* Give more consecutive addresses for the readahead */
-		if (blkaddr == (bh_result->b_blocknr + ofs)) {
+		if ((map->m_pblk != NEW_ADDR &&
+				blkaddr == (map->m_pblk + ofs)) ||
+				(map->m_pblk == NEW_ADDR &&
+				blkaddr == NEW_ADDR)) {
 			ofs++;
 			dn.ofs_in_node++;
 			pgofs++;
-			bh_result->b_size += (((size_t)1) << blkbits);
+			map->m_len++;
 			goto get_next;
 		}
 	}
@@ -1316,10 +1292,28 @@ unlock_out:
 	if (create)
 		f2fs_unlock_op(F2FS_I_SB(inode));
 out:
-	trace_f2fs_get_data_block(inode, iblock, bh_result, err);
+	trace_f2fs_map_blocks(inode, map, err);
 	return err;
 }
 
+static int __get_data_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh, int create, bool fiemap)
+{
+	struct f2fs_map_blocks map;
+	int ret;
+
+	map.m_lblk = iblock;
+	map.m_len = bh->b_size >> inode->i_blkbits;
+
+	ret = f2fs_map_blocks(inode, &map, create, fiemap);
+	if (!ret) {
+		map_bh(bh, inode->i_sb, map.m_pblk);
+		bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
+		bh->b_size = map.m_len << inode->i_blkbits;
+	}
+	return ret;
+}
+
 static int get_data_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -1332,11 +1326,268 @@ static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
 	return __get_data_block(inode, iblock, bh_result, create, true);
 }
 
+static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
+{
+	return (offset >> inode->i_blkbits);
+}
+
+static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
+{
+	return (blk << inode->i_blkbits);
+}
+
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
-	return generic_block_fiemap(inode, fieinfo,
-				start, len, get_data_block_fiemap);
+	struct buffer_head map_bh;
+	sector_t start_blk, last_blk;
+	loff_t isize = i_size_read(inode);
+	u64 logical = 0, phys = 0, size = 0;
+	u32 flags = 0;
+	bool past_eof = false, whole_file = false;
+	int ret = 0;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (len >= isize) {
+		whole_file = true;
+		len = isize;
+	}
+
+	if (logical_to_blk(inode, len) == 0)
+		len = blk_to_logical(inode, 1);
+
+	start_blk = logical_to_blk(inode, start);
+	last_blk = logical_to_blk(inode, start + len - 1);
+next:
+	memset(&map_bh, 0, sizeof(struct buffer_head));
+	map_bh.b_size = len;
+
+	ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0);
+	if (ret)
+		goto out;
+
+	/* HOLE */
+	if (!buffer_mapped(&map_bh)) {
+		start_blk++;
+
+		if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
+			past_eof = 1;
+
+		if (past_eof && size) {
+			flags |= FIEMAP_EXTENT_LAST;
+			ret = fiemap_fill_next_extent(fieinfo, logical,
+					phys, size, flags);
+		} else if (size) {
+			ret = fiemap_fill_next_extent(fieinfo, logical,
+					phys, size, flags);
+			size = 0;
+		}
+
+		/* if we have holes up to/past EOF then we're done */
+		if (start_blk > last_blk || past_eof || ret)
+			goto out;
+	} else {
+		if (start_blk > last_blk && !whole_file) {
+			ret = fiemap_fill_next_extent(fieinfo, logical,
+					phys, size, flags);
+			goto out;
+		}
+
+		/*
+		 * if size != 0 then we know we already have an extent
+		 * to add, so add it.
+		 */
+		if (size) {
+			ret = fiemap_fill_next_extent(fieinfo, logical,
+					phys, size, flags);
+			if (ret)
+				goto out;
+		}
+
+		logical = blk_to_logical(inode, start_blk);
+		phys = blk_to_logical(inode, map_bh.b_blocknr);
+		size = map_bh.b_size;
+		flags = 0;
+		if (buffer_unwritten(&map_bh))
+			flags = FIEMAP_EXTENT_UNWRITTEN;
+
+		start_blk += logical_to_blk(inode, size);
+
+		/*
+		 * If we are past the EOF, then we need to make sure as
+		 * soon as we find a hole that the last extent we found
+		 * is marked with FIEMAP_EXTENT_LAST
+		 */
+		if (!past_eof && logical + size >= isize)
+			past_eof = true;
+	}
+	cond_resched();
+	if (fatal_signal_pending(current))
+		ret = -EINTR;
+	else
+		goto next;
+out:
+	if (ret == 1)
+		ret = 0;
+
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/*
+ * This function was originally taken from fs/mpage.c, and customized for f2fs.
+ * Major change was from block_size == page_size in f2fs by default.
+ */
+static int f2fs_mpage_readpages(struct address_space *mapping,
+			struct list_head *pages, struct page *page,
+			unsigned nr_pages)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	sector_t last_block_in_bio = 0;
+	struct inode *inode = mapping->host;
+	const unsigned blkbits = inode->i_blkbits;
+	const unsigned blocksize = 1 << blkbits;
+	sector_t block_in_file;
+	sector_t last_block;
+	sector_t last_block_in_file;
+	sector_t block_nr;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct f2fs_map_blocks map;
+
+	map.m_pblk = 0;
+	map.m_lblk = 0;
+	map.m_len = 0;
+	map.m_flags = 0;
+
+	for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+
+		prefetchw(&page->flags);
+		if (pages) {
+			page = list_entry(pages->prev, struct page, lru);
+			list_del(&page->lru);
+			if (add_to_page_cache_lru(page, mapping,
+						  page->index, GFP_KERNEL))
+				goto next_page;
+		}
+
+		block_in_file = (sector_t)page->index;
+		last_block = block_in_file + nr_pages;
+		last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
+								blkbits;
+		if (last_block > last_block_in_file)
+			last_block = last_block_in_file;
+
+		/*
+		 * Map blocks using the previous result first.
+		 */
+		if ((map.m_flags & F2FS_MAP_MAPPED) &&
+				block_in_file > map.m_lblk &&
+				block_in_file < (map.m_lblk + map.m_len))
+			goto got_it;
+
+		/*
+		 * Then do more f2fs_map_blocks() calls until we are
+		 * done with this page.
+		 */
+		map.m_flags = 0;
+
+		if (block_in_file < last_block) {
+			map.m_lblk = block_in_file;
+			map.m_len = last_block - block_in_file;
+
+			if (f2fs_map_blocks(inode, &map, 0, false))
+				goto set_error_page;
+		}
+got_it:
+		if ((map.m_flags & F2FS_MAP_MAPPED)) {
+			block_nr = map.m_pblk + block_in_file - map.m_lblk;
+			SetPageMappedToDisk(page);
+
+			if (!PageUptodate(page) && !cleancache_get_page(page)) {
+				SetPageUptodate(page);
+				goto confused;
+			}
+		} else {
+			zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+			SetPageUptodate(page);
+			unlock_page(page);
+			goto next_page;
+		}
+
+		/*
+		 * This page will go to BIO.  Do we need to send this
+		 * BIO off first?
+		 */
+		if (bio && (last_block_in_bio != block_nr - 1)) {
+submit_and_realloc:
+			submit_bio(READ, bio);
+			bio = NULL;
+		}
+		if (bio == NULL) {
+			struct f2fs_crypto_ctx *ctx = NULL;
+
+			if (f2fs_encrypted_inode(inode) &&
+					S_ISREG(inode->i_mode)) {
+				struct page *cpage;
+
+				ctx = f2fs_get_crypto_ctx(inode);
+				if (IS_ERR(ctx))
+					goto set_error_page;
+
+				/* wait the page to be moved by cleaning */
+				cpage = find_lock_page(
+						META_MAPPING(F2FS_I_SB(inode)),
+						block_nr);
+				if (cpage) {
+					f2fs_wait_on_page_writeback(cpage,
+									DATA);
+					f2fs_put_page(cpage, 1);
+				}
+			}
+
+			bio = bio_alloc(GFP_KERNEL,
+				min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+			if (!bio) {
+				if (ctx)
+					f2fs_release_crypto_ctx(ctx);
+				goto set_error_page;
+			}
+			bio->bi_bdev = bdev;
+			bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr);
+			bio->bi_end_io = f2fs_read_end_io;
+			bio->bi_private = ctx;
+		}
+
+		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+			goto submit_and_realloc;
+
+		last_block_in_bio = block_nr;
+		goto next_page;
+set_error_page:
+		SetPageError(page);
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		unlock_page(page);
+		goto next_page;
+confused:
+		if (bio) {
+			submit_bio(READ, bio);
+			bio = NULL;
+		}
+		unlock_page(page);
+next_page:
+		if (pages)
+			page_cache_release(page);
+	}
+	BUG_ON(pages && !list_empty(pages));
+	if (bio)
+		submit_bio(READ, bio);
+	return 0;
 }
 
 static int f2fs_read_data_page(struct file *file, struct page *page)
@@ -1350,8 +1601,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
 	if (f2fs_has_inline_data(inode))
 		ret = f2fs_read_inline_data(inode, page);
 	if (ret == -EAGAIN)
-		ret = mpage_readpage(page, get_data_block);
-
+		ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1);
 	return ret;
 }
 
@@ -1365,11 +1615,12 @@ static int f2fs_read_data_pages(struct file *file,
 	if (f2fs_has_inline_data(inode))
 		return 0;
 
-	return mpage_readpages(mapping, pages, nr_pages, get_data_block);
+	return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
 }
 
-int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
+int do_write_data_page(struct f2fs_io_info *fio)
 {
+	struct page *page = fio->page;
 	struct inode *inode = page->mapping->host;
 	struct dnode_of_data dn;
 	int err = 0;
@@ -1387,6 +1638,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 		goto out_writepage;
 	}
 
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+		fio->encrypted_page = f2fs_encrypt(inode, fio->page);
+		if (IS_ERR(fio->encrypted_page)) {
+			err = PTR_ERR(fio->encrypted_page);
+			goto out_writepage;
+		}
+	}
+
 	set_page_writeback(page);
 
 	/*
@@ -1396,11 +1655,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	if (unlikely(fio->blk_addr != NEW_ADDR &&
 			!is_cold_data(page) &&
 			need_inplace_update(inode))) {
-		rewrite_data_page(page, fio);
+		rewrite_data_page(fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
 		trace_f2fs_do_write_data_page(page, IPU);
 	} else {
-		write_data_page(page, &dn, fio);
+		write_data_page(&dn, fio);
 		set_data_blkaddr(&dn);
 		f2fs_update_extent_cache(&dn);
 		trace_f2fs_do_write_data_page(page, OPU);
@@ -1425,8 +1684,11 @@ static int f2fs_write_data_page(struct page *page,
 	bool need_balance_fs = false;
 	int err = 0;
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = DATA,
 		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+		.page = page,
+		.encrypted_page = NULL,
 	};
 
 	trace_f2fs_writepage(page, DATA);
@@ -1456,7 +1718,7 @@ write:
 	if (S_ISDIR(inode->i_mode)) {
 		if (unlikely(f2fs_cp_error(sbi)))
 			goto redirty_out;
-		err = do_write_data_page(page, &fio);
+		err = do_write_data_page(&fio);
 		goto done;
 	}
 
@@ -1476,7 +1738,7 @@ write:
 	if (f2fs_has_inline_data(inode))
 		err = f2fs_write_inline_data(inode, page);
 	if (err == -EAGAIN)
-		err = do_write_data_page(page, &fio);
+		err = do_write_data_page(&fio);
 	f2fs_unlock_op(sbi);
 done:
 	if (err && err != -ENOENT)
@@ -1645,11 +1907,14 @@ put_next:
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
 		struct f2fs_io_info fio = {
+			.sbi = sbi,
 			.type = DATA,
 			.rw = READ_SYNC,
 			.blk_addr = dn.data_blkaddr,
+			.page = page,
+			.encrypted_page = NULL,
 		};
-		err = f2fs_submit_page_bio(sbi, page, &fio);
+		err = f2fs_submit_page_bio(&fio);
 		if (err)
 			goto fail;
 
@@ -1663,6 +1928,15 @@ put_next:
 			f2fs_put_page(page, 1);
 			goto repeat;
 		}
+
+		/* avoid symlink page */
+		if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+			err = f2fs_decrypt_one(inode, page);
+			if (err) {
+				f2fs_put_page(page, 1);
+				goto fail;
+			}
+		}
 	}
 out:
 	SetPageUptodate(page);
@@ -1733,6 +2007,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			return err;
 	}
 
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		return 0;
+
 	if (check_direct_IO(inode, iter, offset))
 		return 0;
 
@@ -1795,8 +2072,6 @@ static int f2fs_set_data_page_dirty(struct page *page)
 		return 1;
 	}
 
-	mark_inode_dirty(inode);
-
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
 		update_dirty_page(inode, page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index f5388f372..75176e0dd 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -94,7 +94,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 static void update_sit_info(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
-	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+	unsigned long long blks_per_sec, hblks_per_sec, total_vblocks;
+	unsigned long long bimodal, dist;
 	unsigned int segno, vblocks;
 	int ndirty = 0;
 
@@ -112,10 +113,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 			ndirty++;
 		}
 	}
-	dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
-	si->bimodal = bimodal / dist;
+	dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
+	si->bimodal = div_u64(bimodal, dist);
 	if (si->dirty_count)
-		si->avg_vblocks = total_vblocks / ndirty;
+		si->avg_vblocks = div_u64(total_vblocks, ndirty);
 	else
 		si->avg_vblocks = 0;
 }
@@ -143,7 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += sizeof(struct sit_info);
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
-	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
 	si->base_mem += SIT_VBLOCK_MAP_SIZE;
 	if (sbi->segs_per_sec > 1)
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 3a3302ab7..a34ebd831 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -76,20 +76,10 @@ static unsigned long dir_block_index(unsigned int level,
 	return bidx;
 }
 
-static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
-				struct f2fs_dir_entry *de)
-{
-	if (le16_to_cpu(de->name_len) != namelen)
-		return false;
-
-	if (de->hash_code != namehash)
-		return false;
-
-	return true;
-}
-
 static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
-				struct qstr *name, int *max_slots,
+				struct f2fs_filename *fname,
+				f2fs_hash_t namehash,
+				int *max_slots,
 				struct page **res_page)
 {
 	struct f2fs_dentry_block *dentry_blk;
@@ -98,9 +88,8 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 
 	dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
 
-	make_dentry_ptr(&d, (void *)dentry_blk, 1);
-	de = find_target_dentry(name, max_slots, &d);
-
+	make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+	de = find_target_dentry(fname, namehash, max_slots, &d);
 	if (de)
 		*res_page = dentry_page;
 	else
@@ -114,13 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 	return de;
 }
 
-struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
-						struct f2fs_dentry_ptr *d)
+struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname,
+			f2fs_hash_t namehash, int *max_slots,
+			struct f2fs_dentry_ptr *d)
 {
 	struct f2fs_dir_entry *de;
 	unsigned long bit_pos = 0;
-	f2fs_hash_t namehash = f2fs_dentry_hash(name);
 	int max_len = 0;
+	struct f2fs_str de_name = FSTR_INIT(NULL, 0);
+	struct f2fs_str *name = &fname->disk_name;
 
 	if (max_slots)
 		*max_slots = 0;
@@ -132,8 +123,18 @@ struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
 		}
 
 		de = &d->dentry[bit_pos];
-		if (early_match_name(name->len, namehash, de) &&
-			!memcmp(d->filename[bit_pos], name->name, name->len))
+
+		/* encrypted case */
+		de_name.name = d->filename[bit_pos];
+		de_name.len = le16_to_cpu(de->name_len);
+
+		/* show encrypted name */
+		if (fname->hash) {
+			if (de->hash_code == fname->hash)
+				goto found;
+		} else if (de_name.len == name->len &&
+			de->hash_code == namehash &&
+			!memcmp(de_name.name, name->name, name->len))
 			goto found;
 
 		if (max_slots && max_len > *max_slots)
@@ -155,16 +156,21 @@ found:
 }
 
 static struct f2fs_dir_entry *find_in_level(struct inode *dir,
-			unsigned int level, struct qstr *name,
-			f2fs_hash_t namehash, struct page **res_page)
+					unsigned int level,
+					struct f2fs_filename *fname,
+					struct page **res_page)
 {
-	int s = GET_DENTRY_SLOTS(name->len);
+	struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
+	int s = GET_DENTRY_SLOTS(name.len);
 	unsigned int nbucket, nblock;
 	unsigned int bidx, end_block;
 	struct page *dentry_page;
 	struct f2fs_dir_entry *de = NULL;
 	bool room = false;
 	int max_slots;
+	f2fs_hash_t namehash;
+
+	namehash = f2fs_dentry_hash(&name);
 
 	f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
 
@@ -177,13 +183,14 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 
 	for (; bidx < end_block; bidx++) {
 		/* no need to allocate new dentry pages to all the indices */
-		dentry_page = find_data_page(dir, bidx, true);
+		dentry_page = find_data_page(dir, bidx);
 		if (IS_ERR(dentry_page)) {
 			room = true;
 			continue;
 		}
 
-		de = find_in_block(dentry_page, name, &max_slots, res_page);
+		de = find_in_block(dentry_page, fname, namehash, &max_slots,
+								res_page);
 		if (de)
 			break;
 
@@ -211,30 +218,34 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 {
 	unsigned long npages = dir_blocks(dir);
 	struct f2fs_dir_entry *de = NULL;
-	f2fs_hash_t name_hash;
 	unsigned int max_depth;
 	unsigned int level;
+	struct f2fs_filename fname;
+	int err;
 
 	*res_page = NULL;
 
-	if (f2fs_has_inline_dentry(dir))
-		return find_in_inline_dir(dir, child, res_page);
+	err = f2fs_fname_setup_filename(dir, child, 1, &fname);
+	if (err)
+		return NULL;
+
+	if (f2fs_has_inline_dentry(dir)) {
+		de = find_in_inline_dir(dir, &fname, res_page);
+		goto out;
+	}
 
 	if (npages == 0)
-		return NULL;
+		goto out;
 
-	name_hash = f2fs_dentry_hash(child);
 	max_depth = F2FS_I(dir)->i_current_depth;
 
 	for (level = 0; level < max_depth; level++) {
-		de = find_in_level(dir, level, child, name_hash, res_page);
+		de = find_in_level(dir, level, &fname, res_page);
 		if (de)
 			break;
 	}
-	if (!de && F2FS_I(dir)->chash != name_hash) {
-		F2FS_I(dir)->chash = name_hash;
-		F2FS_I(dir)->clevel = level - 1;
-	}
+out:
+	f2fs_fname_free_filename(&fname);
 	return de;
 }
 
@@ -303,10 +314,14 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
 	set_page_dirty(ipage);
 }
 
-int update_dent_inode(struct inode *inode, const struct qstr *name)
+int update_dent_inode(struct inode *inode, struct inode *to,
+					const struct qstr *name)
 {
 	struct page *page;
 
+	if (file_enc_name(to))
+		return 0;
+
 	page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
@@ -356,7 +371,7 @@ static int make_empty_dir(struct inode *inode,
 
 	dentry_blk = kmap_atomic(dentry_page);
 
-	make_dentry_ptr(&d, (void *)dentry_blk, 1);
+	make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
 	do_make_empty_dir(inode, parent, &d);
 
 	kunmap_atomic(dentry_blk);
@@ -390,6 +405,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
 		err = f2fs_init_security(inode, dir, name, page);
 		if (err)
 			goto put_error;
+
+		if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
+			err = f2fs_inherit_context(dir, inode, page);
+			if (err)
+				goto put_error;
+		}
 	} else {
 		page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
 		if (IS_ERR(page))
@@ -501,24 +522,33 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
 	unsigned long bidx, block;
 	f2fs_hash_t dentry_hash;
 	unsigned int nbucket, nblock;
-	size_t namelen = name->len;
 	struct page *dentry_page = NULL;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct f2fs_dentry_ptr d;
-	int slots = GET_DENTRY_SLOTS(namelen);
 	struct page *page = NULL;
-	int err = 0;
+	struct f2fs_filename fname;
+	struct qstr new_name;
+	int slots, err;
+
+	err = f2fs_fname_setup_filename(dir, name, 0, &fname);
+	if (err)
+		return err;
+
+	new_name.name = fname_name(&fname);
+	new_name.len = fname_len(&fname);
 
 	if (f2fs_has_inline_dentry(dir)) {
-		err = f2fs_add_inline_entry(dir, name, inode, ino, mode);
+		err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
 		if (!err || err != -EAGAIN)
-			return err;
+			goto out;
 		else
 			err = 0;
 	}
 
-	dentry_hash = f2fs_dentry_hash(name);
 	level = 0;
+	slots = GET_DENTRY_SLOTS(new_name.len);
+	dentry_hash = f2fs_dentry_hash(&new_name);
+
 	current_depth = F2FS_I(dir)->i_current_depth;
 	if (F2FS_I(dir)->chash == dentry_hash) {
 		level = F2FS_I(dir)->clevel;
@@ -526,8 +556,10 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
 	}
 
 start:
-	if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
-		return -ENOSPC;
+	if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) {
+		err = -ENOSPC;
+		goto out;
+	}
 
 	/* Increase the depth, if required */
 	if (level == current_depth)
@@ -541,8 +573,10 @@ start:
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
 		dentry_page = get_new_data_page(dir, NULL, block, true);
-		if (IS_ERR(dentry_page))
-			return PTR_ERR(dentry_page);
+		if (IS_ERR(dentry_page)) {
+			err = PTR_ERR(dentry_page);
+			goto out;
+		}
 
 		dentry_blk = kmap(dentry_page);
 		bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
@@ -562,15 +596,17 @@ add_dentry:
 
 	if (inode) {
 		down_write(&F2FS_I(inode)->i_sem);
-		page = init_inode_metadata(inode, dir, name, NULL);
+		page = init_inode_metadata(inode, dir, &new_name, NULL);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			goto fail;
 		}
+		if (f2fs_encrypted_inode(dir))
+			file_set_enc_name(inode);
 	}
 
-	make_dentry_ptr(&d, (void *)dentry_blk, 1);
-	f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos);
+	make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+	f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos);
 
 	set_page_dirty(dentry_page);
 
@@ -592,6 +628,8 @@ fail:
 	}
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
+out:
+	f2fs_fname_free_filename(&fname);
 	return err;
 }
 
@@ -729,11 +767,12 @@ bool f2fs_empty_dir(struct inode *dir)
 }
 
 bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
-						unsigned int start_pos)
+				unsigned int start_pos, struct f2fs_str *fstr)
 {
 	unsigned char d_type = DT_UNKNOWN;
 	unsigned int bit_pos;
 	struct f2fs_dir_entry *de = NULL;
+	struct f2fs_str de_name = FSTR_INIT(NULL, 0);
 
 	bit_pos = ((unsigned long)ctx->pos % d->max);
 
@@ -747,8 +786,24 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 			d_type = f2fs_filetype_table[de->file_type];
 		else
 			d_type = DT_UNKNOWN;
-		if (!dir_emit(ctx, d->filename[bit_pos],
-					le16_to_cpu(de->name_len),
+
+		/* encrypted case */
+		de_name.name = d->filename[bit_pos];
+		de_name.len = le16_to_cpu(de->name_len);
+
+		if (f2fs_encrypted_inode(d->inode)) {
+			int save_len = fstr->len;
+			int ret;
+
+			ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
+							&de_name, fstr);
+			de_name = *fstr;
+			fstr->len = save_len;
+			if (ret < 0)
+				return true;
+		}
+
+		if (!dir_emit(ctx, de_name.name, de_name.len,
 					le32_to_cpu(de->ino), d_type))
 			return true;
 
@@ -767,9 +822,24 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 	struct file_ra_state *ra = &file->f_ra;
 	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
 	struct f2fs_dentry_ptr d;
+	struct f2fs_str fstr = FSTR_INIT(NULL, 0);
+	int err = 0;
 
-	if (f2fs_has_inline_dentry(inode))
-		return f2fs_read_inline_dir(file, ctx);
+	if (f2fs_encrypted_inode(inode)) {
+		err = f2fs_get_encryption_info(inode);
+		if (err)
+			return err;
+
+		err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN,
+								&fstr);
+		if (err < 0)
+			return err;
+	}
+
+	if (f2fs_has_inline_dentry(inode)) {
+		err = f2fs_read_inline_dir(file, ctx, &fstr);
+		goto out;
+	}
 
 	/* readahead for multi pages of dir */
 	if (npages - n > 1 && !ra_has_index(ra, n))
@@ -783,9 +853,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 
 		dentry_blk = kmap(dentry_page);
 
-		make_dentry_ptr(&d, (void *)dentry_blk, 1);
+		make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
 
-		if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK))
+		if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr))
 			goto stop;
 
 		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -798,8 +868,9 @@ stop:
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
 	}
-
-	return 0;
+out:
+	f2fs_fname_crypto_free_buffer(&fstr);
+	return err;
 }
 
 const struct file_operations f2fs_dir_operations = {
@@ -808,4 +879,7 @@ const struct file_operations f2fs_dir_operations = {
 	.iterate	= f2fs_readdir,
 	.fsync		= f2fs_sync_file,
 	.unlocked_ioctl	= f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = f2fs_compat_ioctl,
+#endif
 };
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8de34ab6d..a8327ed73 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -70,6 +70,15 @@ struct f2fs_mount_info {
 	unsigned int	opt;
 };
 
+#define F2FS_FEATURE_ENCRYPT	0x0001
+
+#define F2FS_HAS_FEATURE(sb, mask)					\
+	((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_SET_FEATURE(sb, mask)					\
+	F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)
+#define F2FS_CLEAR_FEATURE(sb, mask)					\
+	F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
+
 #define CRCPOLY_LE 0xedb88320
 
 static inline __u32 f2fs_crc32(void *buf, size_t len)
@@ -110,6 +119,8 @@ enum {
 #define DEF_BATCHED_TRIM_SECTIONS	32
 #define BATCHED_TRIM_SEGMENTS(sbi)	\
 		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+#define BATCHED_TRIM_BLOCKS(sbi)	\
+		(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
 
 struct cp_control {
 	int reason;
@@ -218,6 +229,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
+#define F2FS_IOC_SET_ENCRYPTION_POLICY					\
+		_IOR('f', 19, struct f2fs_encryption_policy)
+#define F2FS_IOC_GET_ENCRYPTION_PWSALT					\
+		_IOW('f', 20, __u8[16])
+#define F2FS_IOC_GET_ENCRYPTION_POLICY					\
+		_IOW('f', 21, struct f2fs_encryption_policy)
+
 /*
  * should be same as XFS_IOC_GOINGDOWN.
  * Flags for going down operation used by FS_IOC_GOINGDOWN
@@ -239,16 +257,38 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
  * For INODE and NODE manager
  */
 /* for directory operations */
+struct f2fs_str {
+	unsigned char *name;
+	u32 len;
+};
+
+struct f2fs_filename {
+	const struct qstr *usr_fname;
+	struct f2fs_str disk_name;
+	f2fs_hash_t hash;
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	struct f2fs_str crypto_buf;
+#endif
+};
+
+#define FSTR_INIT(n, l)		{ .name = n, .len = l }
+#define FSTR_TO_QSTR(f)		QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p)		((p)->disk_name.name)
+#define fname_len(p)		((p)->disk_name.len)
+
 struct f2fs_dentry_ptr {
+	struct inode *inode;
 	const void *bitmap;
 	struct f2fs_dir_entry *dentry;
 	__u8 (*filename)[F2FS_SLOT_LEN];
 	int max;
 };
 
-static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d,
-					void *src, int type)
+static inline void make_dentry_ptr(struct inode *inode,
+		struct f2fs_dentry_ptr *d, void *src, int type)
 {
+	d->inode = inode;
+
 	if (type == 1) {
 		struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
 		d->max = NR_DENTRY_IN_BLOCK;
@@ -315,10 +355,51 @@ struct extent_tree {
 };
 
 /*
+ * This structure is taken from ext4_map_blocks.
+ *
+ * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks().
+ */
+#define F2FS_MAP_NEW		(1 << BH_New)
+#define F2FS_MAP_MAPPED		(1 << BH_Mapped)
+#define F2FS_MAP_UNWRITTEN	(1 << BH_Unwritten)
+#define F2FS_MAP_FLAGS		(F2FS_MAP_NEW | F2FS_MAP_MAPPED |\
+				F2FS_MAP_UNWRITTEN)
+
+struct f2fs_map_blocks {
+	block_t m_pblk;
+	block_t m_lblk;
+	unsigned int m_len;
+	unsigned int m_flags;
+};
+
+/*
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
  */
 #define FADVISE_COLD_BIT	0x01
 #define FADVISE_LOST_PINO_BIT	0x02
+#define FADVISE_ENCRYPT_BIT	0x04
+#define FADVISE_ENC_NAME_BIT	0x08
+
+#define file_is_cold(inode)	is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode)	is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode)	set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode)	set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode)	clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode)	clear_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_is_encrypt(inode)	is_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_set_encrypt(inode)	set_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_enc_name(inode)	is_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+
+/* Encryption algorithms */
+#define F2FS_ENCRYPTION_MODE_INVALID		0
+#define F2FS_ENCRYPTION_MODE_AES_256_XTS	1
+#define F2FS_ENCRYPTION_MODE_AES_256_GCM	2
+#define F2FS_ENCRYPTION_MODE_AES_256_CBC	3
+#define F2FS_ENCRYPTION_MODE_AES_256_CTS	4
+
+#include "f2fs_crypto.h"
 
 #define DEF_DIR_LEVEL		0
 
@@ -346,6 +427,11 @@ struct f2fs_inode_info {
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	/* Encryption params */
+	struct f2fs_crypt_info *i_crypt_info;
+#endif
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -571,9 +657,12 @@ enum page_type {
 };
 
 struct f2fs_io_info {
+	struct f2fs_sb_info *sbi;	/* f2fs_sb_info pointer */
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
 	block_t blk_addr;	/* block address to be written */
+	struct page *page;	/* page to be written */
+	struct page *encrypted_page;	/* encrypted page */
 };
 
 #define is_read_io(rw)	(((rw) & 1) == READ)
@@ -666,6 +755,7 @@ struct f2fs_sb_info {
 	block_t user_block_count;		/* # of user blocks */
 	block_t total_valid_block_count;	/* # of valid blocks */
 	block_t alloc_valid_block_count;	/* # of allocated blocks */
+	block_t discard_blks;			/* discard command candidats */
 	block_t last_valid_block_count;		/* for recovery */
 	u32 s_next_generation;			/* for NFS support */
 	atomic_t nr_pages[NR_COUNT_TYPE];	/* # of pages, see count_type */
@@ -1193,6 +1283,24 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr)
 	return mask & *addr;
 }
 
+static inline void f2fs_set_bit(unsigned int nr, char *addr)
+{
+	int mask;
+
+	addr += (nr >> 3);
+	mask = 1 << (7 - (nr & 0x07));
+	*addr |= mask;
+}
+
+static inline void f2fs_clear_bit(unsigned int nr, char *addr)
+{
+	int mask;
+
+	addr += (nr >> 3);
+	mask = 1 << (7 - (nr & 0x07));
+	*addr &= ~mask;
+}
+
 static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
 {
 	int mask;
@@ -1391,6 +1499,21 @@ static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
 		kunmap(page);
 }
 
+static inline int is_file(struct inode *inode, int type)
+{
+	return F2FS_I(inode)->i_advise & type;
+}
+
+static inline void set_file(struct inode *inode, int type)
+{
+	F2FS_I(inode)->i_advise |= type;
+}
+
+static inline void clear_file(struct inode *inode, int type)
+{
+	F2FS_I(inode)->i_advise &= ~type;
+}
+
 static inline int f2fs_readonly(struct super_block *sb)
 {
 	return sb->s_flags & MS_RDONLY;
@@ -1407,6 +1530,17 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
 	sbi->sb->s_flags |= MS_RDONLY;
 }
 
+static inline bool is_dot_dotdot(const struct qstr *str)
+{
+	if (str->len == 1 && str->name[0] == '.')
+		return true;
+
+	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+		return true;
+
+	return false;
+}
+
 #define get_inode_mode(i) \
 	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
 	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1453,10 +1587,11 @@ struct dentry *f2fs_get_parent(struct dentry *child);
  */
 extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
 void set_de_type(struct f2fs_dir_entry *, umode_t);
-struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *,
-			struct f2fs_dentry_ptr *);
+
+struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *,
+			f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
 bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
-			unsigned int);
+			unsigned int, struct f2fs_str *);
 void do_make_empty_dir(struct inode *, struct inode *,
 			struct f2fs_dentry_ptr *);
 struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -1470,7 +1605,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
 ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
 void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 				struct page *, struct inode *);
-int update_dent_inode(struct inode *, const struct qstr *);
+int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
 void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
 			const struct qstr *, f2fs_hash_t , unsigned int);
 int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
@@ -1478,7 +1613,6 @@ int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
 							struct inode *);
 int f2fs_do_tmpfile(struct inode *, struct inode *);
-int f2fs_make_empty(struct inode *, struct inode *);
 bool f2fs_empty_dir(struct inode *);
 
 static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
@@ -1490,6 +1624,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
 /*
  * super.c
  */
+int f2fs_commit_super(struct f2fs_sb_info *, bool);
 int f2fs_sync_fs(struct super_block *, int);
 extern __printf(3, 4)
 void f2fs_msg(struct super_block *, const char *, const char *, ...);
@@ -1506,8 +1641,8 @@ struct dnode_of_data;
 struct node_info;
 
 bool available_free_memory(struct f2fs_sb_info *, int);
+int need_dentry_mark(struct f2fs_sb_info *, nid_t);
 bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
 bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
 void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
@@ -1548,21 +1683,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *);
 void destroy_flush_cmd_control(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
 void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
-void clear_prefree_segments(struct f2fs_sb_info *);
+void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
 void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
 int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
+void update_meta_page(struct f2fs_sb_info *, void *, block_t);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
-void write_node_page(struct f2fs_sb_info *, struct page *,
-				unsigned int, struct f2fs_io_info *);
-void write_data_page(struct page *, struct dnode_of_data *,
-			struct f2fs_io_info *);
-void rewrite_data_page(struct page *, struct f2fs_io_info *);
-void recover_data_page(struct f2fs_sb_info *, struct page *,
-				struct f2fs_summary *, block_t, block_t);
+void write_node_page(unsigned int, struct f2fs_io_info *);
+void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
+void rewrite_data_page(struct f2fs_io_info *);
+void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
+				block_t, block_t, unsigned char, bool);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
 		block_t, block_t *, struct f2fs_summary *, int);
 void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1581,6 +1715,7 @@ void destroy_segment_manager_caches(void);
  */
 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
 int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
 void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
@@ -1607,10 +1742,8 @@ void destroy_checkpoint_caches(void);
  * data.c
  */
 void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
-						struct f2fs_io_info *);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
-						struct f2fs_io_info *);
+int f2fs_submit_page_bio(struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_io_info *);
 void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
@@ -1619,10 +1752,11 @@ void f2fs_destroy_extent_tree(struct inode *);
 void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
 void f2fs_update_extent_cache(struct dnode_of_data *);
 void f2fs_preserve_extent_tree(struct inode *);
-struct page *find_data_page(struct inode *, pgoff_t, bool);
+struct page *get_read_data_page(struct inode *, pgoff_t, int);
+struct page *find_data_page(struct inode *, pgoff_t);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
-int do_write_data_page(struct page *, struct f2fs_io_info *);
+int do_write_data_page(struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
 void init_extent_cache_info(struct f2fs_sb_info *);
 int __init create_extent_cache(void);
@@ -1787,13 +1921,15 @@ extern const struct address_space_operations f2fs_node_aops;
 extern const struct address_space_operations f2fs_meta_aops;
 extern const struct inode_operations f2fs_dir_inode_operations;
 extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
 extern const struct inode_operations f2fs_special_inode_operations;
 extern struct kmem_cache *inode_entry_slab;
 
 /*
  * inline.c
  */
-bool f2fs_may_inline(struct inode *);
+bool f2fs_may_inline_data(struct inode *);
+bool f2fs_may_inline_dentry(struct inode *);
 void read_inline_data(struct page *, struct page *);
 bool truncate_inline_inode(struct page *, u64);
 int f2fs_read_inline_data(struct inode *, struct page *);
@@ -1801,8 +1937,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
 int f2fs_convert_inline_inode(struct inode *);
 int f2fs_write_inline_data(struct inode *, struct page *);
 bool recover_inline_data(struct inode *, struct page *);
-struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
-							struct page **);
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
+				struct f2fs_filename *, struct page **);
 struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
 int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
 int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
@@ -1810,5 +1946,137 @@ int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
 void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 						struct inode *, struct inode *);
 bool f2fs_empty_inline_dir(struct inode *);
-int f2fs_read_inline_dir(struct file *, struct dir_context *);
+int f2fs_read_inline_dir(struct file *, struct dir_context *,
+						struct f2fs_str *);
+
+/*
+ * crypto support
+ */
+static inline int f2fs_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	return file_is_encrypt(inode);
+#else
+	return 0;
+#endif
+}
+
+static inline void f2fs_set_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	file_set_encrypt(inode);
+#endif
+}
+
+static inline bool f2fs_bio_encrypted(struct bio *bio)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	return unlikely(bio->bi_private != NULL);
+#else
+	return false;
+#endif
+}
+
+static inline int f2fs_sb_has_crypto(struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
+#else
+	return 0;
+#endif
+}
+
+static inline bool f2fs_may_encrypt(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	mode_t mode = inode->i_mode;
+
+	return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
+#else
+	return 0;
+#endif
+}
+
+/* crypto_policy.c */
+int f2fs_is_child_context_consistent_with_parent(struct inode *,
+							struct inode *);
+int f2fs_inherit_context(struct inode *, struct inode *, struct page *);
+int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *);
+int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *);
+
+/* crypt.c */
+extern struct kmem_cache *f2fs_crypt_info_cachep;
+bool f2fs_valid_contents_enc_mode(uint32_t);
+uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t);
+struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *);
+void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *);
+struct page *f2fs_encrypt(struct inode *, struct page *);
+int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *);
+int f2fs_decrypt_one(struct inode *, struct page *);
+void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *);
+
+/* crypto_key.c */
+void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *);
+int _f2fs_get_encryption_info(struct inode *inode);
+
+/* crypto_fname.c */
+bool f2fs_valid_filenames_enc_mode(uint32_t);
+u32 f2fs_fname_crypto_round_up(u32, u32);
+int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
+int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
+			const struct f2fs_str *, struct f2fs_str *);
+int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
+			struct f2fs_str *);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+void f2fs_restore_and_release_control_page(struct page **);
+void f2fs_restore_control_page(struct page *);
+
+int __init f2fs_init_crypto(void);
+int f2fs_crypto_initialize(void);
+void f2fs_exit_crypto(void);
+
+int f2fs_has_encryption_key(struct inode *);
+
+static inline int f2fs_get_encryption_info(struct inode *inode)
+{
+	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+	if (!ci ||
+		(ci->ci_keyring_key &&
+		 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+					       (1 << KEY_FLAG_REVOKED) |
+					       (1 << KEY_FLAG_DEAD)))))
+		return _f2fs_get_encryption_info(inode);
+	return 0;
+}
+
+void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
+int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
+				int lookup, struct f2fs_filename *);
+void f2fs_fname_free_filename(struct f2fs_filename *);
+#else
+static inline void f2fs_restore_and_release_control_page(struct page **p) { }
+static inline void f2fs_restore_control_page(struct page *p) { }
+
+static inline int __init f2fs_init_crypto(void) { return 0; }
+static inline void f2fs_exit_crypto(void) { }
+
+static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
+static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
+static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
+
+static inline int f2fs_fname_setup_filename(struct inode *dir,
+					const struct qstr *iname,
+					int lookup, struct f2fs_filename *fname)
+{
+	memset(fname, 0, sizeof(struct f2fs_filename));
+	fname->usr_fname = iname;
+	fname->disk_name.name = (unsigned char *)iname->name;
+	fname->disk_name.len = iname->len;
+	return 0;
+}
+
+static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
+#endif
 #endif
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
new file mode 100644
index 000000000..c2c1c2b63
--- /dev/null
+++ b/fs/f2fs/f2fs_crypto.h
@@ -0,0 +1,151 @@
+/*
+ * linux/fs/f2fs/f2fs_crypto.h
+ *
+ * Copied from linux/fs/ext4/ext4_crypto.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption header content for f2fs
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#ifndef _F2FS_CRYPTO_H
+#define _F2FS_CRYPTO_H
+
+#include <linux/fs.h>
+
+#define F2FS_KEY_DESCRIPTOR_SIZE	8
+
+/* Policy provided via an ioctl on the topmost directory */
+struct f2fs_encryption_policy {
+	char version;
+	char contents_encryption_mode;
+	char filenames_encryption_mode;
+	char flags;
+	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
+} __attribute__((__packed__));
+
+#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1	1
+#define F2FS_KEY_DERIVATION_NONCE_SIZE		16
+
+#define F2FS_POLICY_FLAGS_PAD_4		0x00
+#define F2FS_POLICY_FLAGS_PAD_8		0x01
+#define F2FS_POLICY_FLAGS_PAD_16	0x02
+#define F2FS_POLICY_FLAGS_PAD_32	0x03
+#define F2FS_POLICY_FLAGS_PAD_MASK	0x03
+#define F2FS_POLICY_FLAGS_VALID		0x03
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ *  1 byte: Protector format (1 = this version)
+ *  1 byte: File contents encryption mode
+ *  1 byte: File names encryption mode
+ *  1 byte: Flags
+ *  8 bytes: Master Key descriptor
+ *  16 bytes: Encryption Key derivation nonce
+ */
+struct f2fs_encryption_context {
+	char format;
+	char contents_encryption_mode;
+	char filenames_encryption_mode;
+	char flags;
+	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
+	char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
+} __attribute__((__packed__));
+
+/* Encryption parameters */
+#define F2FS_XTS_TWEAK_SIZE 16
+#define F2FS_AES_128_ECB_KEY_SIZE 16
+#define F2FS_AES_256_GCM_KEY_SIZE 32
+#define F2FS_AES_256_CBC_KEY_SIZE 32
+#define F2FS_AES_256_CTS_KEY_SIZE 32
+#define F2FS_AES_256_XTS_KEY_SIZE 64
+#define F2FS_MAX_KEY_SIZE 64
+
+#define F2FS_KEY_DESC_PREFIX "f2fs:"
+#define F2FS_KEY_DESC_PREFIX_SIZE 5
+
+struct f2fs_encryption_key {
+	__u32 mode;
+	char raw[F2FS_MAX_KEY_SIZE];
+	__u32 size;
+} __attribute__((__packed__));
+
+struct f2fs_crypt_info {
+	char		ci_data_mode;
+	char		ci_filename_mode;
+	char		ci_flags;
+	struct crypto_ablkcipher *ci_ctfm;
+	struct key	*ci_keyring_key;
+	char		ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL             0x00000001
+#define F2FS_WRITE_PATH_FL			      0x00000002
+
+struct f2fs_crypto_ctx {
+	union {
+		struct {
+			struct page *bounce_page;       /* Ciphertext page */
+			struct page *control_page;      /* Original page  */
+		} w;
+		struct {
+			struct bio *bio;
+			struct work_struct work;
+		} r;
+		struct list_head free_list;     /* Free list */
+	};
+	char flags;                      /* Flags */
+};
+
+struct f2fs_completion_result {
+	struct completion completion;
+	int res;
+};
+
+#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
+	struct f2fs_completion_result ecr = { \
+		COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int f2fs_encryption_key_size(int mode)
+{
+	switch (mode) {
+	case F2FS_ENCRYPTION_MODE_AES_256_XTS:
+		return F2FS_AES_256_XTS_KEY_SIZE;
+	case F2FS_ENCRYPTION_MODE_AES_256_GCM:
+		return F2FS_AES_256_GCM_KEY_SIZE;
+	case F2FS_ENCRYPTION_MODE_AES_256_CBC:
+		return F2FS_AES_256_CBC_KEY_SIZE;
+	case F2FS_ENCRYPTION_MODE_AES_256_CTS:
+		return F2FS_AES_256_CTS_KEY_SIZE;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+#define F2FS_FNAME_NUM_SCATTER_ENTRIES	4
+#define F2FS_CRYPTO_BLOCK_SIZE		16
+#define F2FS_FNAME_CRYPTO_DIGEST_SIZE	32
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct f2fs_encrypted_symlink_data {
+	__le16 len;
+	char encrypted_path[1];
+} __attribute__((__packed__));
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 encrypted_symlink_data_len(u32 l)
+{
+	return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
+}
+#endif	/* _F2FS_CRYPTO_H */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 2b52e48d7..b0f38c3b3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,6 +20,7 @@
 #include <linux/uaccess.h>
 #include <linux/mount.h>
 #include <linux/pagevec.h>
+#include <linux/random.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -105,7 +106,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 	if (!dentry)
 		return 0;
 
-	if (update_dent_inode(inode, &dentry->d_name)) {
+	if (update_dent_inode(inode, inode, &dentry->d_name)) {
 		dput(dentry);
 		return 0;
 	}
@@ -122,6 +123,8 @@ static inline bool need_do_checkpoint(struct inode *inode)
 
 	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
 		need_cp = true;
+	else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino))
+		need_cp = true;
 	else if (file_wrong_pino(inode))
 		need_cp = true;
 	else if (!space_for_roll_forward(sbi))
@@ -271,7 +274,7 @@ flush_out:
 	ret = f2fs_issue_flush(sbi);
 out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
-	f2fs_trace_ios(NULL, NULL, 1);
+	f2fs_trace_ios(NULL, 1);
 	return ret;
 }
 
@@ -407,6 +410,12 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
 
+	if (f2fs_encrypted_inode(inode)) {
+		int err = f2fs_get_encryption_info(inode);
+		if (err)
+			return 0;
+	}
+
 	/* we don't need to use inline_data strictly */
 	if (f2fs_has_inline_data(inode)) {
 		int err = f2fs_convert_inline_inode(inode);
@@ -419,6 +428,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int f2fs_file_open(struct inode *inode, struct file *filp)
+{
+	int ret = generic_file_open(inode, filp);
+
+	if (!ret && f2fs_encrypted_inode(inode)) {
+		ret = f2fs_get_encryption_info(inode);
+		if (ret)
+			ret = -EACCES;
+	}
+	return ret;
+}
+
 int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 {
 	int nr_free = 0, ofs = dn->ofs_in_node;
@@ -461,28 +482,32 @@ void truncate_data_blocks(struct dnode_of_data *dn)
 }
 
 static int truncate_partial_data_page(struct inode *inode, u64 from,
-								bool force)
+								bool cache_only)
 {
 	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 
-	if (!offset && !force)
+	if (!offset && !cache_only)
 		return 0;
 
-	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force);
-	if (IS_ERR(page))
+	if (cache_only) {
+		page = grab_cache_page(mapping, index);
+		if (page && PageUptodate(page))
+			goto truncate_out;
+		f2fs_put_page(page, 1);
 		return 0;
+	}
 
-	lock_page(page);
-	if (unlikely(!PageUptodate(page) ||
-			page->mapping != inode->i_mapping))
-		goto out;
-
+	page = get_lock_data_page(inode, index);
+	if (IS_ERR(page))
+		return 0;
+truncate_out:
 	f2fs_wait_on_page_writeback(page, DATA);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
-	if (!force)
+	if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
 		set_page_dirty(page);
-out:
 	f2fs_put_page(page, 1);
 	return 0;
 }
@@ -560,7 +585,7 @@ void f2fs_truncate(struct inode *inode)
 	trace_f2fs_truncate(inode);
 
 	/* we should check inline_data size */
-	if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) {
+	if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
 		if (f2fs_convert_inline_inode(inode))
 			return;
 	}
@@ -622,16 +647,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 		return err;
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (attr->ia_size != i_size_read(inode)) {
+		if (f2fs_encrypted_inode(inode) &&
+				f2fs_get_encryption_info(inode))
+			return -EACCES;
+
+		if (attr->ia_size <= i_size_read(inode)) {
 			truncate_setsize(inode, attr->ia_size);
 			f2fs_truncate(inode);
 			f2fs_balance_fs(F2FS_I_SB(inode));
 		} else {
 			/*
-			 * giving a chance to truncate blocks past EOF which
-			 * are fallocated with FALLOC_FL_KEEP_SIZE.
+			 * do not trim all blocks after i_size if target size is
+			 * larger than i_size.
 			 */
-			f2fs_truncate(inode);
+			truncate_setsize(inode, attr->ia_size);
 		}
 	}
 
@@ -718,10 +747,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	/* skip punching hole beyond i_size */
-	if (offset >= inode->i_size)
-		return ret;
-
 	if (f2fs_has_inline_data(inode)) {
 		ret = f2fs_convert_inline_inode(inode);
 		if (ret)
@@ -765,6 +790,320 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	return ret;
 }
 
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int ret = 0;
+
+	f2fs_lock_op(sbi);
+
+	for (; end < nrpages; start++, end++) {
+		block_t new_addr, old_addr;
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
+		if (ret && ret != -ENOENT) {
+			goto out;
+		} else if (ret == -ENOENT) {
+			new_addr = NULL_ADDR;
+		} else {
+			new_addr = dn.data_blkaddr;
+			truncate_data_blocks_range(&dn, 1);
+			f2fs_put_dnode(&dn);
+		}
+
+		if (new_addr == NULL_ADDR) {
+			set_new_dnode(&dn, inode, NULL, NULL, 0);
+			ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
+			if (ret && ret != -ENOENT)
+				goto out;
+			else if (ret == -ENOENT)
+				continue;
+
+			if (dn.data_blkaddr == NULL_ADDR) {
+				f2fs_put_dnode(&dn);
+				continue;
+			} else {
+				truncate_data_blocks_range(&dn, 1);
+			}
+
+			f2fs_put_dnode(&dn);
+		} else {
+			struct page *ipage;
+
+			ipage = get_node_page(sbi, inode->i_ino);
+			if (IS_ERR(ipage)) {
+				ret = PTR_ERR(ipage);
+				goto out;
+			}
+
+			set_new_dnode(&dn, inode, ipage, NULL, 0);
+			ret = f2fs_reserve_block(&dn, start);
+			if (ret)
+				goto out;
+
+			old_addr = dn.data_blkaddr;
+			if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) {
+				dn.data_blkaddr = NULL_ADDR;
+				f2fs_update_extent_cache(&dn);
+				invalidate_blocks(sbi, old_addr);
+
+				dn.data_blkaddr = new_addr;
+				set_data_blkaddr(&dn);
+			} else if (new_addr != NEW_ADDR) {
+				struct node_info ni;
+
+				get_node_info(sbi, dn.nid, &ni);
+				f2fs_replace_block(sbi, &dn, old_addr, new_addr,
+							ni.version, true);
+			}
+
+			f2fs_put_dnode(&dn);
+		}
+	}
+	ret = 0;
+out:
+	f2fs_unlock_op(sbi);
+	return ret;
+}
+
+static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	pgoff_t pg_start, pg_end;
+	loff_t new_size;
+	int ret;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (offset + len >= i_size_read(inode))
+		return -EINVAL;
+
+	/* collapse range should be aligned to block size of f2fs. */
+	if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
+		return -EINVAL;
+
+	pg_start = offset >> PAGE_CACHE_SHIFT;
+	pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+
+	/* write out all dirty pages from offset */
+	ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+	if (ret)
+		return ret;
+
+	truncate_pagecache(inode, offset);
+
+	ret = f2fs_do_collapse(inode, pg_start, pg_end);
+	if (ret)
+		return ret;
+
+	new_size = i_size_read(inode) - len;
+
+	ret = truncate_blocks(inode, new_size, true);
+	if (!ret)
+		i_size_write(inode, new_size);
+
+	return ret;
+}
+
+static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
+								int mode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index, pg_start, pg_end;
+	loff_t new_size = i_size_read(inode);
+	loff_t off_start, off_end;
+	int ret = 0;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	ret = inode_newsize_ok(inode, (len + offset));
+	if (ret)
+		return ret;
+
+	f2fs_balance_fs(sbi);
+
+	if (f2fs_has_inline_data(inode)) {
+		ret = f2fs_convert_inline_inode(inode);
+		if (ret)
+			return ret;
+	}
+
+	ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
+	if (ret)
+		return ret;
+
+	truncate_pagecache_range(inode, offset, offset + len - 1);
+
+	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+	off_start = offset & (PAGE_CACHE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+	if (pg_start == pg_end) {
+		fill_zero(inode, pg_start, off_start, off_end - off_start);
+		if (offset + len > new_size)
+			new_size = offset + len;
+		new_size = max_t(loff_t, new_size, offset + len);
+	} else {
+		if (off_start) {
+			fill_zero(inode, pg_start++, off_start,
+					PAGE_CACHE_SIZE - off_start);
+			new_size = max_t(loff_t, new_size,
+						pg_start << PAGE_CACHE_SHIFT);
+		}
+
+		for (index = pg_start; index < pg_end; index++) {
+			struct dnode_of_data dn;
+			struct page *ipage;
+
+			f2fs_lock_op(sbi);
+
+			ipage = get_node_page(sbi, inode->i_ino);
+			if (IS_ERR(ipage)) {
+				ret = PTR_ERR(ipage);
+				f2fs_unlock_op(sbi);
+				goto out;
+			}
+
+			set_new_dnode(&dn, inode, ipage, NULL, 0);
+			ret = f2fs_reserve_block(&dn, index);
+			if (ret) {
+				f2fs_unlock_op(sbi);
+				goto out;
+			}
+
+			if (dn.data_blkaddr != NEW_ADDR) {
+				invalidate_blocks(sbi, dn.data_blkaddr);
+
+				dn.data_blkaddr = NEW_ADDR;
+				set_data_blkaddr(&dn);
+
+				dn.data_blkaddr = NULL_ADDR;
+				f2fs_update_extent_cache(&dn);
+			}
+			f2fs_put_dnode(&dn);
+			f2fs_unlock_op(sbi);
+
+			new_size = max_t(loff_t, new_size,
+					(index + 1) << PAGE_CACHE_SHIFT);
+		}
+
+		if (off_end) {
+			fill_zero(inode, pg_end, 0, off_end);
+			new_size = max_t(loff_t, new_size, offset + len);
+		}
+	}
+
+out:
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
+		i_size_write(inode, new_size);
+		mark_inode_dirty(inode);
+		update_inode_page(inode);
+	}
+
+	return ret;
+}
+
+static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	pgoff_t pg_start, pg_end, delta, nrpages, idx;
+	loff_t new_size;
+	int ret;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	new_size = i_size_read(inode) + len;
+	if (new_size > inode->i_sb->s_maxbytes)
+		return -EFBIG;
+
+	if (offset >= i_size_read(inode))
+		return -EINVAL;
+
+	/* insert range should be aligned to block size of f2fs. */
+	if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
+		return -EINVAL;
+
+	f2fs_balance_fs(sbi);
+
+	ret = truncate_blocks(inode, i_size_read(inode), true);
+	if (ret)
+		return ret;
+
+	/* write out all dirty pages from offset */
+	ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+	if (ret)
+		return ret;
+
+	truncate_pagecache(inode, offset);
+
+	pg_start = offset >> PAGE_CACHE_SHIFT;
+	pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+	delta = pg_end - pg_start;
+	nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
+		struct dnode_of_data dn;
+		struct page *ipage;
+		block_t new_addr, old_addr;
+
+		f2fs_lock_op(sbi);
+
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA);
+		if (ret && ret != -ENOENT) {
+			goto out;
+		} else if (ret == -ENOENT) {
+			goto next;
+		} else if (dn.data_blkaddr == NULL_ADDR) {
+			f2fs_put_dnode(&dn);
+			goto next;
+		} else {
+			new_addr = dn.data_blkaddr;
+			truncate_data_blocks_range(&dn, 1);
+			f2fs_put_dnode(&dn);
+		}
+
+		ipage = get_node_page(sbi, inode->i_ino);
+		if (IS_ERR(ipage)) {
+			ret = PTR_ERR(ipage);
+			goto out;
+		}
+
+		set_new_dnode(&dn, inode, ipage, NULL, 0);
+		ret = f2fs_reserve_block(&dn, idx + delta);
+		if (ret)
+			goto out;
+
+		old_addr = dn.data_blkaddr;
+		f2fs_bug_on(sbi, old_addr != NEW_ADDR);
+
+		if (new_addr != NEW_ADDR) {
+			struct node_info ni;
+
+			get_node_info(sbi, dn.nid, &ni);
+			f2fs_replace_block(sbi, &dn, old_addr, new_addr,
+							ni.version, true);
+		}
+		f2fs_put_dnode(&dn);
+next:
+		f2fs_unlock_op(sbi);
+	}
+
+	i_size_write(inode, new_size);
+	return 0;
+out:
+	f2fs_unlock_op(sbi);
+	return ret;
+}
+
 static int expand_inode_data(struct inode *inode, loff_t offset,
 					loff_t len, int mode)
 {
@@ -830,23 +1169,40 @@ static long f2fs_fallocate(struct file *file, int mode,
 				loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
-	long ret;
+	long ret = 0;
+
+	if (f2fs_encrypted_inode(inode) &&
+		(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
+		return -EOPNOTSUPP;
 
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+			FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+			FALLOC_FL_INSERT_RANGE))
 		return -EOPNOTSUPP;
 
 	mutex_lock(&inode->i_mutex);
 
-	if (mode & FALLOC_FL_PUNCH_HOLE)
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		if (offset >= inode->i_size)
+			goto out;
+
 		ret = punch_hole(inode, offset, len);
-	else
+	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		ret = f2fs_collapse_range(inode, offset, len);
+	} else if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = f2fs_zero_range(inode, offset, len, mode);
+	} else if (mode & FALLOC_FL_INSERT_RANGE) {
+		ret = f2fs_insert_range(inode, offset, len);
+	} else {
 		ret = expand_inode_data(inode, offset, len, mode);
+	}
 
 	if (!ret) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
 
+out:
 	mutex_unlock(&inode->i_mutex);
 
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
@@ -975,12 +1331,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	if (ret)
 		return ret;
 
-	if (f2fs_is_atomic_file(inode))
+	if (f2fs_is_atomic_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 		commit_inmem_pages(inode, false);
+	}
 
 	ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
 	mnt_drop_write_file(filp);
-	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 	return ret;
 }
 
@@ -1031,15 +1388,13 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	f2fs_balance_fs(F2FS_I_SB(inode));
 
 	if (f2fs_is_atomic_file(inode)) {
-		commit_inmem_pages(inode, false);
 		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+		commit_inmem_pages(inode, false);
 	}
 
-	if (f2fs_is_volatile_file(inode)) {
+	if (f2fs_is_volatile_file(inode))
 		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
-		filemap_fdatawrite(inode->i_mapping);
-		set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
-	}
+
 	mnt_drop_write_file(filp);
 	return ret;
 }
@@ -1109,6 +1464,86 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 	return 0;
 }
 
+static bool uuid_is_nonzero(__u8 u[16])
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		if (u[i])
+			return true;
+	return false;
+}
+
+static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	struct f2fs_encryption_policy policy;
+	struct inode *inode = file_inode(filp);
+
+	if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg,
+				sizeof(policy)))
+		return -EFAULT;
+
+	return f2fs_process_policy(&policy, inode);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	struct f2fs_encryption_policy policy;
+	struct inode *inode = file_inode(filp);
+	int err;
+
+	err = f2fs_get_policy(inode, &policy);
+	if (err)
+		return err;
+
+	if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy,
+							sizeof(policy)))
+		return -EFAULT;
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int err;
+
+	if (!f2fs_sb_has_crypto(inode->i_sb))
+		return -EOPNOTSUPP;
+
+	if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
+		goto got_it;
+
+	err = mnt_want_write_file(filp);
+	if (err)
+		return err;
+
+	/* update superblock with uuid */
+	generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
+
+	err = f2fs_commit_super(sbi, false);
+
+	mnt_drop_write_file(filp);
+	if (err) {
+		/* undo new data */
+		memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+		return err;
+	}
+got_it:
+	if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
+									16))
+		return -EFAULT;
+	return 0;
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1132,11 +1567,29 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_shutdown(filp, arg);
 	case FITRIM:
 		return f2fs_ioc_fitrim(filp, arg);
+	case F2FS_IOC_SET_ENCRYPTION_POLICY:
+		return f2fs_ioc_set_encryption_policy(filp, arg);
+	case F2FS_IOC_GET_ENCRYPTION_POLICY:
+		return f2fs_ioc_get_encryption_policy(filp, arg);
+	case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+		return f2fs_ioc_get_encryption_pwsalt(filp, arg);
 	default:
 		return -ENOTTY;
 	}
 }
 
+static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (f2fs_encrypted_inode(inode) &&
+				!f2fs_has_encryption_key(inode) &&
+				f2fs_get_encryption_info(inode))
+		return -EACCES;
+
+	return generic_file_write_iter(iocb, from);
+}
+
 #ifdef CONFIG_COMPAT
 long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -1157,8 +1610,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 const struct file_operations f2fs_file_operations = {
 	.llseek		= f2fs_llseek,
 	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
-	.open		= generic_file_open,
+	.write_iter	= f2fs_file_write_iter,
+	.open		= f2fs_file_open,
 	.release	= f2fs_release_file,
 	.mmap		= f2fs_file_mmap,
 	.fsync		= f2fs_sync_file,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ed58211fe..22fb5ef37 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -518,12 +518,91 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	return 1;
 }
 
-static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+static void move_encrypted_block(struct inode *inode, block_t bidx)
 {
 	struct f2fs_io_info fio = {
+		.sbi = F2FS_I_SB(inode),
 		.type = DATA,
-		.rw = WRITE_SYNC,
+		.rw = READ_SYNC,
+		.encrypted_page = NULL,
 	};
+	struct dnode_of_data dn;
+	struct f2fs_summary sum;
+	struct node_info ni;
+	struct page *page;
+	int err;
+
+	/* do not read out */
+	page = grab_cache_page(inode->i_mapping, bidx);
+	if (!page)
+		return;
+
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
+	if (err)
+		goto out;
+
+	if (unlikely(dn.data_blkaddr == NULL_ADDR))
+		goto put_out;
+
+	get_node_info(fio.sbi, dn.nid, &ni);
+	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+	/* read page */
+	fio.page = page;
+	fio.blk_addr = dn.data_blkaddr;
+
+	fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr);
+	if (!fio.encrypted_page)
+		goto put_out;
+
+	err = f2fs_submit_page_bio(&fio);
+	if (err)
+		goto put_page_out;
+
+	/* write page */
+	lock_page(fio.encrypted_page);
+
+	if (unlikely(!PageUptodate(fio.encrypted_page)))
+		goto put_page_out;
+	if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi)))
+		goto put_page_out;
+
+	set_page_dirty(fio.encrypted_page);
+	f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+	if (clear_page_dirty_for_io(fio.encrypted_page))
+		dec_page_count(fio.sbi, F2FS_DIRTY_META);
+
+	set_page_writeback(fio.encrypted_page);
+
+	/* allocate block address */
+	f2fs_wait_on_page_writeback(dn.node_page, NODE);
+	allocate_data_block(fio.sbi, NULL, fio.blk_addr,
+					&fio.blk_addr, &sum, CURSEG_COLD_DATA);
+	fio.rw = WRITE_SYNC;
+	f2fs_submit_page_mbio(&fio);
+
+	dn.data_blkaddr = fio.blk_addr;
+	set_data_blkaddr(&dn);
+	f2fs_update_extent_cache(&dn);
+	set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+	if (page->index == 0)
+		set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+put_page_out:
+	f2fs_put_page(fio.encrypted_page, 1);
+put_out:
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_put_page(page, 1);
+}
+
+static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
+{
+	struct page *page;
+
+	page = get_lock_data_page(inode, bidx);
+	if (IS_ERR(page))
+		return;
 
 	if (gc_type == BG_GC) {
 		if (PageWriteback(page))
@@ -531,12 +610,19 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 		set_page_dirty(page);
 		set_cold_data(page);
 	} else {
+		struct f2fs_io_info fio = {
+			.sbi = F2FS_I_SB(inode),
+			.type = DATA,
+			.rw = WRITE_SYNC,
+			.page = page,
+			.encrypted_page = NULL,
+		};
+		set_page_dirty(page);
 		f2fs_wait_on_page_writeback(page, DATA);
-
 		if (clear_page_dirty_for_io(page))
 			inode_dec_dirty_pages(inode);
 		set_cold_data(page);
-		do_write_data_page(page, &fio);
+		do_write_data_page(&fio);
 		clear_cold_data(page);
 	}
 out:
@@ -599,10 +685,16 @@ next_step:
 			if (IS_ERR(inode) || is_bad_inode(inode))
 				continue;
 
-			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+			/* if encrypted inode, let's go phase 3 */
+			if (f2fs_encrypted_inode(inode) &&
+						S_ISREG(inode->i_mode)) {
+				add_gc_inode(gc_list, inode);
+				continue;
+			}
 
-			data_page = find_data_page(inode,
-					start_bidx + ofs_in_node, false);
+			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+			data_page = get_read_data_page(inode,
+					start_bidx + ofs_in_node, READA);
 			if (IS_ERR(data_page)) {
 				iput(inode);
 				continue;
@@ -616,12 +708,12 @@ next_step:
 		/* phase 3 */
 		inode = find_gc_inode(gc_list, dni.ino);
 		if (inode) {
-			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
-			data_page = get_lock_data_page(inode,
-						start_bidx + ofs_in_node);
-			if (IS_ERR(data_page))
-				continue;
-			move_data_page(inode, data_page, gc_type);
+			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode))
+								+ ofs_in_node;
+			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+				move_encrypted_block(inode, start_bidx);
+			else
+				move_data_page(inode, start_bidx, gc_type);
 			stat_inc_data_blk_count(sbi, 1, gc_type);
 		}
 	}
@@ -670,6 +762,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 
 	sum = page_address(sum_page);
 
+	/*
+	 * this is to avoid deadlock:
+	 * - lock_page(sum_page)         - f2fs_replace_block
+	 *  - check_valid_map()            - mutex_lock(sentry_lock)
+	 *   - mutex_lock(sentry_lock)     - change_curseg()
+	 *                                  - lock_page(sum_page)
+	 */
+	unlock_page(sum_page);
+
 	switch (GET_SUM_TYPE((&sum->footer))) {
 	case SUM_TYPE_NODE:
 		gc_node_segment(sbi, sum->entries, segno, gc_type);
@@ -683,7 +784,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
 	stat_inc_call_count(sbi->stat_info);
 
-	f2fs_put_page(sum_page, 1);
+	f2fs_put_page(sum_page, 0);
 }
 
 int f2fs_gc(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index a844fcfb9..71b7206c4 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -79,8 +79,7 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
 	const unsigned char *name = name_info->name;
 	size_t len = name_info->len;
 
-	if ((len <= 2) && (name[0] == '.') &&
-		(name[1] == '.' || name[1] == '\0'))
+	if (is_dot_dotdot(name_info))
 		return 0;
 
 	/* Initialize the default seed for the hash checksum functions */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 8140e4f0e..a13ffcc32 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -13,7 +13,7 @@
 
 #include "f2fs.h"
 
-bool f2fs_may_inline(struct inode *inode)
+bool f2fs_may_inline_data(struct inode *inode)
 {
 	if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
 		return false;
@@ -27,6 +27,20 @@ bool f2fs_may_inline(struct inode *inode)
 	if (i_size_read(inode) > MAX_INLINE_DATA)
 		return false;
 
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		return false;
+
+	return true;
+}
+
+bool f2fs_may_inline_dentry(struct inode *inode)
+{
+	if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY))
+		return false;
+
+	if (!S_ISDIR(inode->i_mode))
+		return false;
+
 	return true;
 }
 
@@ -95,8 +109,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 {
 	void *src_addr, *dst_addr;
 	struct f2fs_io_info fio = {
+		.sbi = F2FS_I_SB(dn->inode),
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
+		.page = page,
+		.encrypted_page = NULL,
 	};
 	int dirty, err;
 
@@ -124,13 +141,15 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 	kunmap_atomic(dst_addr);
 	SetPageUptodate(page);
 no_update:
+	set_page_dirty(page);
+
 	/* clear dirty state */
 	dirty = clear_page_dirty_for_io(page);
 
 	/* write data page to try to make data consistent */
 	set_page_writeback(page);
 	fio.blk_addr = dn->data_blkaddr;
-	write_data_page(page, dn, &fio);
+	write_data_page(dn, &fio);
 	set_data_blkaddr(dn);
 	f2fs_update_extent_cache(dn);
 	f2fs_wait_on_page_writeback(page, DATA);
@@ -267,23 +286,26 @@ process_inline:
 }
 
 struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
-				struct qstr *name, struct page **res_page)
+			struct f2fs_filename *fname, struct page **res_page)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct f2fs_inline_dentry *inline_dentry;
+	struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
 	struct f2fs_dir_entry *de;
 	struct f2fs_dentry_ptr d;
 	struct page *ipage;
+	f2fs_hash_t namehash;
 
 	ipage = get_node_page(sbi, dir->i_ino);
 	if (IS_ERR(ipage))
 		return NULL;
 
-	inline_dentry = inline_data_addr(ipage);
+	namehash = f2fs_dentry_hash(&name);
 
-	make_dentry_ptr(&d, (void *)inline_dentry, 2);
-	de = find_target_dentry(name, NULL, &d);
+	inline_dentry = inline_data_addr(ipage);
 
+	make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+	de = find_target_dentry(fname, namehash, NULL, &d);
 	unlock_page(ipage);
 	if (de)
 		*res_page = ipage;
@@ -325,7 +347,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
 
 	dentry_blk = inline_data_addr(ipage);
 
-	make_dentry_ptr(&d, (void *)dentry_blk, 2);
+	make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
 	do_make_empty_dir(inode, parent, &d);
 
 	set_page_dirty(ipage);
@@ -429,7 +451,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
 	f2fs_wait_on_page_writeback(ipage, NODE);
 
 	name_hash = f2fs_dentry_hash(name);
-	make_dentry_ptr(&d, (void *)dentry_blk, 2);
+	make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
 	f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
 
 	set_page_dirty(ipage);
@@ -506,7 +528,8 @@ bool f2fs_empty_inline_dir(struct inode *dir)
 	return true;
 }
 
-int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx)
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
+				struct f2fs_str *fstr)
 {
 	struct inode *inode = file_inode(file);
 	struct f2fs_inline_dentry *inline_dentry = NULL;
@@ -522,9 +545,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx)
 
 	inline_dentry = inline_data_addr(ipage);
 
-	make_dentry_ptr(&d, (void *)inline_dentry, 2);
+	make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
 
-	if (!f2fs_fill_dentries(ctx, &d, 0))
+	if (!f2fs_fill_dentries(ctx, &d, 0, fstr))
 		ctx->pos = NR_INLINE_DENTRY;
 
 	f2fs_put_page(ipage, 1);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e622ec954..2550868dc 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -198,7 +198,10 @@ make_now:
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
 	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &f2fs_symlink_inode_operations;
+		if (f2fs_encrypted_inode(inode))
+			inode->i_op = &f2fs_encrypted_symlink_inode_operations;
+		else
+			inode->i_op = &f2fs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
 			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -359,6 +362,10 @@ no_delete:
 	if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
 		add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
 out_clear:
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	if (F2FS_I(inode)->i_crypt_info)
+		f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info);
+#endif
 	clear_inode(inode);
 }
 
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 658e8079a..fdbae21ee 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -56,11 +56,18 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 		goto out;
 	}
 
-	if (f2fs_may_inline(inode))
+	/* If the directory encrypted, then we should encrypt the inode. */
+	if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
+		f2fs_set_encrypted_inode(inode);
+
+	if (f2fs_may_inline_data(inode))
 		set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
-	if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode))
+	if (f2fs_may_inline_dentry(inode))
 		set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
 
+	stat_inc_inline_inode(inode);
+	stat_inc_inline_dir(inode);
+
 	trace_f2fs_new_inode(inode, 0);
 	mark_inode_dirty(inode);
 	return inode;
@@ -136,7 +143,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 	alloc_nid_done(sbi, ino);
 
-	stat_inc_inline_inode(inode);
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 
@@ -155,6 +161,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	int err;
 
+	if (f2fs_encrypted_inode(dir) &&
+		!f2fs_is_child_context_consistent_with_parent(dir, inode))
+		return -EPERM;
+
 	f2fs_balance_fs(sbi);
 
 	inode->i_ctime = CURRENT_TIME;
@@ -232,32 +242,34 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	struct f2fs_dir_entry *de;
 	struct page *page;
+	nid_t ino;
+	int err = 0;
 
 	if (dentry->d_name.len > F2FS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
 	de = f2fs_find_entry(dir, &dentry->d_name, &page);
-	if (de) {
-		nid_t ino = le32_to_cpu(de->ino);
-		f2fs_dentry_kunmap(dir, page);
-		f2fs_put_page(page, 0);
+	if (!de)
+		return d_splice_alias(inode, dentry);
 
-		inode = f2fs_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+	ino = le32_to_cpu(de->ino);
+	f2fs_dentry_kunmap(dir, page);
+	f2fs_put_page(page, 0);
 
-		if (f2fs_has_inline_dots(inode)) {
-			int err;
+	inode = f2fs_iget(dir->i_sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
-			err = __recover_dot_dentries(inode, dir->i_ino);
-			if (err) {
-				iget_failed(inode);
-				return ERR_PTR(err);
-			}
-		}
+	if (f2fs_has_inline_dots(inode)) {
+		err = __recover_dot_dentries(inode, dir->i_ino);
+		if (err)
+			goto err_out;
 	}
-
 	return d_splice_alias(inode, dentry);
+
+err_out:
+	iget_failed(inode);
+	return ERR_PTR(err);
 }
 
 static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
@@ -296,19 +308,15 @@ fail:
 	return err;
 }
 
-static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
 {
-	struct page *page = page_follow_link_light(dentry, nd);
-
-	if (IS_ERR_OR_NULL(page))
-		return page;
-
-	/* this is broken symlink case */
-	if (*nd_get_link(nd) == 0) {
-		page_put_link(dentry, nd, page);
-		return ERR_PTR(-ENOENT);
+	const char *link = page_follow_link_light(dentry, cookie);
+	if (!IS_ERR(link) && !*link) {
+		/* this is broken symlink case */
+		page_put_link(NULL, *cookie);
+		link = ERR_PTR(-ENOENT);
 	}
-	return page;
+	return link;
 }
 
 static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
@@ -316,16 +324,26 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
-	size_t symlen = strlen(symname) + 1;
+	size_t len = strlen(symname);
+	size_t p_len;
+	char *p_str;
+	struct f2fs_str disk_link = FSTR_INIT(NULL, 0);
+	struct f2fs_encrypted_symlink_data *sd = NULL;
 	int err;
 
+	if (len > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
 	f2fs_balance_fs(sbi);
 
 	inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &f2fs_symlink_inode_operations;
+	if (f2fs_encrypted_inode(inode))
+		inode->i_op = &f2fs_encrypted_symlink_inode_operations;
+	else
+		inode->i_op = &f2fs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 
 	f2fs_lock_op(sbi);
@@ -333,10 +351,46 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto out;
 	f2fs_unlock_op(sbi);
-
-	err = page_symlink(inode, symname, symlen);
 	alloc_nid_done(sbi, inode->i_ino);
 
+	if (f2fs_encrypted_inode(dir)) {
+		struct qstr istr = QSTR_INIT(symname, len);
+
+		err = f2fs_get_encryption_info(inode);
+		if (err)
+			goto err_out;
+
+		err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link);
+		if (err)
+			goto err_out;
+
+		err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link);
+		if (err < 0)
+			goto err_out;
+
+		p_len = encrypted_symlink_data_len(disk_link.len) + 1;
+
+		if (p_len > dir->i_sb->s_blocksize) {
+			err = -ENAMETOOLONG;
+			goto err_out;
+		}
+
+		sd = kzalloc(p_len, GFP_NOFS);
+		if (!sd) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+		memcpy(sd->encrypted_path, disk_link.name, disk_link.len);
+		sd->len = cpu_to_le16(disk_link.len);
+		p_str = (char *)sd;
+	} else {
+		p_len = len + 1;
+		p_str = (char *)symname;
+	}
+
+	err = page_symlink(inode, p_str, p_len);
+
+err_out:
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 
@@ -349,10 +403,14 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	 * If the symlink path is stored into inline_data, there is no
 	 * performance regression.
 	 */
-	filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1);
+	if (!err)
+		filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
 
 	if (IS_DIRSYNC(dir))
 		f2fs_sync_fs(sbi->sb, 1);
+
+	kfree(sd);
+	f2fs_fname_crypto_free_buffer(&disk_link);
 	return err;
 out:
 	handle_failed_inode(inode);
@@ -383,7 +441,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto out_fail;
 	f2fs_unlock_op(sbi);
 
-	stat_inc_inline_dir(inode);
 	alloc_nid_done(sbi, inode->i_ino);
 
 	d_instantiate(dentry, inode);
@@ -445,19 +502,101 @@ out:
 	return err;
 }
 
+static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
+					umode_t mode, struct inode **whiteout)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct inode *inode;
+	int err;
+
+	if (!whiteout)
+		f2fs_balance_fs(sbi);
+
+	inode = f2fs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (whiteout) {
+		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+		inode->i_op = &f2fs_special_inode_operations;
+	} else {
+		inode->i_op = &f2fs_file_inode_operations;
+		inode->i_fop = &f2fs_file_operations;
+		inode->i_mapping->a_ops = &f2fs_dblock_aops;
+	}
+
+	f2fs_lock_op(sbi);
+	err = acquire_orphan_inode(sbi);
+	if (err)
+		goto out;
+
+	err = f2fs_do_tmpfile(inode, dir);
+	if (err)
+		goto release_out;
+
+	/*
+	 * add this non-linked tmpfile to orphan list, in this way we could
+	 * remove all unused data of tmpfile after abnormal power-off.
+	 */
+	add_orphan_inode(sbi, inode->i_ino);
+	f2fs_unlock_op(sbi);
+
+	alloc_nid_done(sbi, inode->i_ino);
+
+	if (whiteout) {
+		inode_dec_link_count(inode);
+		*whiteout = inode;
+	} else {
+		d_tmpfile(dentry, inode);
+	}
+	unlock_new_inode(inode);
+	return 0;
+
+release_out:
+	release_orphan_inode(sbi);
+out:
+	handle_failed_inode(inode);
+	return err;
+}
+
+static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	if (f2fs_encrypted_inode(dir)) {
+		int err = f2fs_get_encryption_info(dir);
+		if (err)
+			return err;
+	}
+
+	return __f2fs_tmpfile(dir, dentry, mode, NULL);
+}
+
+static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+{
+	return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+}
+
 static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry)
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
+	struct inode *whiteout = NULL;
 	struct page *old_dir_page;
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
 	struct f2fs_dir_entry *old_dir_entry = NULL;
 	struct f2fs_dir_entry *old_entry;
 	struct f2fs_dir_entry *new_entry;
 	int err = -ENOENT;
 
+	if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
+		!f2fs_is_child_context_consistent_with_parent(new_dir,
+							old_inode)) {
+		err = -EPERM;
+		goto out;
+	}
+
 	f2fs_balance_fs(sbi);
 
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -471,17 +610,23 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_old;
 	}
 
+	if (flags & RENAME_WHITEOUT) {
+		err = f2fs_create_whiteout(old_dir, &whiteout);
+		if (err)
+			goto out_dir;
+	}
+
 	if (new_inode) {
 
 		err = -ENOTEMPTY;
 		if (old_dir_entry && !f2fs_empty_dir(new_inode))
-			goto out_dir;
+			goto out_whiteout;
 
 		err = -ENOENT;
 		new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
 						&new_page);
 		if (!new_entry)
-			goto out_dir;
+			goto out_whiteout;
 
 		f2fs_lock_op(sbi);
 
@@ -489,7 +634,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (err)
 			goto put_out_dir;
 
-		if (update_dent_inode(old_inode, &new_dentry->d_name)) {
+		if (update_dent_inode(old_inode, new_inode,
+						&new_dentry->d_name)) {
 			release_orphan_inode(sbi);
 			goto put_out_dir;
 		}
@@ -518,7 +664,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		err = f2fs_add_link(new_dentry, old_inode);
 		if (err) {
 			f2fs_unlock_op(sbi);
-			goto out_dir;
+			goto out_whiteout;
 		}
 
 		if (old_dir_entry) {
@@ -529,6 +675,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	down_write(&F2FS_I(old_inode)->i_sem);
 	file_lost_pino(old_inode);
+	if (new_inode && file_enc_name(new_inode))
+		file_set_enc_name(old_inode);
 	up_write(&F2FS_I(old_inode)->i_sem);
 
 	old_inode->i_ctime = CURRENT_TIME;
@@ -536,8 +684,18 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
 
+	if (whiteout) {
+		whiteout->i_state |= I_LINKABLE;
+		set_inode_flag(F2FS_I(whiteout), FI_INC_LINK);
+		err = f2fs_add_link(old_dentry, whiteout);
+		if (err)
+			goto put_out_dir;
+		whiteout->i_state &= ~I_LINKABLE;
+		iput(whiteout);
+	}
+
 	if (old_dir_entry) {
-		if (old_dir != new_dir) {
+		if (old_dir != new_dir && !whiteout) {
 			f2fs_set_link(old_inode, old_dir_entry,
 						old_dir_page, new_dir);
 			update_inode_page(old_inode);
@@ -558,8 +716,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 put_out_dir:
 	f2fs_unlock_op(sbi);
-	f2fs_dentry_kunmap(new_dir, new_page);
-	f2fs_put_page(new_page, 0);
+	if (new_page) {
+		f2fs_dentry_kunmap(new_dir, new_page);
+		f2fs_put_page(new_page, 0);
+	}
+out_whiteout:
+	if (whiteout)
+		iput(whiteout);
 out_dir:
 	if (old_dir_entry) {
 		f2fs_dentry_kunmap(old_inode, old_dir_page);
@@ -585,6 +748,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int old_nlink = 0, new_nlink = 0;
 	int err = -ENOENT;
 
+	if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
+		(old_dir != new_dir) &&
+		(!f2fs_is_child_context_consistent_with_parent(new_dir,
+								old_inode) ||
+		!f2fs_is_child_context_consistent_with_parent(old_dir,
+								new_inode)))
+		return -EPERM;
+
 	f2fs_balance_fs(sbi);
 
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -631,13 +802,17 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	f2fs_lock_op(sbi);
 
-	err = update_dent_inode(old_inode, &new_dentry->d_name);
+	err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
 	if (err)
 		goto out_unlock;
+	if (file_enc_name(new_inode))
+		file_set_enc_name(old_inode);
 
-	err = update_dent_inode(new_inode, &old_dentry->d_name);
+	err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name);
 	if (err)
 		goto out_undo;
+	if (file_enc_name(old_inode))
+		file_set_enc_name(new_inode);
 
 	/* update ".." directory entry info of old dentry */
 	if (old_dir_entry)
@@ -695,8 +870,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 		f2fs_sync_fs(sbi->sb, 1);
 	return 0;
 out_undo:
-	/* Still we may fail to recover name info of f2fs_inode here */
-	update_dent_inode(old_inode, &old_dentry->d_name);
+	/*
+	 * Still we may fail to recover name info of f2fs_inode here
+	 * Drop it, once its name is set as encrypted
+	 */
+	update_dent_inode(old_inode, old_inode, &old_dentry->d_name);
 out_unlock:
 	f2fs_unlock_op(sbi);
 out_new_dir:
@@ -723,7 +901,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
-	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
 	if (flags & RENAME_EXCHANGE) {
@@ -734,53 +912,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 	 * VFS has already handled the new dentry existence case,
 	 * here, we just deal with "RENAME_NOREPLACE" as regular rename.
 	 */
-	return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry);
+	return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
-static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
 {
-	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
-	struct inode *inode;
-	int err;
-
-	inode = f2fs_new_inode(dir, mode);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
-	inode->i_op = &f2fs_file_inode_operations;
-	inode->i_fop = &f2fs_file_operations;
-	inode->i_mapping->a_ops = &f2fs_dblock_aops;
-
-	f2fs_lock_op(sbi);
-	err = acquire_orphan_inode(sbi);
-	if (err)
-		goto out;
-
-	err = f2fs_do_tmpfile(inode, dir);
-	if (err)
-		goto release_out;
-
-	/*
-	 * add this non-linked tmpfile to orphan list, in this way we could
-	 * remove all unused data of tmpfile after abnormal power-off.
-	 */
-	add_orphan_inode(sbi, inode->i_ino);
-	f2fs_unlock_op(sbi);
-
-	alloc_nid_done(sbi, inode->i_ino);
+	struct page *cpage = NULL;
+	char *caddr, *paddr = NULL;
+	struct f2fs_str cstr;
+	struct f2fs_str pstr = FSTR_INIT(NULL, 0);
+	struct inode *inode = d_inode(dentry);
+	struct f2fs_encrypted_symlink_data *sd;
+	loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
+	u32 max_size = inode->i_sb->s_blocksize;
+	int res;
+
+	res = f2fs_get_encryption_info(inode);
+	if (res)
+		return ERR_PTR(res);
+
+	cpage = read_mapping_page(inode->i_mapping, 0, NULL);
+	if (IS_ERR(cpage))
+		return ERR_CAST(cpage);
+	caddr = kmap(cpage);
+	caddr[size] = 0;
+
+	/* Symlink is encrypted */
+	sd = (struct f2fs_encrypted_symlink_data *)caddr;
+	cstr.name = sd->encrypted_path;
+	cstr.len = le16_to_cpu(sd->len);
 
-	stat_inc_inline_inode(inode);
-	d_tmpfile(dentry, inode);
-	unlock_new_inode(inode);
-	return 0;
+	/* this is broken symlink case */
+	if (cstr.name[0] == 0 && cstr.len == 0) {
+		res = -ENOENT;
+		goto errout;
+	}
 
-release_out:
-	release_orphan_inode(sbi);
-out:
-	handle_failed_inode(inode);
-	return err;
+	if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) >
+								max_size) {
+		/* Symlink data on the disk is corrupted */
+		res = -EIO;
+		goto errout;
+	}
+	res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr);
+	if (res)
+		goto errout;
+
+	res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+	if (res < 0)
+		goto errout;
+
+	paddr = pstr.name;
+
+	/* Null-terminate the name */
+	paddr[res] = '\0';
+
+	kunmap(cpage);
+	page_cache_release(cpage);
+	return *cookie = paddr;
+errout:
+	f2fs_fname_crypto_free_buffer(&pstr);
+	kunmap(cpage);
+	page_cache_release(cpage);
+	return ERR_PTR(res);
 }
 
+const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
+	.readlink       = generic_readlink,
+	.follow_link    = f2fs_encrypted_follow_link,
+	.put_link       = kfree_put_link,
+	.getattr	= f2fs_getattr,
+	.setattr	= f2fs_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= f2fs_listxattr,
+	.removexattr	= generic_removexattr,
+};
+#endif
+
 const struct inode_operations f2fs_dir_inode_operations = {
 	.create		= f2fs_create,
 	.lookup		= f2fs_lookup,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8ab0cf193..7dd63b794 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 							PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == DIRTY_DENTS) {
-		if (sbi->sb->s_bdi->dirty_exceeded)
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
 			return false;
 		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
-		if (sbi->sb->s_bdi->dirty_exceeded)
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
 			return false;
 	}
 	return res;
@@ -195,32 +195,35 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
 							start, nr);
 }
 
-bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
-	bool is_cp = true;
+	bool need = false;
 
 	down_read(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
-	if (e && !get_nat_flag(e, IS_CHECKPOINTED))
-		is_cp = false;
+	if (e) {
+		if (!get_nat_flag(e, IS_CHECKPOINTED) &&
+				!get_nat_flag(e, HAS_FSYNCED_INODE))
+			need = true;
+	}
 	up_read(&nm_i->nat_tree_lock);
-	return is_cp;
+	return need;
 }
 
-bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
-	bool fsynced = false;
+	bool is_cp = true;
 
 	down_read(&nm_i->nat_tree_lock);
-	e = __lookup_nat_cache(nm_i, ino);
-	if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
-		fsynced = true;
+	e = __lookup_nat_cache(nm_i, nid);
+	if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+		is_cp = false;
 	up_read(&nm_i->nat_tree_lock);
-	return fsynced;
+	return is_cp;
 }
 
 bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
@@ -312,7 +315,8 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 	__set_nat_cache_dirty(nm_i, e);
 
 	/* update fsync_mark if its inode nat entry is still alive */
-	e = __lookup_nat_cache(nm_i, ni->ino);
+	if (ni->nid != ni->ino)
+		e = __lookup_nat_cache(nm_i, ni->ino);
 	if (e) {
 		if (fsync_done && ni->nid == ni->ino)
 			set_nat_flag(e, HAS_FSYNCED_INODE, true);
@@ -995,8 +999,11 @@ static int read_node_page(struct page *page, int rw)
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	struct node_info ni;
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = NODE,
 		.rw = rw,
+		.page = page,
+		.encrypted_page = NULL,
 	};
 
 	get_node_info(sbi, page->index, &ni);
@@ -1011,7 +1018,7 @@ static int read_node_page(struct page *page, int rw)
 		return LOCKED_PAGE;
 
 	fio.blk_addr = ni.blk_addr;
-	return f2fs_submit_page_bio(sbi, page, &fio);
+	return f2fs_submit_page_bio(&fio);
 }
 
 /*
@@ -1204,13 +1211,9 @@ continue_unlock:
 			/* called by fsync() */
 			if (ino && IS_DNODE(page)) {
 				set_fsync_mark(page, 1);
-				if (IS_INODE(page)) {
-					if (!is_checkpointed_node(sbi, ino) &&
-						!has_fsynced_inode(sbi, ino))
-						set_dentry_mark(page, 1);
-					else
-						set_dentry_mark(page, 0);
-				}
+				if (IS_INODE(page))
+					set_dentry_mark(page,
+						need_dentry_mark(sbi, ino));
 				nwritten++;
 			} else {
 				set_fsync_mark(page, 0);
@@ -1293,8 +1296,11 @@ static int f2fs_write_node_page(struct page *page,
 	nid_t nid;
 	struct node_info ni;
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = NODE,
 		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+		.page = page,
+		.encrypted_page = NULL,
 	};
 
 	trace_f2fs_writepage(page, NODE);
@@ -1329,7 +1335,7 @@ static int f2fs_write_node_page(struct page *page,
 
 	set_page_writeback(page);
 	fio.blk_addr = ni.blk_addr;
-	write_node_page(sbi, page, nid, &fio);
+	write_node_page(nid, &fio);
 	set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
 	up_read(&sbi->node_write);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c56026f17..7427e956a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -343,28 +343,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
  *  - Mark cold node blocks in their node footer
  *  - Mark cold data pages in page cache
  */
-static inline int is_file(struct inode *inode, int type)
-{
-	return F2FS_I(inode)->i_advise & type;
-}
-
-static inline void set_file(struct inode *inode, int type)
-{
-	F2FS_I(inode)->i_advise |= type;
-}
-
-static inline void clear_file(struct inode *inode, int type)
-{
-	F2FS_I(inode)->i_advise &= ~type;
-}
-
-#define file_is_cold(inode)	is_file(inode, FADVISE_COLD_BIT)
-#define file_wrong_pino(inode)	is_file(inode, FADVISE_LOST_PINO_BIT)
-#define file_set_cold(inode)	set_file(inode, FADVISE_COLD_BIT)
-#define file_lost_pino(inode)	set_file(inode, FADVISE_LOST_PINO_BIT)
-#define file_clear_cold(inode)	clear_file(inode, FADVISE_COLD_BIT)
-#define file_got_pino(inode)	clear_file(inode, FADVISE_LOST_PINO_BIT)
-
 static inline int is_cold_data(struct page *page)
 {
 	return PageChecked(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 8d8ea99f2..24a8c1d4f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -83,6 +83,11 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
 		goto out;
 	}
 
+	if (file_enc_name(inode)) {
+		iput(dir);
+		return 0;
+	}
+
 	name.len = le32_to_cpu(raw_inode->i_namelen);
 	name.name = raw_inode->i_name;
 
@@ -143,6 +148,7 @@ out:
 static void recover_inode(struct inode *inode, struct page *page)
 {
 	struct f2fs_inode *raw = F2FS_INODE(page);
+	char *name;
 
 	inode->i_mode = le16_to_cpu(raw->i_mode);
 	i_size_write(inode, le64_to_cpu(raw->i_size));
@@ -153,8 +159,13 @@ static void recover_inode(struct inode *inode, struct page *page)
 	inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
 	inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
 
+	if (file_enc_name(inode))
+		name = "<encrypted>";
+	else
+		name = F2FS_INODE(page)->i_name;
+
 	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
-			ino_of_node(page), F2FS_INODE(page)->i_name);
+			ino_of_node(page), name);
 }
 
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -174,7 +185,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 	while (1) {
 		struct fsync_inode_entry *entry;
 
-		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+		if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
 			return 0;
 
 		page = get_meta_page(sbi, blkaddr);
@@ -349,7 +360,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	unsigned int start, end;
 	struct dnode_of_data dn;
-	struct f2fs_summary sum;
 	struct node_info ni;
 	int err = 0, recovered = 0;
 
@@ -396,7 +406,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 		dest = datablock_addr(page, dn.ofs_in_node);
 
 		if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
-			dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) {
+			is_valid_blkaddr(sbi, dest, META_POR)) {
 
 			if (src == NULL_ADDR) {
 				err = reserve_new_block(&dn);
@@ -409,13 +419,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 			if (err)
 				goto err;
 
-			set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
-
 			/* write dummy data page */
-			recover_data_page(sbi, NULL, &sum, src, dest);
-			dn.data_blkaddr = dest;
-			set_data_blkaddr(&dn);
-			f2fs_update_extent_cache(&dn);
+			f2fs_replace_block(sbi, &dn, src, dest,
+							ni.version, false);
 			recovered++;
 		}
 		dn.ofs_in_node++;
@@ -454,7 +460,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
 	while (1) {
 		struct fsync_inode_entry *entry;
 
-		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+		if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
 			break;
 
 		ra_meta_pages_cond(sbi, blkaddr);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f93966094..61b97f9cb 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -75,6 +75,14 @@ static inline unsigned long __reverse_ffs(unsigned long word)
 static unsigned long __find_rev_next_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
+	while (!f2fs_test_bit(offset, (unsigned char *)addr))
+		offset++;
+
+	if (offset > size)
+		offset = size;
+
+	return offset;
+#if 0
 	const unsigned long *p = addr + BIT_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG - 1);
 	unsigned long tmp;
@@ -121,11 +129,20 @@ found_first:
 		return result + size;   /* Nope. */
 found_middle:
 	return result + __reverse_ffs(tmp);
+#endif
 }
 
 static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
+	while (f2fs_test_bit(offset, (unsigned char *)addr))
+		offset++;
+
+	if (offset > size)
+		offset = size;
+
+	return offset;
+#if 0
 	const unsigned long *p = addr + BIT_WORD(offset);
 	unsigned long result = offset & ~(BITS_PER_LONG - 1);
 	unsigned long tmp;
@@ -173,6 +190,7 @@ found_first:
 		return result + size;   /* Nope. */
 found_middle:
 	return result + __reverse_ffz(tmp);
+#endif
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
@@ -216,8 +234,10 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 	struct inmem_pages *cur, *tmp;
 	bool submit_bio = false;
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
+		.encrypted_page = NULL,
 	};
 
 	/*
@@ -237,11 +257,13 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 		if (!abort) {
 			lock_page(cur->page);
 			if (cur->page->mapping == inode->i_mapping) {
+				set_page_dirty(cur->page);
 				f2fs_wait_on_page_writeback(cur->page, DATA);
 				if (clear_page_dirty_for_io(cur->page))
 					inode_dec_dirty_pages(inode);
 				trace_f2fs_commit_inmem_page(cur->page, INMEM);
-				do_write_data_page(cur->page, &fio);
+				fio.page = cur->page;
+				do_write_data_page(&fio);
 				submit_bio = true;
 			}
 			f2fs_put_page(cur->page, 1);
@@ -466,22 +488,43 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
 {
 	sector_t start = SECTOR_FROM_BLOCK(blkstart);
 	sector_t len = SECTOR_FROM_BLOCK(blklen);
+	struct seg_entry *se;
+	unsigned int offset;
+	block_t i;
+
+	for (i = blkstart; i < blkstart + blklen; i++) {
+		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
+		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
+
+		if (!f2fs_test_and_set_bit(offset, se->discard_map))
+			sbi->discard_blks--;
+	}
 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
 	return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
 }
 
 void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
 {
-	if (f2fs_issue_discard(sbi, blkaddr, 1)) {
-		struct page *page = grab_meta_page(sbi, blkaddr);
-		/* zero-filled page */
-		set_page_dirty(page);
-		f2fs_put_page(page, 1);
+	int err = -ENOTSUPP;
+
+	if (test_opt(sbi, DISCARD)) {
+		struct seg_entry *se = get_seg_entry(sbi,
+				GET_SEGNO(sbi, blkaddr));
+		unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+		if (f2fs_test_bit(offset, se->discard_map))
+			return;
+
+		err = f2fs_issue_discard(sbi, blkaddr, 1);
 	}
+
+	if (err)
+		update_meta_page(sbi, NULL, blkaddr);
 }
 
 static void __add_discard_entry(struct f2fs_sb_info *sbi,
-		struct cp_control *cpc, unsigned int start, unsigned int end)
+		struct cp_control *cpc, struct seg_entry *se,
+		unsigned int start, unsigned int end)
 {
 	struct list_head *head = &SM_I(sbi)->discard_list;
 	struct discard_entry *new, *last;
@@ -502,7 +545,6 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi,
 	list_add_tail(&new->list, head);
 done:
 	SM_I(sbi)->nr_discards += end - start;
-	cpc->trimmed += end - start;
 }
 
 static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
@@ -512,41 +554,24 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+	unsigned long *discard_map = (unsigned long *)se->discard_map;
 	unsigned long *dmap = SIT_I(sbi)->tmp_map;
 	unsigned int start = 0, end = -1;
 	bool force = (cpc->reason == CP_DISCARD);
 	int i;
 
-	if (!force && (!test_opt(sbi, DISCARD) ||
-			SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
+	if (se->valid_blocks == max_blocks)
 		return;
 
-	if (force && !se->valid_blocks) {
-		struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-		/*
-		 * if this segment is registered in the prefree list, then
-		 * we should skip adding a discard candidate, and let the
-		 * checkpoint do that later.
-		 */
-		mutex_lock(&dirty_i->seglist_lock);
-		if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
-			mutex_unlock(&dirty_i->seglist_lock);
-			cpc->trimmed += sbi->blocks_per_seg;
+	if (!force) {
+		if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
+		    SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
 			return;
-		}
-		mutex_unlock(&dirty_i->seglist_lock);
-
-		__add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg);
-		return;
 	}
 
-	/* zero block will be discarded through the prefree list */
-	if (!se->valid_blocks || se->valid_blocks == max_blocks)
-		return;
-
 	/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
 	for (i = 0; i < entries; i++)
-		dmap[i] = force ? ~ckpt_map[i] :
+		dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] :
 				(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
 
 	while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
@@ -555,11 +580,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 			break;
 
 		end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
-
-		if (force && end - start < cpc->trim_minlen)
-			continue;
-
-		__add_discard_entry(sbi, cpc, start, end);
+		__add_discard_entry(sbi, cpc, se, start, end);
 	}
 }
 
@@ -589,7 +610,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 	mutex_unlock(&dirty_i->seglist_lock);
 }
 
-void clear_prefree_segments(struct f2fs_sb_info *sbi)
+void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct list_head *head = &(SM_I(sbi)->discard_list);
 	struct discard_entry *entry, *this;
@@ -622,7 +643,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
 
 	/* send small discards */
 	list_for_each_entry_safe(entry, this, head, list) {
+		if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen)
+			goto skip;
 		f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
+		cpc->trimmed += entry->len;
+skip:
 		list_del(&entry->list);
 		SM_I(sbi)->nr_discards -= entry->len;
 		kmem_cache_free(discard_entry_slab, entry);
@@ -673,9 +698,13 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 	if (del > 0) {
 		if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
 			f2fs_bug_on(sbi, 1);
+		if (!f2fs_test_and_set_bit(offset, se->discard_map))
+			sbi->discard_blks--;
 	} else {
 		if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
 			f2fs_bug_on(sbi, 1);
+		if (f2fs_test_and_clear_bit(offset, se->discard_map))
+			sbi->discard_blks++;
 	}
 	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
 		se->ckpt_valid_blocks += del;
@@ -769,16 +798,25 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
 	return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
 }
 
-static void write_sum_page(struct f2fs_sb_info *sbi,
-			struct f2fs_summary_block *sum_blk, block_t blk_addr)
+void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
 {
 	struct page *page = grab_meta_page(sbi, blk_addr);
-	void *kaddr = page_address(page);
-	memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
+	void *dst = page_address(page);
+
+	if (src)
+		memcpy(dst, src, PAGE_CACHE_SIZE);
+	else
+		memset(dst, 0, PAGE_CACHE_SIZE);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
 }
 
+static void write_sum_page(struct f2fs_sb_info *sbi,
+			struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+	update_meta_page(sbi, (void *)sum_blk, blk_addr);
+}
+
 static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -1060,8 +1098,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	unsigned int start_segno, end_segno;
 	struct cp_control cpc;
 
-	if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
-						range->len < sbi->blocksize)
+	if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
 		return -EINVAL;
 
 	cpc.trimmed = 0;
@@ -1073,12 +1110,19 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
 						GET_SEGNO(sbi, end);
 	cpc.reason = CP_DISCARD;
-	cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
+	cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
 
 	/* do checkpoint to issue discard commands safely */
 	for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
 		cpc.trim_start = start_segno;
-		cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
+
+		if (sbi->discard_blks == 0)
+			break;
+		else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi))
+			cpc.trim_end = end_segno;
+		else
+			cpc.trim_end = min_t(unsigned int,
+				rounddown(start_segno +
 				BATCHED_TRIM_SEGMENTS(sbi),
 				sbi->segs_per_sec) - 1, end_segno);
 
@@ -1206,84 +1250,95 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	mutex_unlock(&curseg->curseg_mutex);
 }
 
-static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
-			struct f2fs_summary *sum,
-			struct f2fs_io_info *fio)
+static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
 {
-	int type = __get_segment_type(page, fio->type);
+	int type = __get_segment_type(fio->page, fio->type);
 
-	allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
+	allocate_data_block(fio->sbi, fio->page, fio->blk_addr,
+					&fio->blk_addr, sum, type);
 
 	/* writeout dirty page into bdev */
-	f2fs_submit_page_mbio(sbi, page, fio);
+	f2fs_submit_page_mbio(fio);
 }
 
 void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
 {
 	struct f2fs_io_info fio = {
+		.sbi = sbi,
 		.type = META,
 		.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
 		.blk_addr = page->index,
+		.page = page,
+		.encrypted_page = NULL,
 	};
 
 	set_page_writeback(page);
-	f2fs_submit_page_mbio(sbi, page, &fio);
+	f2fs_submit_page_mbio(&fio);
 }
 
-void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
-			unsigned int nid, struct f2fs_io_info *fio)
+void write_node_page(unsigned int nid, struct f2fs_io_info *fio)
 {
 	struct f2fs_summary sum;
+
 	set_summary(&sum, nid, 0, 0);
-	do_write_page(sbi, page, &sum, fio);
+	do_write_page(&sum, fio);
 }
 
-void write_data_page(struct page *page, struct dnode_of_data *dn,
-				struct f2fs_io_info *fio)
+void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
 {
-	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct f2fs_sb_info *sbi = fio->sbi;
 	struct f2fs_summary sum;
 	struct node_info ni;
 
 	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
-	do_write_page(sbi, page, &sum, fio);
+	do_write_page(&sum, fio);
 	dn->data_blkaddr = fio->blk_addr;
 }
 
-void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
+void rewrite_data_page(struct f2fs_io_info *fio)
 {
-	stat_inc_inplace_blocks(F2FS_P_SB(page));
-	f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
+	stat_inc_inplace_blocks(fio->sbi);
+	f2fs_submit_page_mbio(fio);
 }
 
-void recover_data_page(struct f2fs_sb_info *sbi,
-			struct page *page, struct f2fs_summary *sum,
-			block_t old_blkaddr, block_t new_blkaddr)
+static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
+				struct f2fs_summary *sum,
+				block_t old_blkaddr, block_t new_blkaddr,
+				bool recover_curseg)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg;
 	unsigned int segno, old_cursegno;
 	struct seg_entry *se;
 	int type;
+	unsigned short old_blkoff;
 
 	segno = GET_SEGNO(sbi, new_blkaddr);
 	se = get_seg_entry(sbi, segno);
 	type = se->type;
 
-	if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
-		if (old_blkaddr == NULL_ADDR)
-			type = CURSEG_COLD_DATA;
-		else
+	if (!recover_curseg) {
+		/* for recovery flow */
+		if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+			if (old_blkaddr == NULL_ADDR)
+				type = CURSEG_COLD_DATA;
+			else
+				type = CURSEG_WARM_DATA;
+		}
+	} else {
+		if (!IS_CURSEG(sbi, segno))
 			type = CURSEG_WARM_DATA;
 	}
+
 	curseg = CURSEG_I(sbi, type);
 
 	mutex_lock(&curseg->curseg_mutex);
 	mutex_lock(&sit_i->sentry_lock);
 
 	old_cursegno = curseg->segno;
+	old_blkoff = curseg->next_blkoff;
 
 	/* change the current segment */
 	if (segno != curseg->segno) {
@@ -1297,30 +1352,67 @@ void recover_data_page(struct f2fs_sb_info *sbi,
 	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
 	locate_dirty_segment(sbi, old_cursegno);
 
+	if (recover_curseg) {
+		if (old_cursegno != curseg->segno) {
+			curseg->next_segno = old_cursegno;
+			change_curseg(sbi, type, true);
+		}
+		curseg->next_blkoff = old_blkoff;
+	}
+
 	mutex_unlock(&sit_i->sentry_lock);
 	mutex_unlock(&curseg->curseg_mutex);
 }
 
+void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
+				block_t old_addr, block_t new_addr,
+				unsigned char version, bool recover_curseg)
+{
+	struct f2fs_summary sum;
+
+	set_summary(&sum, dn->nid, dn->ofs_in_node, version);
+
+	__f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg);
+
+	dn->data_blkaddr = new_addr;
+	set_data_blkaddr(dn);
+	f2fs_update_extent_cache(dn);
+}
+
 static inline bool is_merged_page(struct f2fs_sb_info *sbi,
 					struct page *page, enum page_type type)
 {
 	enum page_type btype = PAGE_TYPE_OF_BIO(type);
 	struct f2fs_bio_info *io = &sbi->write_io[btype];
 	struct bio_vec *bvec;
+	struct page *target;
 	int i;
 
 	down_read(&io->io_rwsem);
-	if (!io->bio)
-		goto out;
+	if (!io->bio) {
+		up_read(&io->io_rwsem);
+		return false;
+	}
 
 	bio_for_each_segment_all(bvec, io->bio, i) {
-		if (page == bvec->bv_page) {
+
+		if (bvec->bv_page->mapping) {
+			target = bvec->bv_page;
+		} else {
+			struct f2fs_crypto_ctx *ctx;
+
+			/* encrypted page */
+			ctx = (struct f2fs_crypto_ctx *)page_private(
+								bvec->bv_page);
+			target = ctx->w.control_page;
+		}
+
+		if (page == target) {
 			up_read(&io->io_rwsem);
 			return true;
 		}
 	}
 
-out:
 	up_read(&io->io_rwsem);
 	return false;
 }
@@ -1857,8 +1949,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
 		sit_i->sentries[start].ckpt_valid_map
 			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
-		if (!sit_i->sentries[start].cur_valid_map
-				|| !sit_i->sentries[start].ckpt_valid_map)
+		sit_i->sentries[start].discard_map
+			= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+		if (!sit_i->sentries[start].cur_valid_map ||
+				!sit_i->sentries[start].ckpt_valid_map ||
+				!sit_i->sentries[start].discard_map)
 			return -ENOMEM;
 	}
 
@@ -1996,6 +2091,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
 got_it:
 			check_block_count(sbi, start, &sit);
 			seg_info_from_raw_sit(se, &sit);
+
+			/* build discard map only one time */
+			memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+			sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks;
+
 			if (sbi->segs_per_sec > 1) {
 				struct sec_entry *e = get_sec_entry(sbi, start);
 				e->valid_blocks += se->valid_blocks;
@@ -2245,6 +2345,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
 		for (start = 0; start < MAIN_SEGS(sbi); start++) {
 			kfree(sit_i->sentries[start].cur_valid_map);
 			kfree(sit_i->sentries[start].ckpt_valid_map);
+			kfree(sit_i->sentries[start].discard_map);
 		}
 	}
 	kfree(sit_i->tmp_map);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 85d7fa751..79e7b879a 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 /* constant macro */
 #define NULL_SEGNO			((unsigned int)(~0))
@@ -163,6 +164,7 @@ struct seg_entry {
 	 */
 	unsigned short ckpt_valid_blocks;
 	unsigned char *ckpt_valid_map;
+	unsigned char *discard_map;
 	unsigned char type;		/* segment type like CURSEG_XXX_TYPE */
 	unsigned long long mtime;	/* modification time of the segment */
 };
@@ -713,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
  */
 static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 {
-	if (sbi->sb->s_bdi->dirty_exceeded)
+	if (sbi->sb->s_bdi->wb.dirty_exceeded)
 		return 0;
 
 	if (type == DATA)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index b2dd1b01f..a06b0b46f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -258,6 +258,7 @@ static void init_once(void *foo)
 static int parse_options(struct super_block *sb, char *options)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct request_queue *q;
 	substring_t args[MAX_OPT_ARGS];
 	char *p, *name;
 	int arg = 0;
@@ -302,7 +303,14 @@ static int parse_options(struct super_block *sb, char *options)
 				return -EINVAL;
 			break;
 		case Opt_discard:
-			set_opt(sbi, DISCARD);
+			q = bdev_get_queue(sb->s_bdev);
+			if (blk_queue_discard(q)) {
+				set_opt(sbi, DISCARD);
+			} else {
+				f2fs_msg(sb, KERN_WARNING,
+					"mounting with \"discard\" option, but "
+					"the device does not support discard");
+			}
 			break;
 		case Opt_noheap:
 			set_opt(sbi, NOHEAP);
@@ -416,6 +424,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	/* Will be used by directory only */
 	fi->i_dir_level = F2FS_SB(sb)->dir_level;
 
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	fi->i_crypt_info = NULL;
+#endif
 	return &fi->vfs_inode;
 }
 
@@ -428,8 +439,31 @@ static int f2fs_drop_inode(struct inode *inode)
 	 *    - f2fs_gc -> iput -> evict
 	 *       - inode_wait_for_writeback(inode)
 	 */
-	if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+	if (!inode_unhashed(inode) && inode->i_state & I_SYNC) {
+		if (!inode->i_nlink && !is_bad_inode(inode)) {
+			spin_unlock(&inode->i_lock);
+
+			/* some remained atomic pages should discarded */
+			if (f2fs_is_atomic_file(inode))
+				commit_inmem_pages(inode, true);
+
+			sb_start_intwrite(inode->i_sb);
+			i_size_write(inode, 0);
+
+			if (F2FS_HAS_BLOCKS(inode))
+				f2fs_truncate(inode);
+
+			sb_end_intwrite(inode->i_sb);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+			if (F2FS_I(inode)->i_crypt_info)
+				f2fs_free_encryption_info(inode,
+					F2FS_I(inode)->i_crypt_info);
+#endif
+			spin_lock(&inode->i_lock);
+		}
 		return 0;
+	}
 	return generic_drop_inode(inode);
 }
 
@@ -520,7 +554,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 	} else {
 		f2fs_balance_fs(sbi);
 	}
-	f2fs_trace_ios(NULL, NULL, 1);
+	f2fs_trace_ios(NULL, 1);
 
 	return 0;
 }
@@ -658,6 +692,22 @@ static const struct file_operations f2fs_seq_segment_info_fops = {
 	.release = single_release,
 };
 
+static void default_options(struct f2fs_sb_info *sbi)
+{
+	/* init some FS parameters */
+	sbi->active_logs = NR_CURSEG_TYPE;
+
+	set_opt(sbi, BG_GC);
+	set_opt(sbi, INLINE_DATA);
+
+#ifdef CONFIG_F2FS_FS_XATTR
+	set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+	set_opt(sbi, POSIX_ACL);
+#endif
+}
+
 static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -676,7 +726,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	active_logs = sbi->active_logs;
 
 	sbi->mount_opt.opt = 0;
-	sbi->active_logs = NR_CURSEG_TYPE;
+	default_options(sbi);
 
 	/* parse mount options */
 	err = parse_options(sb, data);
@@ -929,29 +979,36 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
  */
 static int read_raw_super_block(struct super_block *sb,
 			struct f2fs_super_block **raw_super,
-			struct buffer_head **raw_super_buf)
+			struct buffer_head **raw_super_buf,
+			int *recovery)
 {
 	int block = 0;
+	struct buffer_head *buffer;
+	struct f2fs_super_block *super;
+	int err = 0;
 
 retry:
-	*raw_super_buf = sb_bread(sb, block);
-	if (!*raw_super_buf) {
+	buffer = sb_bread(sb, block);
+	if (!buffer) {
+		*recovery = 1;
 		f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
 				block + 1);
 		if (block == 0) {
 			block++;
 			goto retry;
 		} else {
-			return -EIO;
+			err = -EIO;
+			goto out;
 		}
 	}
 
-	*raw_super = (struct f2fs_super_block *)
-		((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
+	super = (struct f2fs_super_block *)
+		((char *)(buffer)->b_data + F2FS_SUPER_OFFSET);
 
 	/* sanity checking of raw super */
-	if (sanity_check_raw_super(sb, *raw_super)) {
-		brelse(*raw_super_buf);
+	if (sanity_check_raw_super(sb, super)) {
+		brelse(buffer);
+		*recovery = 1;
 		f2fs_msg(sb, KERN_ERR,
 			"Can't find valid F2FS filesystem in %dth superblock",
 								block + 1);
@@ -959,25 +1016,76 @@ retry:
 			block++;
 			goto retry;
 		} else {
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 	}
 
+	if (!*raw_super) {
+		*raw_super_buf = buffer;
+		*raw_super = super;
+	} else {
+		/* already have a valid superblock */
+		brelse(buffer);
+	}
+
+	/* check the validity of the second superblock */
+	if (block == 0) {
+		block++;
+		goto retry;
+	}
+
+out:
+	/* No valid superblock */
+	if (!*raw_super)
+		return err;
+
 	return 0;
 }
 
+int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
+{
+	struct buffer_head *sbh = sbi->raw_super_buf;
+	sector_t block = sbh->b_blocknr;
+	int err;
+
+	/* write back-up superblock first */
+	sbh->b_blocknr = block ? 0 : 1;
+	mark_buffer_dirty(sbh);
+	err = sync_dirty_buffer(sbh);
+
+	sbh->b_blocknr = block;
+
+	/* if we are in recovery path, skip writing valid superblock */
+	if (recover || err)
+		goto out;
+
+	/* write current valid superblock */
+	mark_buffer_dirty(sbh);
+	err = sync_dirty_buffer(sbh);
+out:
+	clear_buffer_write_io_error(sbh);
+	set_buffer_uptodate(sbh);
+	return err;
+}
+
 static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct f2fs_sb_info *sbi;
-	struct f2fs_super_block *raw_super = NULL;
+	struct f2fs_super_block *raw_super;
 	struct buffer_head *raw_super_buf;
 	struct inode *root;
-	long err = -EINVAL;
+	long err;
 	bool retry = true, need_fsck = false;
 	char *options = NULL;
-	int i;
+	int recovery, i;
 
 try_onemore:
+	err = -EINVAL;
+	raw_super = NULL;
+	raw_super_buf = NULL;
+	recovery = 0;
+
 	/* allocate memory for f2fs-specific super block info */
 	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -989,23 +1097,12 @@ try_onemore:
 		goto free_sbi;
 	}
 
-	err = read_raw_super_block(sb, &raw_super, &raw_super_buf);
+	err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery);
 	if (err)
 		goto free_sbi;
 
 	sb->s_fs_info = sbi;
-	/* init some FS parameters */
-	sbi->active_logs = NR_CURSEG_TYPE;
-
-	set_opt(sbi, BG_GC);
-	set_opt(sbi, INLINE_DATA);
-
-#ifdef CONFIG_F2FS_FS_XATTR
-	set_opt(sbi, XATTR_USER);
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
-	set_opt(sbi, POSIX_ACL);
-#endif
+	default_options(sbi);
 	/* parse mount options */
 	options = kstrdup((const char *)data, GFP_KERNEL);
 	if (data && !options) {
@@ -1148,14 +1245,6 @@ try_onemore:
 		proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
 				 &f2fs_seq_segment_info_fops, sb);
 
-	if (test_opt(sbi, DISCARD)) {
-		struct request_queue *q = bdev_get_queue(sb->s_bdev);
-		if (!blk_queue_discard(q))
-			f2fs_msg(sb, KERN_WARNING,
-					"mounting with \"discard\" option, but "
-					"the device does not support discard");
-	}
-
 	sbi->s_kobj.kset = f2fs_kset;
 	init_completion(&sbi->s_kobj_unregister);
 	err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
@@ -1198,6 +1287,13 @@ try_onemore:
 			goto free_kobj;
 	}
 	kfree(options);
+
+	/* recover broken superblock */
+	if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
+		f2fs_msg(sb, KERN_INFO, "Recover invalid superblock");
+		f2fs_commit_super(sbi, true);
+	}
+
 	return 0;
 
 free_kobj:
@@ -1305,13 +1401,18 @@ static int __init init_f2fs_fs(void)
 		err = -ENOMEM;
 		goto free_extent_cache;
 	}
-	err = register_filesystem(&f2fs_fs_type);
+	err = f2fs_init_crypto();
 	if (err)
 		goto free_kset;
+	err = register_filesystem(&f2fs_fs_type);
+	if (err)
+		goto free_crypto;
 	f2fs_create_root_stats();
 	f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
 	return 0;
 
+free_crypto:
+	f2fs_exit_crypto();
 free_kset:
 	kset_unregister(f2fs_kset);
 free_extent_cache:
@@ -1333,6 +1434,7 @@ static void __exit exit_f2fs_fs(void)
 	remove_proc_entry("fs/f2fs", NULL);
 	f2fs_destroy_root_stats();
 	unregister_filesystem(&f2fs_fs_type);
+	f2fs_exit_crypto();
 	destroy_extent_cache();
 	destroy_checkpoint_caches();
 	destroy_segment_manager_caches();
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 875aa8179..145fb659a 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -80,7 +80,7 @@ out:
 	radix_tree_preload_end();
 }
 
-void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
+void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
 {
 	struct inode *inode;
 	pid_t pid;
@@ -91,8 +91,8 @@ void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
 		return;
 	}
 
-	inode = page->mapping->host;
-	pid = page_private(page);
+	inode = fio->page->mapping->host;
+	pid = page_private(fio->page);
 
 	major = MAJOR(inode->i_sb->s_dev);
 	minor = MINOR(inode->i_sb->s_dev);
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
index 1041dbeb5..67db24ac1 100644
--- a/fs/f2fs/trace.h
+++ b/fs/f2fs/trace.h
@@ -33,12 +33,12 @@ struct last_io_info {
 };
 
 extern void f2fs_trace_pid(struct page *);
-extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
+extern void f2fs_trace_ios(struct f2fs_io_info *, int);
 extern void f2fs_build_trace_ios(void);
 extern void f2fs_destroy_trace_ios(void);
 #else
 #define f2fs_trace_pid(p)
-#define f2fs_trace_ios(p, i, n)
+#define f2fs_trace_ios(i, n)
 #define f2fs_build_trace_ios()
 #define f2fs_destroy_trace_ios()
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 9757f65a0..07449b980 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -584,6 +584,9 @@ static int __f2fs_setxattr(struct inode *inode, int index,
 		inode->i_ctime = CURRENT_TIME;
 		clear_inode_flag(fi, FI_ACL_MODE);
 	}
+	if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
+			!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
+		f2fs_set_encrypted_inode(inode);
 
 	if (ipage)
 		update_inode(inode, ipage);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 969d792ca..71a7100d5 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -35,6 +35,10 @@
 #define F2FS_XATTR_INDEX_LUSTRE			5
 #define F2FS_XATTR_INDEX_SECURITY		6
 #define F2FS_XATTR_INDEX_ADVISE			7
+/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */
+#define F2FS_XATTR_INDEX_ENCRYPTION		9
+
+#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT	"c"
 
 struct f2fs_xattr_header {
 	__le32  h_magic;        /* magic number for identification */
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 442d50a0e..a08f10399 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include "fat.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c06774658..509411dd3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,6 +18,7 @@
 #include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
diff --git a/fs/file.c b/fs/file.c
index 93c5f89c2..6c672ad32 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
 
 	spin_unlock(&files->file_lock);
 	new_fdt = alloc_fdtable(nr);
+
+	/* make sure all __fd_install() have seen resize_in_progress
+	 * or have finished their rcu_read_lock_sched() section.
+	 */
+	if (atomic_read(&files->count) > 1)
+		synchronize_sched();
+
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
-	/*
-	 * Check again since another task may have expanded the fd table while
-	 * we dropped the lock
-	 */
 	cur_fdt = files_fdtable(files);
-	if (nr >= cur_fdt->max_fds) {
-		/* Continue as planned */
-		copy_fdtable(new_fdt, cur_fdt);
-		rcu_assign_pointer(files->fdt, new_fdt);
-		if (cur_fdt != &files->fdtab)
-			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
-	} else {
-		/* Somebody else expanded, so undo our attempt */
-		__free_fdtable(new_fdt);
-	}
+	BUG_ON(nr < cur_fdt->max_fds);
+	copy_fdtable(new_fdt, cur_fdt);
+	rcu_assign_pointer(files->fdt, new_fdt);
+	if (cur_fdt != &files->fdtab)
+		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+	/* coupled with smp_rmb() in __fd_install() */
+	smp_wmb();
 	return 1;
 }
 
@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
  * The files->file_lock should be held on entry, and will be held on exit.
  */
 static int expand_files(struct files_struct *files, int nr)
+	__releases(files->file_lock)
+	__acquires(files->file_lock)
 {
 	struct fdtable *fdt;
+	int expanded = 0;
 
+repeat:
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
-		return 0;
+		return expanded;
 
 	/* Can we expand? */
 	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
+	if (unlikely(files->resize_in_progress)) {
+		spin_unlock(&files->file_lock);
+		expanded = 1;
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		spin_lock(&files->file_lock);
+		goto repeat;
+	}
+
 	/* All good, so we try */
-	return expand_fdtable(files, nr);
+	files->resize_in_progress = true;
+	expanded = expand_fdtable(files, nr);
+	files->resize_in_progress = false;
+
+	wake_up_all(&files->resize_wait);
+	return expanded;
 }
 
 static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->resize_in_progress = false;
+	init_waitqueue_head(&newf->resize_wait);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
+
+	might_sleep();
+	rcu_read_lock_sched();
+
+	while (unlikely(files->resize_in_progress)) {
+		rcu_read_unlock_sched();
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		rcu_read_lock_sched();
+	}
+	/* coupled with smp_wmb() in expand_fdtable() */
+	smp_rmb();
+	fdt = rcu_dereference_sched(files->fdt);
 	BUG_ON(fdt->fd[fd] != NULL);
 	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock_sched();
 }
 
 void fd_install(unsigned int fd, struct file *file)
@@ -635,11 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask)
 	struct file *file;
 
 	rcu_read_lock();
+loop:
 	file = fcheck_files(files, fd);
 	if (file) {
-		/* File object ref couldn't be taken */
-		if ((file->f_mode & mask) || !get_file_rcu(file))
+		/* File object ref couldn't be taken.
+		 * dup2() atomicity guarantee is the reason
+		 * we loop to catch the new file (or NULL pointer)
+		 */
+		if (file->f_mode & mask)
 			file = NULL;
+		else if (!get_file_rcu(file))
+			goto loop;
 	}
 	rcu_read_unlock();
 
diff --git a/fs/file_table.c b/fs/file_table.c
index 3cea02778..ad17e05eb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,12 +20,12 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
-#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/task_work.h>
 #include <linux/ima.h>
+#include <linux/swap.h>
 
 #include <linux/atomic.h>
 
@@ -147,7 +147,6 @@ over:
 	}
 	return ERR_PTR(-ENFILE);
 }
-EXPORT_SYMBOL(get_empty_filp);
 
 /**
  * alloc_file - allocate and initialize a 'struct file'
@@ -309,21 +308,25 @@ void put_filp(struct file *file)
 		file_free(file);
 	}
 }
-EXPORT_SYMBOL(put_filp);
 
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
 { 
-	unsigned long n;
-
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
 
-	/*
-	 * One file with associated inode and dcache is very roughly 1K.
-	 * Per default don't use more than 10% of our memory for files. 
-	 */ 
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+	unsigned long n;
+	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+	memreserve = min(memreserve, totalram_pages - 1);
+	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
-	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 881aa3d21..e3dcb4467 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -50,9 +50,6 @@ extern daddr_t			vxfs_bmap1(struct inode *, long);
 /* vxfs_fshead.c */
 extern int			vxfs_read_fshead(struct super_block *);
 
-/* vxfs_immed.c */
-extern const struct inode_operations vxfs_immed_symlink_iops;
-
 /* vxfs_inode.c */
 extern const struct address_space_operations vxfs_immed_aops;
 extern struct kmem_cache	*vxfs_inode_cachep;
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 8b9229e2c..cb84f0fcc 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -32,29 +32,15 @@
  */
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/namei.h>
 
 #include "vxfs.h"
 #include "vxfs_extern.h"
 #include "vxfs_inode.h"
 
 
-static void *	vxfs_immed_follow_link(struct dentry *, struct nameidata *);
-
 static int	vxfs_immed_readpage(struct file *, struct page *);
 
 /*
- * Inode operations for immed symlinks.
- *
- * Unliked all other operations we do not go through the pagecache,
- * but do all work directly on the inode.
- */
-const struct inode_operations vxfs_immed_symlink_iops = {
-	.readlink =		generic_readlink,
-	.follow_link =		vxfs_immed_follow_link,
-};
-
-/*
  * Address space operations for immed files and directories.
  */
 const struct address_space_operations vxfs_immed_aops = {
@@ -62,26 +48,6 @@ const struct address_space_operations vxfs_immed_aops = {
 };
 
 /**
- * vxfs_immed_follow_link - follow immed symlink
- * @dp:		dentry for the link
- * @np:		pathname lookup data for the current path walk
- *
- * Description:
- *   vxfs_immed_follow_link restarts the pathname lookup with
- *   the data obtained from @dp.
- *
- * Returns:
- *   Zero on success, else a negative error code.
- */
-static void *
-vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np)
-{
-	struct vxfs_inode_info		*vip = VXFS_INO(d_inode(dp));
-	nd_set_link(np, vip->vii_immed.vi_immed);
-	return NULL;
-}
-
-/**
  * vxfs_immed_readpage - read part of an immed inode into pagecache
  * @file:	file context (unused)
  * @page:	page frame to fill in.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 363e3ae25..ef73ed674 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -35,6 +35,7 @@
 #include <linux/pagemap.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/namei.h>
 
 #include "vxfs.h"
 #include "vxfs_inode.h"
@@ -327,8 +328,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 			ip->i_op = &page_symlink_inode_operations;
 			ip->i_mapping->a_ops = &vxfs_aops;
 		} else {
-			ip->i_op = &vxfs_immed_symlink_iops;
-			vip->vii_immed.vi_immed[ip->i_size] = '\0';
+			ip->i_op = &simple_symlink_inode_operations;
+			ip->i_link = vip->vii_immed.vi_immed;
+			nd_terminate_link(ip->i_link, ip->i_size,
+					  sizeof(vip->vii_immed.vi_immed) - 1);
 		}
 	} else
 		init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 99c7f0a37..484b32d32 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = {
 	.iterate =		vxfs_readdir,
 };
 
- 
-static inline u_long
-dir_pages(struct inode *inode)
-{
-	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
- 
 static inline u_long
 dir_blocks(struct inode *ip)
 {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 32a8bbd7a..5fa588e93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include <linux/device.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -34,6 +35,10 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
 
+struct wb_completion {
+	atomic_t		cnt;
+};
+
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -47,13 +52,29 @@ struct wb_writeback_work {
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
+	unsigned int auto_free:1;	/* free on completion */
+	unsigned int single_wait:1;
+	unsigned int single_done:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
-	struct completion *done;	/* set if the caller waits */
+	struct wb_completion *done;	/* set if the caller waits */
 };
 
 /*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro.  Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion().  Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
+	struct wb_completion cmpl = {					\
+		.cnt		= ATOMIC_INIT(1),			\
+	}
+
+
+/*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
  * possible for the worst case time between when an inode has its
@@ -65,35 +86,6 @@ struct wb_writeback_work {
  */
 unsigned int dirtytime_expire_interval = 12 * 60 * 60;
 
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
-	return test_bit(BDI_writeback_running, &bdi->state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-	struct super_block *sb;
-
-	if (!inode)
-		return &noop_backing_dev_info;
-
-	sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-	if (sb_is_blkdev_sb(sb))
-		return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
-	return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
 static inline struct inode *wb_inode(struct list_head *head)
 {
 	return list_entry(head, struct inode, i_wb_list);
@@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head)
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
+{
+	if (wb_has_dirty_io(wb)) {
+		return false;
+	} else {
+		set_bit(WB_has_dirty_io, &wb->state);
+		WARN_ON_ONCE(!wb->avg_write_bandwidth);
+		atomic_long_add(wb->avg_write_bandwidth,
+				&wb->bdi->tot_write_bandwidth);
+		return true;
+	}
+}
+
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 {
-	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(BDI_registered, &bdi->state))
-		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-	spin_unlock_bh(&bdi->wb_lock);
+	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
+		clear_bit(WB_has_dirty_io, &wb->state);
+		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+					&wb->bdi->tot_write_bandwidth) < 0);
+	}
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-			   struct wb_writeback_work *work)
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+				      struct bdi_writeback *wb,
+				      struct list_head *head)
 {
-	trace_writeback_queue(bdi, work);
+	assert_spin_locked(&wb->list_lock);
+
+	list_move(&inode->i_wb_list, head);
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (!test_bit(BDI_registered, &bdi->state)) {
-		if (work->done)
-			complete(work->done);
+	/* dirty_time doesn't count as dirty_io until expiration */
+	if (head != &wb->b_dirty_time)
+		return wb_io_lists_populated(wb);
+
+	wb_io_lists_depopulated(wb);
+	return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+				     struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+
+	list_del_init(&inode->i_wb_list);
+	wb_io_lists_depopulated(wb);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+	spin_lock_bh(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	spin_unlock_bh(&wb->work_lock);
+}
+
+static void wb_queue_work(struct bdi_writeback *wb,
+			  struct wb_writeback_work *work)
+{
+	trace_writeback_queue(wb->bdi, work);
+
+	spin_lock_bh(&wb->work_lock);
+	if (!test_bit(WB_registered, &wb->state)) {
+		if (work->single_wait)
+			work->single_done = 1;
 		goto out_unlock;
 	}
-	list_add_tail(&work->list, &bdi->work_list);
-	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	if (work->done)
+		atomic_inc(&work->done->cnt);
+	list_add_tail(&work->list, &wb->work_list);
+	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 out_unlock:
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
+}
+
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
+ * work items are completed.  Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+				   struct wb_completion *done)
+{
+	atomic_dec(&done->cnt);		/* put down the initial count */
+	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
+
+#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+					/* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
+					/* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
+					/* one round can affect upto 5 slots */
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+
+	if (inode_cgwb_enabled(inode)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		if (page) {
+			memcg_css = mem_cgroup_css_from_page(page);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		} else {
+			/* must pin memcg_css, see wb_get_create() */
+			memcg_css = task_get_css(current, memory_cgrp_id);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+			css_put(memcg_css);
+		}
+	}
+
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * There may be multiple instances of this function racing to
+	 * update the same inode.  Use cmpxchg() to tell the winner.
+	 */
+	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+		wb_put(wb);
+}
+
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
+ * held on entry and is released on return.  The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+	__releases(&inode->i_lock)
+	__acquires(&wb->list_lock)
+{
+	while (true) {
+		struct bdi_writeback *wb = inode_to_wb(inode);
+
+		/*
+		 * inode_to_wb() association is protected by both
+		 * @inode->i_lock and @wb->list_lock but list_lock nests
+		 * outside i_lock.  Drop i_lock and verify that the
+		 * association hasn't changed after acquiring list_lock.
+		 */
+		wb_get(wb);
+		spin_unlock(&inode->i_lock);
+		spin_lock(&wb->list_lock);
+		wb_put(wb);		/* not gonna deref it anymore */
+
+		/* i_wb may have changed inbetween, can't use inode_to_wb() */
+		if (likely(wb == inode->i_wb))
+			return wb;	/* @inode already has ref */
+
+		spin_unlock(&wb->list_lock);
+		cpu_relax();
+		spin_lock(&inode->i_lock);
+	}
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+	__acquires(&wb->list_lock)
+{
+	spin_lock(&inode->i_lock);
+	return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+	struct inode		*inode;
+	struct bdi_writeback	*new_wb;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+	struct inode_switch_wbs_context *isw =
+		container_of(work, struct inode_switch_wbs_context, work);
+	struct inode *inode = isw->inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct bdi_writeback *old_wb = inode->i_wb;
+	struct bdi_writeback *new_wb = isw->new_wb;
+	struct radix_tree_iter iter;
+	bool switched = false;
+	void **slot;
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against mapping->tree_lock.
+	 *
+	 * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+	 * gives us exclusion against all wb related operations on @inode
+	 * including IO list manipulations and stat updates.
+	 */
+	if (old_wb < new_wb) {
+		spin_lock(&old_wb->list_lock);
+		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&new_wb->list_lock);
+		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+	}
+	spin_lock(&inode->i_lock);
+	spin_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * Once I_FREEING is visible under i_lock, the eviction path owns
+	 * the inode and we shouldn't modify ->i_wb_list.
+	 */
+	if (unlikely(inode->i_state & I_FREEING))
+		goto skip_switch;
+
+	/*
+	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
+	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+	 * pages actually under underwriteback.
+	 */
+	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+				   PAGECACHE_TAG_DIRTY) {
+		struct page *page = radix_tree_deref_slot_protected(slot,
+							&mapping->tree_lock);
+		if (likely(page) && PageDirty(page)) {
+			__dec_wb_stat(old_wb, WB_RECLAIMABLE);
+			__inc_wb_stat(new_wb, WB_RECLAIMABLE);
+		}
+	}
+
+	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+				   PAGECACHE_TAG_WRITEBACK) {
+		struct page *page = radix_tree_deref_slot_protected(slot,
+							&mapping->tree_lock);
+		if (likely(page)) {
+			WARN_ON_ONCE(!PageWriteback(page));
+			__dec_wb_stat(old_wb, WB_WRITEBACK);
+			__inc_wb_stat(new_wb, WB_WRITEBACK);
+		}
+	}
+
+	wb_get(new_wb);
+
+	/*
+	 * Transfer to @new_wb's IO list if necessary.  The specific list
+	 * @inode was on is ignored and the inode is put on ->b_dirty which
+	 * is always correct including from ->b_dirty_time.  The transfer
+	 * preserves @inode->dirtied_when ordering.
+	 */
+	if (!list_empty(&inode->i_wb_list)) {
+		struct inode *pos;
+
+		inode_wb_list_del_locked(inode, old_wb);
+		inode->i_wb = new_wb;
+		list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+			if (time_after_eq(inode->dirtied_when,
+					  pos->dirtied_when))
+				break;
+		inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+	} else {
+		inode->i_wb = new_wb;
+	}
+
+	/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
+	inode->i_wb_frn_winner = 0;
+	inode->i_wb_frn_avg_time = 0;
+	inode->i_wb_frn_history = 0;
+	switched = true;
+skip_switch:
+	/*
+	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
+	 */
+	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&new_wb->list_lock);
+	spin_unlock(&old_wb->list_lock);
+
+	if (switched) {
+		wb_wakeup(new_wb);
+		wb_put(old_wb);
+	}
+	wb_put(new_wb);
+
+	iput(inode);
+	kfree(isw);
 }
 
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, enum wb_reason reason)
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+	struct inode_switch_wbs_context *isw = container_of(rcu_head,
+				struct inode_switch_wbs_context, rcu_head);
+
+	/* needs to grab bh-unsafe locks, bounce to work item */
+	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+	schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+
+	/* noop if seems to be already in progress */
+	if (inode->i_state & I_WB_SWITCH)
+		return;
+
+	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw)
+		return;
+
+	/* find and pin the new wb */
+	rcu_read_lock();
+	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+	if (memcg_css)
+		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+	rcu_read_unlock();
+	if (!isw->new_wb)
+		goto out_free;
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	spin_unlock(&inode->i_lock);
+
+	ihold(inode);
+	isw->inode = inode;
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the mapping's
+	 * tree_lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+	return;
+
+out_free:
+	if (isw->new_wb)
+		wb_put(isw->new_wb);
+	kfree(isw);
+}
+
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock.  On
+ * writeback completion, wbc_detach_inode() should be called.  This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+{
+	if (!inode_cgwb_enabled(inode)) {
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
+	wbc->wb = inode_to_wb(inode);
+	wbc->inode = inode;
+
+	wbc->wb_id = wbc->wb->memcg_css->id;
+	wbc->wb_lcand_id = inode->i_wb_frn_winner;
+	wbc->wb_tcand_id = 0;
+	wbc->wb_bytes = 0;
+	wbc->wb_lcand_bytes = 0;
+	wbc->wb_tcand_bytes = 0;
+
+	wb_get(wbc->wb);
+	spin_unlock(&inode->i_lock);
+
+	/*
+	 * A dying wb indicates that the memcg-blkcg mapping has changed
+	 * and a new wb is already serving the memcg.  Switch immediately.
+	 */
+	if (unlikely(wb_dying(wbc->wb)))
+		inode_switch_wbs(inode, wbc->wb_id);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode().  Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on first-use basis severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode.  While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it.  To avoid unnnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm.  In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate).  Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+	struct bdi_writeback *wb = wbc->wb;
+	struct inode *inode = wbc->inode;
+	unsigned long avg_time, max_bytes, max_time;
+	u16 history;
+	int max_id;
+
+	if (!wb)
+		return;
+
+	history = inode->i_wb_frn_history;
+	avg_time = inode->i_wb_frn_avg_time;
+
+	/* pick the winner of this round */
+	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_id;
+		max_bytes = wbc->wb_bytes;
+	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_lcand_id;
+		max_bytes = wbc->wb_lcand_bytes;
+	} else {
+		max_id = wbc->wb_tcand_id;
+		max_bytes = wbc->wb_tcand_bytes;
+	}
+
+	/*
+	 * Calculate the amount of IO time the winner consumed and fold it
+	 * into the running average kept per inode.  If the consumed IO
+	 * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
+	 * deciding whether to switch or not.  This is to prevent one-off
+	 * small dirtiers from skewing the verdict.
+	 */
+	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+				wb->avg_write_bandwidth);
+	if (avg_time)
+		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+	else
+		avg_time = max_time;	/* immediate catch up on first run */
+
+	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+		int slots;
+
+		/*
+		 * The switch verdict is reached if foreign wb's consume
+		 * more than a certain proportion of IO time in a
+		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
+		 * history mask where each bit represents one sixteenth of
+		 * the period.  Determine the number of slots to shift into
+		 * history from @max_time.
+		 */
+		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+		history <<= slots;
+		if (wbc->wb_id != max_id)
+			history |= (1U << slots) - 1;
+
+		/*
+		 * Switch if the current wb isn't the consistent winner.
+		 * If there are multiple closely competing dirtiers, the
+		 * inode may switch across them repeatedly over time, which
+		 * is okay.  The main goal is avoiding keeping an inode on
+		 * the wrong wb for an extended period of time.
+		 */
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+			inode_switch_wbs(inode, max_id);
+	}
+
+	/*
+	 * Multiple instances of this function may race to update the
+	 * following fields but we don't mind occassional inaccuracies.
+	 */
+	inode->i_wb_frn_winner = max_id;
+	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+	inode->i_wb_frn_history = history;
+
+	wb_put(wbc->wb);
+	wbc->wb = NULL;
+}
+
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to written out during the writeback
+ * controlled by @wbc.  Keep the book for foreign inode detection.  See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes)
+{
+	int id;
+
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (!wbc->wb)
+		return;
+
+	rcu_read_lock();
+	id = mem_cgroup_css_from_page(page)->id;
+	rcu_read_unlock();
+
+	if (id == wbc->wb_id) {
+		wbc->wb_bytes += bytes;
+		return;
+	}
+
+	if (id == wbc->wb_lcand_id)
+		wbc->wb_lcand_bytes += bytes;
+
+	/* Boyer-Moore majority vote algorithm */
+	if (!wbc->wb_tcand_bytes)
+		wbc->wb_tcand_id = id;
+	if (id == wbc->wb_tcand_id)
+		wbc->wb_tcand_bytes += bytes;
+	else
+		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+}
+EXPORT_SYMBOL_GPL(wbc_account_io);
+
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested.  @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+	/*
+	 * Once set, ->i_wb never becomes NULL while the inode is alive.
+	 * Start transaction iff ->i_wb is visible.
+	 */
+	if (inode && inode_to_wb_is_valid(inode)) {
+		struct bdi_writeback *wb;
+		bool locked, congested;
+
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
+		congested = wb_congested(wb, cong_bits);
+		unlocked_inode_to_wb_end(inode, locked);
+		return congested;
+	}
+
+	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+/**
+ * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
+ * @bdi: bdi the work item was issued to
+ * @work: work item to wait for
+ *
+ * Wait for the completion of @work which was issued to one of @bdi's
+ * bdi_writeback's.  The caller must have set @work->single_wait before
+ * issuing it.  This wait operates independently fo
+ * wb_wait_for_completion() and also disables automatic freeing of @work.
+ */
+static void wb_wait_for_single_work(struct backing_dev_info *bdi,
+				    struct wb_writeback_work *work)
+{
+	if (WARN_ON_ONCE(!work->single_wait))
+		return;
+
+	wait_event(bdi->wb_waitq, work->single_done);
+
+	/*
+	 * Paired with smp_wmb() in wb_do_writeback() and ensures that all
+	 * modifications to @work prior to assertion of ->single_done is
+	 * visible to the caller once this function returns.
+	 */
+	smp_rmb();
+}
+
+/**
+ * wb_split_bdi_pages - split nr_pages to write according to bandwidth
+ * @wb: target bdi_writeback to split @nr_pages to
+ * @nr_pages: number of pages to write for the whole bdi
+ *
+ * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
+ * relation to the total write bandwidth of all wb's w/ dirty inodes on
+ * @wb->bdi.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+	unsigned long this_bw = wb->avg_write_bandwidth;
+	unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+	if (nr_pages == LONG_MAX)
+		return LONG_MAX;
+
+	/*
+	 * This may be called on clean wb's and proportional distribution
+	 * may not make sense, just use the original @nr_pages in those
+	 * cases.  In general, we wanna err on the side of writing more.
+	 */
+	if (!tot_bw || this_bw >= tot_bw)
+		return nr_pages;
+	else
+		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
+}
+
+/**
+ * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
+ * @wb: target bdi_writeback
+ * @base_work: source wb_writeback_work
+ *
+ * Try to make a clone of @base_work and issue it to @wb.  If cloning
+ * succeeds, %true is returned; otherwise, @base_work is issued directly
+ * and %false is returned.  In the latter case, the caller is required to
+ * wait for @base_work's completion using wb_wait_for_single_work().
+ *
+ * A clone is auto-freed on completion.  @base_work never is.
+ */
+static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
+				    struct wb_writeback_work *base_work)
 {
 	struct wb_writeback_work *work;
 
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		*work = *base_work;
+		work->auto_free = 1;
+		work->single_wait = 0;
+	} else {
+		work = base_work;
+		work->auto_free = 0;
+		work->single_wait = 1;
+	}
+	work->single_done = 0;
+	wb_queue_work(wb, work);
+	return work != base_work;
+}
+
+/**
+ * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
+ * @bdi: target backing_dev_info
+ * @base_work: wb_writeback_work to issue
+ * @skip_if_busy: skip wb's which already have writeback in progress
+ *
+ * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
+ * have dirty inodes.  If @base_work->nr_page isn't %LONG_MAX, it's
+ * distributed to the busy wbs according to each wb's proportion in the
+ * total active write bandwidth of @bdi.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+				  struct wb_writeback_work *base_work,
+				  bool skip_if_busy)
+{
+	long nr_pages = base_work->nr_pages;
+	int next_blkcg_id = 0;
+	struct bdi_writeback *wb;
+	struct wb_iter iter;
+
+	might_sleep();
+restart:
+	rcu_read_lock();
+	bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+		/* SYNC_ALL writes out I_DIRTY_TIME too */
+		if (!wb_has_dirty_io(wb) &&
+		    (base_work->sync_mode == WB_SYNC_NONE ||
+		     list_empty(&wb->b_dirty_time)))
+			continue;
+		if (skip_if_busy && writeback_in_progress(wb))
+			continue;
+
+		base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
+		if (!wb_clone_and_queue_work(wb, base_work)) {
+			next_blkcg_id = wb->blkcg_css->id + 1;
+			rcu_read_unlock();
+			wb_wait_for_single_work(bdi, base_work);
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+	__releases(&inode->i_lock)
+	__acquires(&wb->list_lock)
+{
+	struct bdi_writeback *wb = inode_to_wb(inode);
+
+	spin_unlock(&inode->i_lock);
+	spin_lock(&wb->list_lock);
+	return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+	__acquires(&wb->list_lock)
+{
+	struct bdi_writeback *wb = inode_to_wb(inode);
+
+	spin_lock(&wb->list_lock);
+	return wb;
+}
+
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+	return nr_pages;
+}
+
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+				  struct wb_writeback_work *base_work,
+				  bool skip_if_busy)
+{
+	might_sleep();
+
+	if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
+		base_work->auto_free = 0;
+		base_work->single_wait = 0;
+		base_work->single_done = 0;
+		wb_queue_work(&bdi->wb, base_work);
+	}
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+			bool range_cyclic, enum wb_reason reason)
+{
+	struct wb_writeback_work *work;
+
+	if (!wb_has_dirty_io(wb))
+		return;
+
 	/*
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		trace_writeback_nowork(bdi);
-		bdi_wakeup_thread(bdi);
+		trace_writeback_nowork(wb->bdi);
+		wb_wakeup(wb);
 		return;
 	}
 
@@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
 	work->reason	= reason;
+	work->auto_free	= 1;
 
-	bdi_queue_work(bdi, work);
-}
-
-/**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
- *   started when this function returns, we make no guarantees on
- *   completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-			enum wb_reason reason)
-{
-	__bdi_start_writeback(bdi, nr_pages, true, reason);
+	wb_queue_work(wb, work);
 }
 
 /**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writback to write from
  *
  * Description:
  *   This makes sure WB_SYNC_NONE background writeback happens. When
- *   this function returns, it is only guaranteed that for given BDI
+ *   this function returns, it is only guaranteed that for given wb
  *   some IO is happening if we are over background dirty threshold.
  *   Caller need not hold sb s_umount semaphore.
  */
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
 {
 	/*
 	 * We just wake up the flusher thread. It will perform background
 	 * writeback as soon as there is no other work to do.
 	 */
-	trace_writeback_wake_background(bdi);
-	bdi_wakeup_thread(bdi);
+	trace_writeback_wake_background(wb->bdi);
+	wb_wakeup(wb);
 }
 
 /*
@@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb;
 
-	spin_lock(&bdi->wb.list_lock);
-	list_del_init(&inode->i_wb_list);
-	spin_unlock(&bdi->wb.list_lock);
+	wb = inode_to_wb_and_lock_list(inode);
+	inode_wb_list_del_locked(inode, wb);
+	spin_unlock(&wb->list_lock);
 }
 
 /*
@@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode)
  */
 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 {
-	assert_spin_locked(&wb->list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_wb_list, &wb->b_dirty);
+	inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
 }
 
 /*
@@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  */
 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	assert_spin_locked(&wb->list_lock);
-	list_move(&inode->i_wb_list, &wb->b_more_io);
+	inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
 	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
 				     EXPIRE_DIRTY_ATIME, work);
+	if (moved)
+		wb_io_lists_populated(wb);
 	trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
 		inode->dirtied_when = jiffies;
-		list_move(&inode->i_wb_list, &wb->b_dirty_time);
+		inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		list_del_init(&inode->i_wb_list);
+		inode_wb_list_del_locked(inode, wb);
 	}
 }
 
@@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
 	inode->i_state |= I_SYNC;
-	spin_unlock(&inode->i_lock);
+	wbc_attach_and_unlock_inode(wbc, inode);
 
 	ret = __writeback_single_inode(inode, wbc);
 
+	wbc_detach_inode(wbc);
 	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
 	/*
@@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * touch it. See comment above for explanation.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		list_del_init(&inode->i_wb_list);
+		inode_wb_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
@@ -624,7 +1386,7 @@ out:
 	return ret;
 }
 
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
 				 struct wb_writeback_work *work)
 {
 	long pages;
@@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
 	else {
-		pages = min(bdi->avg_write_bandwidth / 2,
-			    global_dirty_limit / DIRTY_SCOPE);
+		pages = min(wb->avg_write_bandwidth / 2,
+			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
 		pages = min(pages, work->nr_pages);
 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
 				   MIN_WRITEBACK_PAGES);
@@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb,
 			continue;
 		}
 		inode->i_state |= I_SYNC;
-		spin_unlock(&inode->i_lock);
+		wbc_attach_and_unlock_inode(&wbc, inode);
 
-		write_chunk = writeback_chunk_size(wb->bdi, work);
+		write_chunk = writeback_chunk_size(wb, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
@@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 */
 		__writeback_single_inode(inode, &wbc);
 
+		wbc_detach_inode(&wbc);
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
@@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 	return nr_pages - work.nr_pages;
 }
 
-static bool over_bground_thresh(struct backing_dev_info *bdi)
-{
-	unsigned long background_thresh, dirty_thresh;
-
-	global_dirty_limits(&background_thresh, &dirty_thresh);
-
-	if (global_page_state(NR_FILE_DIRTY) +
-	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
-		return true;
-
-	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
-				bdi_dirty_limit(bdi, background_thresh))
-		return true;
-
-	return false;
-}
-
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
-				unsigned long start_time)
-{
-	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
-}
-
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * after the other works are all done.
 		 */
 		if ((work->for_background || work->for_kupdate) &&
-		    !list_empty(&wb->bdi->work_list))
+		    !list_empty(&wb->work_list))
 			break;
 
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh(wb->bdi))
+		if (work->for_background && !wb_over_bg_thresh(wb))
 			break;
 
 		/*
@@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb,
 /*
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 {
 	struct wb_writeback_work *work = NULL;
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (!list_empty(&bdi->work_list)) {
-		work = list_entry(bdi->work_list.next,
+	spin_lock_bh(&wb->work_lock);
+	if (!list_empty(&wb->work_list)) {
+		work = list_entry(wb->work_list.next,
 				  struct wb_writeback_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
 	return work;
 }
 
@@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh(wb->bdi)) {
+	if (wb_over_bg_thresh(wb)) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
@@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
  */
 static long wb_do_writeback(struct bdi_writeback *wb)
 {
-	struct backing_dev_info *bdi = wb->bdi;
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	set_bit(BDI_writeback_running, &wb->bdi->state);
-	while ((work = get_next_work_item(bdi)) != NULL) {
+	set_bit(WB_writeback_running, &wb->state);
+	while ((work = get_next_work_item(wb)) != NULL) {
+		struct wb_completion *done = work->done;
+		bool need_wake_up = false;
 
-		trace_writeback_exec(bdi, work);
+		trace_writeback_exec(wb->bdi, work);
 
 		wrote += wb_writeback(wb, work);
 
-		/*
-		 * Notify the caller of completion if this is a synchronous
-		 * work item, otherwise just free it.
-		 */
-		if (work->done)
-			complete(work->done);
-		else
+		if (work->single_wait) {
+			WARN_ON_ONCE(work->auto_free);
+			/* paired w/ rmb in wb_wait_for_single_work() */
+			smp_wmb();
+			work->single_done = 1;
+			need_wake_up = true;
+		} else if (work->auto_free) {
 			kfree(work);
+		}
+
+		if (done && atomic_dec_and_test(&done->cnt))
+			need_wake_up = true;
+
+		if (need_wake_up)
+			wake_up_all(&wb->bdi->wb_waitq);
 	}
 
 	/*
@@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	 */
 	wrote += wb_check_old_data_flush(wb);
 	wrote += wb_check_background_flush(wb);
-	clear_bit(BDI_writeback_running, &wb->bdi->state);
+	clear_bit(WB_writeback_running, &wb->state);
 
 	return wrote;
 }
@@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * reschedules periodically and does kupdated style flushing.
  */
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(to_delayed_work(work),
 						struct bdi_writeback, dwork);
-	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
-	set_worker_desc("flush-%s", dev_name(bdi->dev));
+	set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
 	current->flags |= PF_SWAPWRITE;
 
 	if (likely(!current_is_workqueue_rescuer() ||
-		   !test_bit(BDI_registered, &bdi->state))) {
+		   !test_bit(WB_registered, &wb->state))) {
 		/*
-		 * The normal path.  Keep writing back @bdi until its
+		 * The normal path.  Keep writing back @wb until its
 		 * work_list is empty.  Note that this path is also taken
-		 * if @bdi is shutting down even when we're running off the
+		 * if @wb is shutting down even when we're running off the
 		 * rescuer as work_list needs to be drained.
 		 */
 		do {
 			pages_written = wb_do_writeback(wb);
 			trace_writeback_pages_written(pages_written);
-		} while (!list_empty(&bdi->work_list));
+		} while (!list_empty(&wb->work_list));
 	} else {
 		/*
 		 * bdi_wq can't get enough workers and we're running off
 		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
 		 * enough for efficient IO.
 		 */
-		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+		pages_written = writeback_inodes_wb(wb, 1024,
 						    WB_REASON_FORKER_THREAD);
 		trace_writeback_pages_written(pages_written);
 	}
 
-	if (!list_empty(&bdi->work_list))
+	if (!list_empty(&wb->work_list))
 		mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-		bdi_wakeup_thread_delayed(bdi);
+		wb_wakeup_delayed(wb);
 
 	current->flags &= ~PF_SWAPWRITE;
 }
@@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		struct bdi_writeback *wb;
+		struct wb_iter iter;
+
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, reason);
+
+		bdi_for_each_wb(wb, bdi, &iter, 0)
+			wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
+					   false, reason);
 	}
 	rcu_read_unlock();
 }
@@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
-		if (list_empty(&bdi->wb.b_dirty_time))
-			continue;
-		bdi_wakeup_thread(bdi);
+		struct bdi_writeback *wb;
+		struct wb_iter iter;
+
+		bdi_for_each_wb(wb, bdi, &iter, 0)
+			if (!list_empty(&bdi->wb.b_dirty_time))
+				wb_wakeup(&bdi->wb);
 	}
 	rcu_read_unlock();
 	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
-	struct backing_dev_info *bdi = NULL;
 	int dirtytime;
 
 	trace_writeback_mark_inode_dirty(inode, flags);
@@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
+		inode_attach_wb(inode, NULL);
+
 		if (flags & I_DIRTY_INODE)
 			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;
@@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			struct bdi_writeback *wb;
+			struct list_head *dirty_list;
 			bool wakeup_bdi = false;
-			bdi = inode_to_bdi(inode);
 
-			spin_unlock(&inode->i_lock);
-			spin_lock(&bdi->wb.list_lock);
-			if (bdi_cap_writeback_dirty(bdi)) {
-				WARN(!test_bit(BDI_registered, &bdi->state),
-				     "bdi-%s not registered\n", bdi->name);
+			wb = locked_inode_to_wb_and_lock_list(inode);
 
-				/*
-				 * If this is the first dirty inode for this
-				 * bdi, we have to wake-up the corresponding
-				 * bdi thread to make sure background
-				 * write-back happens later.
-				 */
-				if (!wb_has_dirty_io(&bdi->wb))
-					wakeup_bdi = true;
-			}
+			WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+			     !test_bit(WB_registered, &wb->state),
+			     "bdi-%s not registered\n", wb->bdi->name);
 
 			inode->dirtied_when = jiffies;
 			if (dirtytime)
 				inode->dirtied_time_when = jiffies;
+
 			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
-				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+				dirty_list = &wb->b_dirty;
 			else
-				list_move(&inode->i_wb_list,
-					  &bdi->wb.b_dirty_time);
-			spin_unlock(&bdi->wb.list_lock);
+				dirty_list = &wb->b_dirty_time;
+
+			wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+							       dirty_list);
+
+			spin_unlock(&wb->list_lock);
 			trace_writeback_dirty_inode_enqueue(inode);
 
-			if (wakeup_bdi)
-				bdi_wakeup_thread_delayed(bdi);
+			/*
+			 * If this is the first dirty inode for this bdi,
+			 * we have to wake-up the corresponding bdi thread
+			 * to make sure background write-back happens
+			 * later.
+			 */
+			if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+				wb_wakeup_delayed(wb);
 			return;
 		}
 	}
@@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				     enum wb_reason reason, bool skip_if_busy)
+{
+	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
+		.reason			= reason,
+	};
+	struct backing_dev_info *bdi = sb->s_bdi;
+
+	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+		return;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+	wb_wait_for_completion(bdi, &done);
+}
+
 /**
  * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 			    unsigned long nr,
 			    enum wb_reason reason)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
-	struct wb_writeback_work work = {
-		.sb			= sb,
-		.sync_mode		= WB_SYNC_NONE,
-		.tagged_writepages	= 1,
-		.done			= &done,
-		.nr_pages		= nr,
-		.reason			= reason,
-	};
-
-	if (sb->s_bdi == &noop_backing_dev_info)
-		return;
-	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-	bdi_queue_work(sb->s_bdi, &work);
-	wait_for_completion(&done);
+	__writeback_inodes_sb_nr(sb, nr, reason, false);
 }
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
@@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
-				  unsigned long nr,
-				  enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				   enum wb_reason reason)
 {
-	if (writeback_in_progress(sb->s_bdi))
-		return 1;
-
 	if (!down_read_trylock(&sb->s_umount))
-		return 0;
+		return false;
 
-	writeback_inodes_sb_nr(sb, nr, reason);
+	__writeback_inodes_sb_nr(sb, nr, reason, true);
 	up_read(&sb->s_umount);
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
 
@@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  * Implement by try_to_writeback_inodes_sb_nr()
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
 	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
@@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
+	DEFINE_WB_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
@@ -1516,14 +2273,19 @@ void sync_inodes_sb(struct super_block *sb)
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
 	};
+	struct backing_dev_info *bdi = sb->s_bdi;
 
-	/* Nothing to do? */
-	if (sb->s_bdi == &noop_backing_dev_info)
+	/*
+	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
+	 * inodes under writeback and I_DIRTY_TIME inodes ignored by
+	 * bdi_has_dirty() need to be written out too.
+	 */
+	if (bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_queue_work(sb->s_bdi, &work);
-	wait_for_completion(&done);
+	bdi_split_work_to_wbs(bdi, &work, false);
+	wb_wait_for_completion(bdi, &done);
 
 	wait_sb_inodes(sb);
 }
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 89acec742..d403c69be 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 
 object_already_extant:
 	ret = -ENOBUFS;
-	if (fscache_object_is_dead(object)) {
+	if (fscache_object_is_dying(object) ||
+	    fscache_cache_is_broken(object)) {
 		spin_unlock(&cookie->lock);
 		goto error;
 	}
@@ -671,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	if (!op)
 		return -ENOMEM;
 
-	fscache_operation_init(op, NULL, NULL);
+	fscache_operation_init(op, NULL, NULL, NULL);
 	op->flags = FSCACHE_OP_MYTHREAD |
 		(1 << FSCACHE_OP_WAITING) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -695,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	/* the work queue now carries its own ref on the object */
 	spin_unlock(&cookie->lock);
 
-	ret = fscache_wait_for_operation_activation(object, op,
-						    NULL, NULL, NULL);
+	ret = fscache_wait_for_operation_activation(object, op, NULL, NULL);
 	if (ret == 0) {
 		/* ask the cache to honour the operation */
 		ret = object->cache->ops->check_consistency(op);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 7872a62ef..97ec45110 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -124,8 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
 				       struct fscache_operation *);
 extern int fscache_submit_op(struct fscache_object *,
 			     struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *,
-			     void (*)(struct fscache_operation *));
+extern int fscache_cancel_op(struct fscache_operation *, bool);
 extern void fscache_cancel_all_ops(struct fscache_object *);
 extern void fscache_abort_object(struct fscache_object *);
 extern void fscache_start_operations(struct fscache_object *);
@@ -138,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
 extern int fscache_wait_for_operation_activation(struct fscache_object *,
 						 struct fscache_operation *,
 						 atomic_t *,
-						 atomic_t *,
-						 void (*)(struct fscache_operation *));
+						 atomic_t *);
 extern void fscache_invalidate_writes(struct fscache_cookie *);
 
 /*
@@ -164,6 +162,7 @@ extern atomic_t fscache_n_op_pend;
 extern atomic_t fscache_n_op_run;
 extern atomic_t fscache_n_op_enqueue;
 extern atomic_t fscache_n_op_deferred_release;
+extern atomic_t fscache_n_op_initialised;
 extern atomic_t fscache_n_op_release;
 extern atomic_t fscache_n_op_gc;
 extern atomic_t fscache_n_op_cancelled;
@@ -271,6 +270,11 @@ extern atomic_t fscache_n_cop_write_page;
 extern atomic_t fscache_n_cop_uncache_page;
 extern atomic_t fscache_n_cop_dissociate_pages;
 
+extern atomic_t fscache_n_cache_no_space_reject;
+extern atomic_t fscache_n_cache_stale_objects;
+extern atomic_t fscache_n_cache_retired_objects;
+extern atomic_t fscache_n_cache_culled_objects;
+
 static inline void fscache_stat(atomic_t *stat)
 {
 	atomic_inc(stat);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index da032daf0..9e792e30f 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -328,6 +328,17 @@ void fscache_object_init(struct fscache_object *object,
 EXPORT_SYMBOL(fscache_object_init);
 
 /*
+ * Mark the object as no longer being live, making sure that we synchronise
+ * against op submission.
+ */
+static inline void fscache_mark_object_dead(struct fscache_object *object)
+{
+	spin_lock(&object->lock);
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	spin_unlock(&object->lock);
+}
+
+/*
  * Abort object initialisation before we start it.
  */
 static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
@@ -610,6 +621,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object
 	object->cache->ops->lookup_complete(object);
 	fscache_stat_d(&fscache_n_cop_lookup_complete);
 
+	set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags);
+
 	cookie = object->cookie;
 	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
@@ -629,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob
 	_enter("{OBJ%x,%d,%d},%d",
 	       object->debug_id, object->n_ops, object->n_children, event);
 
-	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	fscache_mark_object_dead(object);
 	object->oob_event_mask = 0;
 
 	if (list_empty(&object->dependents) &&
@@ -948,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+	fscache_operation_init(op, object->cache->ops->invalidate_object,
+			       NULL, NULL);
 	op->flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_EXCLUSIVE) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -974,13 +988,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	return transit_to(UPDATE_OBJECT);
 
 nomem:
-	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	fscache_mark_object_dead(object);
 	fscache_unuse_cookie(object);
 	_leave(" [ENOMEM]");
 	return transit_to(KILL_OBJECT);
 
 submit_op_failed:
-	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	fscache_mark_object_dead(object);
 	spin_unlock(&cookie->lock);
 	fscache_unuse_cookie(object);
 	kfree(op);
@@ -1016,3 +1030,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object *
 	_leave("");
 	return transit_to(WAIT_FOR_CMD);
 }
+
+/**
+ * fscache_object_retrying_stale - Note retrying stale object
+ * @object: The object that will be retried
+ *
+ * Note that an object lookup found an on-disk object that was adjudged to be
+ * stale and has been deleted.  The lookup will be retried.
+ */
+void fscache_object_retrying_stale(struct fscache_object *object)
+{
+	fscache_stat(&fscache_n_cache_no_space_reject);
+}
+EXPORT_SYMBOL(fscache_object_retrying_stale);
+
+/**
+ * fscache_object_mark_killed - Note that an object was killed
+ * @object: The object that was culled
+ * @why: The reason the object was killed.
+ *
+ * Note that an object was killed.  Returns true if the object was
+ * already marked killed, false if it wasn't.
+ */
+void fscache_object_mark_killed(struct fscache_object *object,
+				enum fscache_why_object_killed why)
+{
+	if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) {
+		pr_err("Error: Object already killed by cache [%s]\n",
+		       object->cache->identifier);
+		return;
+	}
+
+	switch (why) {
+	case FSCACHE_OBJECT_NO_SPACE:
+		fscache_stat(&fscache_n_cache_no_space_reject);
+		break;
+	case FSCACHE_OBJECT_IS_STALE:
+		fscache_stat(&fscache_n_cache_stale_objects);
+		break;
+	case FSCACHE_OBJECT_WAS_RETIRED:
+		fscache_stat(&fscache_n_cache_retired_objects);
+		break;
+	case FSCACHE_OBJECT_WAS_CULLED:
+		fscache_stat(&fscache_n_cache_culled_objects);
+		break;
+	}
+}
+EXPORT_SYMBOL(fscache_object_mark_killed);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7b87a0e5..de67745e1 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -20,6 +20,35 @@
 atomic_t fscache_op_debug_id;
 EXPORT_SYMBOL(fscache_op_debug_id);
 
+static void fscache_operation_dummy_cancel(struct fscache_operation *op)
+{
+}
+
+/**
+ * fscache_operation_init - Do basic initialisation of an operation
+ * @op: The operation to initialise
+ * @release: The release function to assign
+ *
+ * Do basic initialisation of an operation.  The caller must still set flags,
+ * object and processor if needed.
+ */
+void fscache_operation_init(struct fscache_operation *op,
+			    fscache_operation_processor_t processor,
+			    fscache_operation_cancel_t cancel,
+			    fscache_operation_release_t release)
+{
+	INIT_WORK(&op->work, fscache_op_work_func);
+	atomic_set(&op->usage, 1);
+	op->state = FSCACHE_OP_ST_INITIALISED;
+	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+	op->processor = processor;
+	op->cancel = cancel ?: fscache_operation_dummy_cancel;
+	op->release = release;
+	INIT_LIST_HEAD(&op->pend_link);
+	fscache_stat(&fscache_n_op_initialised);
+}
+EXPORT_SYMBOL(fscache_operation_init);
+
 /**
  * fscache_enqueue_operation - Enqueue an operation for processing
  * @op: The operation to enqueue
@@ -76,6 +105,43 @@ static void fscache_run_op(struct fscache_object *object,
 }
 
 /*
+ * report an unexpected submission
+ */
+static void fscache_report_unexpected_submission(struct fscache_object *object,
+						 struct fscache_operation *op,
+						 const struct fscache_state *ostate)
+{
+	static bool once_only;
+	struct fscache_operation *p;
+	unsigned n;
+
+	if (once_only)
+		return;
+	once_only = true;
+
+	kdebug("unexpected submission OP%x [OBJ%x %s]",
+	       op->debug_id, object->debug_id, object->state->name);
+	kdebug("objstate=%s [%s]", object->state->name, ostate->name);
+	kdebug("objflags=%lx", object->flags);
+	kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
+	kdebug("ops=%u inp=%u exc=%u",
+	       object->n_ops, object->n_in_progress, object->n_exclusive);
+
+	if (!list_empty(&object->pending_ops)) {
+		n = 0;
+		list_for_each_entry(p, &object->pending_ops, pend_link) {
+			ASSERTCMP(p->object, ==, object);
+			kdebug("%p %p", op->processor, op->release);
+			n++;
+		}
+
+		kdebug("n=%u", n);
+	}
+
+	dump_stack();
+}
+
+/*
  * submit an exclusive operation for an object
  * - other ops are excluded from running simultaneously with this one
  * - this gets any extra refs it needs on an op
@@ -83,6 +149,8 @@ static void fscache_run_op(struct fscache_object *object,
 int fscache_submit_exclusive_op(struct fscache_object *object,
 				struct fscache_operation *op)
 {
+	const struct fscache_state *ostate;
+	unsigned long flags;
 	int ret;
 
 	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
@@ -95,8 +163,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
 	ASSERT(list_empty(&op->pend_link));
 
+	ostate = object->state;
+	smp_rmb();
+
 	op->state = FSCACHE_OP_ST_PENDING;
-	if (fscache_object_is_active(object)) {
+	flags = READ_ONCE(object->flags);
+	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -EIO;
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -118,7 +199,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		/* need to issue a new write op after this */
 		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 		ret = 0;
-	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -126,12 +207,15 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
+	} else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
 	} else {
-		/* If we're in any other state, there must have been an I/O
-		 * error of some nature.
-		 */
-		ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
-		ret = -EIO;
+		fscache_report_unexpected_submission(object, op, ostate);
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
 	}
 
 	spin_unlock(&object->lock);
@@ -139,43 +223,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 }
 
 /*
- * report an unexpected submission
- */
-static void fscache_report_unexpected_submission(struct fscache_object *object,
-						 struct fscache_operation *op,
-						 const struct fscache_state *ostate)
-{
-	static bool once_only;
-	struct fscache_operation *p;
-	unsigned n;
-
-	if (once_only)
-		return;
-	once_only = true;
-
-	kdebug("unexpected submission OP%x [OBJ%x %s]",
-	       op->debug_id, object->debug_id, object->state->name);
-	kdebug("objstate=%s [%s]", object->state->name, ostate->name);
-	kdebug("objflags=%lx", object->flags);
-	kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
-	kdebug("ops=%u inp=%u exc=%u",
-	       object->n_ops, object->n_in_progress, object->n_exclusive);
-
-	if (!list_empty(&object->pending_ops)) {
-		n = 0;
-		list_for_each_entry(p, &object->pending_ops, pend_link) {
-			ASSERTCMP(p->object, ==, object);
-			kdebug("%p %p", op->processor, op->release);
-			n++;
-		}
-
-		kdebug("n=%u", n);
-	}
-
-	dump_stack();
-}
-
-/*
  * submit an operation for an object
  * - objects may be submitted only in the following states:
  *   - during object creation (write ops may be submitted)
@@ -187,6 +234,7 @@ int fscache_submit_op(struct fscache_object *object,
 		      struct fscache_operation *op)
 {
 	const struct fscache_state *ostate;
+	unsigned long flags;
 	int ret;
 
 	_enter("{OBJ%x OP%x},{%u}",
@@ -204,7 +252,17 @@ int fscache_submit_op(struct fscache_object *object,
 	smp_rmb();
 
 	op->state = FSCACHE_OP_ST_PENDING;
-	if (fscache_object_is_active(object)) {
+	flags = READ_ONCE(object->flags);
+	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -EIO;
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
 		op->object = object;
 		object->n_ops++;
 
@@ -222,23 +280,21 @@ int fscache_submit_op(struct fscache_object *object,
 			fscache_run_op(object, op);
 		}
 		ret = 0;
-	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
 		op->object = object;
 		object->n_ops++;
 		atomic_inc(&op->usage);
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
-	} else if (fscache_object_is_dying(object)) {
-		fscache_stat(&fscache_n_op_rejected);
+	} else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
-	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
 		ASSERT(!fscache_object_is_active(object));
-		op->state = FSCACHE_OP_ST_CANCELLED;
-		ret = -ENOBUFS;
-	} else {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	}
@@ -293,9 +349,10 @@ void fscache_start_operations(struct fscache_object *object)
  * cancel an operation that's pending on an object
  */
 int fscache_cancel_op(struct fscache_operation *op,
-		      void (*do_cancel)(struct fscache_operation *))
+		      bool cancel_in_progress_op)
 {
 	struct fscache_object *object = op->object;
+	bool put = false;
 	int ret;
 
 	_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
@@ -309,19 +366,37 @@ int fscache_cancel_op(struct fscache_operation *op,
 	ret = -EBUSY;
 	if (op->state == FSCACHE_OP_ST_PENDING) {
 		ASSERT(!list_empty(&op->pend_link));
-		fscache_stat(&fscache_n_op_cancelled);
 		list_del_init(&op->pend_link);
-		if (do_cancel)
-			do_cancel(op);
+		put = true;
+
+		fscache_stat(&fscache_n_op_cancelled);
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+			object->n_exclusive--;
+		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+		ret = 0;
+	} else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) {
+		ASSERTCMP(object->n_in_progress, >, 0);
+		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+			object->n_exclusive--;
+		object->n_in_progress--;
+		if (object->n_in_progress == 0)
+			fscache_start_operations(object);
+
+		fscache_stat(&fscache_n_op_cancelled);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 			object->n_exclusive--;
 		if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
 			wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
-		fscache_put_operation(op);
 		ret = 0;
 	}
 
+	if (put)
+		fscache_put_operation(op);
 	spin_unlock(&object->lock);
 	_leave(" = %d", ret);
 	return ret;
@@ -345,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object)
 		list_del_init(&op->pend_link);
 
 		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
@@ -377,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled)
 
 	spin_lock(&object->lock);
 
-	op->state = cancelled ?
-		FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+	if (!cancelled) {
+		op->state = FSCACHE_OP_ST_COMPLETE;
+	} else {
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+	}
 
 	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 		object->n_exclusive--;
@@ -409,9 +489,9 @@ void fscache_put_operation(struct fscache_operation *op)
 		return;
 
 	_debug("PUT OP");
-	ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
+	ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
+		    op->state != FSCACHE_OP_ST_COMPLETE,
 		    op->state, ==, FSCACHE_OP_ST_CANCELLED);
-	op->state = FSCACHE_OP_ST_DEAD;
 
 	fscache_stat(&fscache_n_op_release);
 
@@ -419,37 +499,39 @@ void fscache_put_operation(struct fscache_operation *op)
 		op->release(op);
 		op->release = NULL;
 	}
+	op->state = FSCACHE_OP_ST_DEAD;
 
 	object = op->object;
+	if (likely(object)) {
+		if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+			atomic_dec(&object->n_reads);
+		if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+			fscache_unuse_cookie(object);
+
+		/* now... we may get called with the object spinlock held, so we
+		 * complete the cleanup here only if we can immediately acquire the
+		 * lock, and defer it otherwise */
+		if (!spin_trylock(&object->lock)) {
+			_debug("defer put");
+			fscache_stat(&fscache_n_op_deferred_release);
+
+			cache = object->cache;
+			spin_lock(&cache->op_gc_list_lock);
+			list_add_tail(&op->pend_link, &cache->op_gc_list);
+			spin_unlock(&cache->op_gc_list_lock);
+			schedule_work(&cache->op_gc);
+			_leave(" [defer]");
+			return;
+		}
 
-	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
-		atomic_dec(&object->n_reads);
-	if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
-		fscache_unuse_cookie(object);
-
-	/* now... we may get called with the object spinlock held, so we
-	 * complete the cleanup here only if we can immediately acquire the
-	 * lock, and defer it otherwise */
-	if (!spin_trylock(&object->lock)) {
-		_debug("defer put");
-		fscache_stat(&fscache_n_op_deferred_release);
+		ASSERTCMP(object->n_ops, >, 0);
+		object->n_ops--;
+		if (object->n_ops == 0)
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
 
-		cache = object->cache;
-		spin_lock(&cache->op_gc_list_lock);
-		list_add_tail(&op->pend_link, &cache->op_gc_list);
-		spin_unlock(&cache->op_gc_list_lock);
-		schedule_work(&cache->op_gc);
-		_leave(" [defer]");
-		return;
+		spin_unlock(&object->lock);
 	}
 
-	ASSERTCMP(object->n_ops, >, 0);
-	object->n_ops--;
-	if (object->n_ops == 0)
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
-
-	spin_unlock(&object->lock);
-
 	kfree(op);
 	_leave(" [done]");
 }
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index de33b3fcc..483bbc613 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL);
 	op->flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_EXCLUSIVE) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -239,7 +239,7 @@ nobufs_dec:
 	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs:
 	spin_unlock(&cookie->lock);
-	kfree(op);
+	fscache_put_operation(op);
 	if (wake_cookie)
 		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_attr_changed_nobufs);
@@ -249,6 +249,17 @@ nobufs:
 EXPORT_SYMBOL(__fscache_attr_changed);
 
 /*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+	struct fscache_retrieval *op =
+		container_of(_op, struct fscache_retrieval, op);
+
+	atomic_set(&op->n_pages, 0);
+}
+
+/*
  * release a retrieval op reference
  */
 static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -258,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
 
 	_enter("{OP%x}", op->op.debug_id);
 
-	ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
+	ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED,
+		    atomic_read(&op->n_pages), ==, 0);
 
 	fscache_hist(fscache_retrieval_histogram, op->start_time);
 	if (op->context)
-		fscache_put_context(op->op.object->cookie, op->context);
+		fscache_put_context(op->cookie, op->context);
 
 	_leave("");
 }
@@ -285,15 +297,24 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL,
+			       fscache_do_cancel_retrieval,
+			       fscache_release_retrieval_op);
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
+	op->cookie	= cookie;
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
 	op->start_time	= jiffies;
 	INIT_LIST_HEAD(&op->to_do);
+
+	/* Pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure.
+	 */
+	if (context)
+		fscache_get_context(op->cookie, context);
 	return op;
 }
 
@@ -330,24 +351,12 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 }
 
 /*
- * Handle cancellation of a pending retrieval op
- */
-static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
-{
-	struct fscache_retrieval *op =
-		container_of(_op, struct fscache_retrieval, op);
-
-	atomic_set(&op->n_pages, 0);
-}
-
-/*
  * wait for an object to become active (or dead)
  */
 int fscache_wait_for_operation_activation(struct fscache_object *object,
 					  struct fscache_operation *op,
 					  atomic_t *stat_op_waits,
-					  atomic_t *stat_object_dead,
-					  void (*do_cancel)(struct fscache_operation *))
+					  atomic_t *stat_object_dead)
 {
 	int ret;
 
@@ -359,7 +368,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
 		fscache_stat(stat_op_waits);
 	if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
 			TASK_INTERRUPTIBLE) != 0) {
-		ret = fscache_cancel_op(op, do_cancel);
+		ret = fscache_cancel_op(op, false);
 		if (ret == 0)
 			return -ERESTARTSYS;
 
@@ -377,11 +386,13 @@ check_if_dead:
 		_leave(" = -ENOBUFS [cancelled]");
 		return -ENOBUFS;
 	}
-	if (unlikely(fscache_object_is_dead(object))) {
-		pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
-		fscache_cancel_op(op, do_cancel);
+	if (unlikely(fscache_object_is_dying(object) ||
+		     fscache_cache_is_broken(object))) {
+		enum fscache_operation_state state = op->state;
+		fscache_cancel_op(op, true);
 		if (stat_object_dead)
 			fscache_stat(stat_object_dead);
+		_leave(" = -ENOBUFS [obj dead %d]", state);
 		return -ENOBUFS;
 	}
 	return 0;
@@ -453,17 +464,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_retrieval_ops);
 
-	/* pin the netfs read context in case we need to do the actual netfs
-	 * read because we've encountered a cache read failure */
-	fscache_get_context(object->cookie, op->context);
-
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_retrieval_op_waits),
-		__fscache_stat(&fscache_n_retrievals_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_retrievals_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -503,7 +509,7 @@ nobufs_unlock:
 	spin_unlock(&cookie->lock);
 	if (wake_cookie)
 		__fscache_wake_unused_cookie(cookie);
-	kfree(op);
+	fscache_put_retrieval(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
 	_leave(" = -ENOBUFS");
@@ -584,17 +590,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_retrieval_ops);
 
-	/* pin the netfs read context in case we need to do the actual netfs
-	 * read because we've encountered a cache read failure */
-	fscache_get_context(object->cookie, op->context);
-
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_retrieval_op_waits),
-		__fscache_stat(&fscache_n_retrievals_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_retrievals_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -632,7 +633,7 @@ nobufs_unlock_dec:
 	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	kfree(op);
+	fscache_put_retrieval(op);
 	if (wake_cookie)
 		__fscache_wake_unused_cookie(cookie);
 nobufs:
@@ -700,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_alloc_op_waits),
-		__fscache_stat(&fscache_n_allocs_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_allocs_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -726,7 +726,7 @@ nobufs_unlock_dec:
 	wake_cookie = __fscache_unuse_cookie(cookie);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
-	kfree(op);
+	fscache_put_retrieval(op);
 	if (wake_cookie)
 		__fscache_wake_unused_cookie(cookie);
 nobufs:
@@ -944,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_write_op,
+	fscache_operation_init(&op->op, fscache_write_op, NULL,
 			       fscache_release_write_op);
 	op->op.flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_WAITING) |
@@ -1016,7 +1016,7 @@ already_pending:
 	spin_unlock(&object->lock);
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
-	kfree(op);
+	fscache_put_operation(&op->op);
 	fscache_stat(&fscache_n_stores_ok);
 	_leave(" = 0");
 	return 0;
@@ -1036,7 +1036,7 @@ nobufs_unlock_obj:
 nobufs:
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
-	kfree(op);
+	fscache_put_operation(&op->op);
 	if (wake_cookie)
 		__fscache_wake_unused_cookie(cookie);
 	fscache_stat(&fscache_n_stores_nobufs);
@@ -1044,7 +1044,7 @@ nobufs:
 	return -ENOBUFS;
 
 nomem_free:
-	kfree(op);
+	fscache_put_operation(&op->op);
 nomem:
 	fscache_stat(&fscache_n_stores_oom);
 	_leave(" = -ENOMEM");
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 40d13c70e..7cfa0aacd 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -23,6 +23,7 @@ atomic_t fscache_n_op_run;
 atomic_t fscache_n_op_enqueue;
 atomic_t fscache_n_op_requeue;
 atomic_t fscache_n_op_deferred_release;
+atomic_t fscache_n_op_initialised;
 atomic_t fscache_n_op_release;
 atomic_t fscache_n_op_gc;
 atomic_t fscache_n_op_cancelled;
@@ -130,6 +131,11 @@ atomic_t fscache_n_cop_write_page;
 atomic_t fscache_n_cop_uncache_page;
 atomic_t fscache_n_cop_dissociate_pages;
 
+atomic_t fscache_n_cache_no_space_reject;
+atomic_t fscache_n_cache_stale_objects;
+atomic_t fscache_n_cache_retired_objects;
+atomic_t fscache_n_cache_culled_objects;
+
 /*
  * display the general statistics
  */
@@ -246,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_op_enqueue),
 		   atomic_read(&fscache_n_op_cancelled),
 		   atomic_read(&fscache_n_op_rejected));
-	seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
+	seq_printf(m, "Ops    : ini=%u dfr=%u rel=%u gc=%u\n",
+		   atomic_read(&fscache_n_op_initialised),
 		   atomic_read(&fscache_n_op_deferred_release),
 		   atomic_read(&fscache_n_op_release),
 		   atomic_read(&fscache_n_op_gc));
@@ -271,6 +278,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_cop_write_page),
 		   atomic_read(&fscache_n_cop_uncache_page),
 		   atomic_read(&fscache_n_cop_dissociate_pages));
+	seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n",
+		   atomic_read(&fscache_n_cache_no_space_reject),
+		   atomic_read(&fscache_n_cache_stale_objects),
+		   atomic_read(&fscache_n_cache_retired_objects),
+		   atomic_read(&fscache_n_cache_culled_objects));
 	return 0;
 }
 
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e5bbf748b..eae2c1126 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -489,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
  */
 static int cuse_channel_open(struct inode *inode, struct file *file)
 {
+	struct fuse_dev *fud;
 	struct cuse_conn *cc;
 	int rc;
 
@@ -499,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
 
 	fuse_conn_init(&cc->fc);
 
+	fud = fuse_dev_alloc(&cc->fc);
+	if (!fud) {
+		kfree(cc);
+		return -ENOMEM;
+	}
+
 	INIT_LIST_HEAD(&cc->list);
 	cc->fc.release = cuse_fc_release;
 
-	cc->fc.connected = 1;
 	cc->fc.initialized = 1;
 	rc = cuse_send_init(cc);
 	if (rc) {
-		fuse_conn_put(&cc->fc);
+		fuse_dev_free(fud);
 		return rc;
 	}
-	file->private_data = &cc->fc;	/* channel owns base reference to cc */
+	file->private_data = fud;
 
 	return 0;
 }
@@ -527,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
  */
 static int cuse_channel_release(struct inode *inode, struct file *file)
 {
-	struct cuse_conn *cc = fc_to_cc(file->private_data);
+	struct fuse_dev *fud = file->private_data;
+	struct cuse_conn *cc = fc_to_cc(fud->fc);
 	int rc;
 
 	/* remove from the conntbl, no more access from this point on */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c8b68ab2e..ebb5e3745 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,13 +25,13 @@ MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
 
-static struct fuse_conn *fuse_get_conn(struct file *file)
+static struct fuse_dev *fuse_get_dev(struct file *file)
 {
 	/*
 	 * Lockless access is OK, because file->private data is set
 	 * once during mount and is valid until the file is released.
 	 */
-	return file->private_data;
+	return ACCESS_ONCE(file->private_data);
 }
 
 static void fuse_request_init(struct fuse_req *req, struct page **pages,
@@ -48,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
 	req->pages = pages;
 	req->page_descs = page_descs;
 	req->max_pages = npages;
+	__set_bit(FR_PENDING, &req->flags);
 }
 
 static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
@@ -168,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
 	if (!fc->connected)
 		goto out;
 
+	err = -ECONNREFUSED;
+	if (fc->conn_error)
+		goto out;
+
 	req = fuse_request_alloc(npages);
 	err = -ENOMEM;
 	if (!req) {
@@ -177,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
 	}
 
 	fuse_req_init_context(req);
-	req->waiting = 1;
-	req->background = for_background;
+	__set_bit(FR_WAITING, &req->flags);
+	if (for_background)
+		__set_bit(FR_BACKGROUND, &req->flags);
+
 	return req;
 
  out:
@@ -268,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
 		req = get_reserved_req(fc, file);
 
 	fuse_req_init_context(req);
-	req->waiting = 1;
-	req->background = 0;
+	__set_bit(FR_WAITING, &req->flags);
+	__clear_bit(FR_BACKGROUND, &req->flags);
 	return req;
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
-		if (unlikely(req->background)) {
+		if (test_bit(FR_BACKGROUND, &req->flags)) {
 			/*
 			 * We get here in the unlikely case that a background
 			 * request was allocated but not sent
@@ -287,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 			spin_unlock(&fc->lock);
 		}
 
-		if (req->waiting)
+		if (test_bit(FR_WAITING, &req->flags)) {
+			__clear_bit(FR_WAITING, &req->flags);
 			atomic_dec(&fc->num_waiting);
+		}
 
 		if (req->stolen_file)
 			put_reserved_req(fc, req);
@@ -309,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 	return nbytes;
 }
 
-static u64 fuse_get_unique(struct fuse_conn *fc)
+static u64 fuse_get_unique(struct fuse_iqueue *fiq)
 {
-	fc->reqctr++;
-	/* zero is special */
-	if (fc->reqctr == 0)
-		fc->reqctr = 1;
-
-	return fc->reqctr;
+	return ++fiq->reqctr;
 }
 
-static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
-	list_add_tail(&req->list, &fc->pending);
-	req->state = FUSE_REQ_PENDING;
-	if (!req->waiting) {
-		req->waiting = 1;
-		atomic_inc(&fc->num_waiting);
-	}
-	wake_up(&fc->waitq);
-	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	list_add_tail(&req->list, &fiq->pending);
+	wake_up_locked(&fiq->waitq);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 		       u64 nodeid, u64 nlookup)
 {
+	struct fuse_iqueue *fiq = &fc->iq;
+
 	forget->forget_one.nodeid = nodeid;
 	forget->forget_one.nlookup = nlookup;
 
-	spin_lock(&fc->lock);
-	if (fc->connected) {
-		fc->forget_list_tail->next = forget;
-		fc->forget_list_tail = forget;
-		wake_up(&fc->waitq);
-		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	spin_lock(&fiq->waitq.lock);
+	if (fiq->connected) {
+		fiq->forget_list_tail->next = forget;
+		fiq->forget_list_tail = forget;
+		wake_up_locked(&fiq->waitq);
+		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 	} else {
 		kfree(forget);
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 }
 
 static void flush_bg_queue(struct fuse_conn *fc)
@@ -356,12 +357,15 @@ static void flush_bg_queue(struct fuse_conn *fc)
 	while (fc->active_background < fc->max_background &&
 	       !list_empty(&fc->bg_queue)) {
 		struct fuse_req *req;
+		struct fuse_iqueue *fiq = &fc->iq;
 
 		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
 		list_del(&req->list);
 		fc->active_background++;
-		req->in.h.unique = fuse_get_unique(fc);
-		queue_request(fc, req);
+		spin_lock(&fiq->waitq.lock);
+		req->in.h.unique = fuse_get_unique(fiq);
+		queue_request(fiq, req);
+		spin_unlock(&fiq->waitq.lock);
 	}
 }
 
@@ -372,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * was closed.  The requester thread is woken up (if still waiting),
  * the 'end' callback is called if given, else the reference to the
  * request is released
- *
- * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
 {
-	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-	req->end = NULL;
-	list_del(&req->list);
-	list_del(&req->intr_entry);
-	req->state = FUSE_REQ_FINISHED;
-	if (req->background) {
-		req->background = 0;
+	struct fuse_iqueue *fiq = &fc->iq;
+
+	if (test_and_set_bit(FR_FINISHED, &req->flags))
+		return;
 
+	spin_lock(&fiq->waitq.lock);
+	list_del_init(&req->intr_entry);
+	spin_unlock(&fiq->waitq.lock);
+	WARN_ON(test_bit(FR_PENDING, &req->flags));
+	WARN_ON(test_bit(FR_SENT, &req->flags));
+	if (test_bit(FR_BACKGROUND, &req->flags)) {
+		spin_lock(&fc->lock);
+		clear_bit(FR_BACKGROUND, &req->flags);
 		if (fc->num_background == fc->max_background)
 			fc->blocked = 0;
 
@@ -401,122 +407,105 @@ __releases(fc->lock)
 		fc->num_background--;
 		fc->active_background--;
 		flush_bg_queue(fc);
+		spin_unlock(&fc->lock);
 	}
-	spin_unlock(&fc->lock);
 	wake_up(&req->waitq);
-	if (end)
-		end(fc, req);
+	if (req->end)
+		req->end(fc, req);
 	fuse_put_request(fc, req);
 }
 
-static void wait_answer_interruptible(struct fuse_conn *fc,
-				      struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	if (signal_pending(current))
-		return;
-
-	spin_unlock(&fc->lock);
-	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
-	spin_lock(&fc->lock);
-}
-
-static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
-	list_add_tail(&req->intr_entry, &fc->interrupts);
-	wake_up(&fc->waitq);
-	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+	spin_lock(&fiq->waitq.lock);
+	if (list_empty(&req->intr_entry)) {
+		list_add_tail(&req->intr_entry, &fiq->interrupts);
+		wake_up_locked(&fiq->waitq);
+	}
+	spin_unlock(&fiq->waitq.lock);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
 {
+	struct fuse_iqueue *fiq = &fc->iq;
+	int err;
+
 	if (!fc->no_interrupt) {
 		/* Any signal may interrupt this */
-		wait_answer_interruptible(fc, req);
-
-		if (req->aborted)
-			goto aborted;
-		if (req->state == FUSE_REQ_FINISHED)
+		err = wait_event_interruptible(req->waitq,
+					test_bit(FR_FINISHED, &req->flags));
+		if (!err)
 			return;
 
-		req->interrupted = 1;
-		if (req->state == FUSE_REQ_SENT)
-			queue_interrupt(fc, req);
+		set_bit(FR_INTERRUPTED, &req->flags);
+		/* matches barrier in fuse_dev_do_read() */
+		smp_mb__after_atomic();
+		if (test_bit(FR_SENT, &req->flags))
+			queue_interrupt(fiq, req);
 	}
 
-	if (!req->force) {
+	if (!test_bit(FR_FORCE, &req->flags)) {
 		sigset_t oldset;
 
 		/* Only fatal signals may interrupt this */
 		block_sigs(&oldset);
-		wait_answer_interruptible(fc, req);
+		err = wait_event_interruptible(req->waitq,
+					test_bit(FR_FINISHED, &req->flags));
 		restore_sigs(&oldset);
 
-		if (req->aborted)
-			goto aborted;
-		if (req->state == FUSE_REQ_FINISHED)
+		if (!err)
 			return;
 
+		spin_lock(&fiq->waitq.lock);
 		/* Request is not yet in userspace, bail out */
-		if (req->state == FUSE_REQ_PENDING) {
+		if (test_bit(FR_PENDING, &req->flags)) {
 			list_del(&req->list);
+			spin_unlock(&fiq->waitq.lock);
 			__fuse_put_request(req);
 			req->out.h.error = -EINTR;
 			return;
 		}
+		spin_unlock(&fiq->waitq.lock);
 	}
 
 	/*
 	 * Either request is already in userspace, or it was forced.
 	 * Wait it out.
 	 */
-	spin_unlock(&fc->lock);
-	wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
-	spin_lock(&fc->lock);
-
-	if (!req->aborted)
-		return;
-
- aborted:
-	BUG_ON(req->state != FUSE_REQ_FINISHED);
-	if (req->locked) {
-		/* This is uninterruptible sleep, because data is
-		   being copied to/from the buffers of req.  During
-		   locked state, there mustn't be any filesystem
-		   operation (e.g. page fault), since that could lead
-		   to deadlock */
-		spin_unlock(&fc->lock);
-		wait_event(req->waitq, !req->locked);
-		spin_lock(&fc->lock);
-	}
+	wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
 }
 
 static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-	BUG_ON(req->background);
-	spin_lock(&fc->lock);
-	if (!fc->connected)
+	struct fuse_iqueue *fiq = &fc->iq;
+
+	BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
+	spin_lock(&fiq->waitq.lock);
+	if (!fiq->connected) {
+		spin_unlock(&fiq->waitq.lock);
 		req->out.h.error = -ENOTCONN;
-	else if (fc->conn_error)
-		req->out.h.error = -ECONNREFUSED;
-	else {
-		req->in.h.unique = fuse_get_unique(fc);
-		queue_request(fc, req);
+	} else {
+		req->in.h.unique = fuse_get_unique(fiq);
+		queue_request(fiq, req);
 		/* acquire extra reference, since request is still needed
 		   after request_end() */
 		__fuse_get_request(req);
+		spin_unlock(&fiq->waitq.lock);
 
 		request_wait_answer(fc, req);
+		/* Pairs with smp_wmb() in request_end() */
+		smp_rmb();
 	}
-	spin_unlock(&fc->lock);
 }
 
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->isreply = 1;
+	__set_bit(FR_ISREPLY, &req->flags);
+	if (!test_bit(FR_WAITING, &req->flags)) {
+		__set_bit(FR_WAITING, &req->flags);
+		atomic_inc(&fc->num_waiting);
+	}
 	__fuse_request_send(fc, req);
 }
 EXPORT_SYMBOL_GPL(fuse_request_send);
@@ -586,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
 	return ret;
 }
 
-static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
-					    struct fuse_req *req)
+/*
+ * Called under fc->lock
+ *
+ * fc->connected must have been checked previously
+ */
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req)
 {
-	BUG_ON(!req->background);
+	BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+	if (!test_bit(FR_WAITING, &req->flags)) {
+		__set_bit(FR_WAITING, &req->flags);
+		atomic_inc(&fc->num_waiting);
+	}
+	__set_bit(FR_ISREPLY, &req->flags);
 	fc->num_background++;
 	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
@@ -602,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 	flush_bg_queue(fc);
 }
 
-static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
+	BUG_ON(!req->end);
 	spin_lock(&fc->lock);
 	if (fc->connected) {
-		fuse_request_send_nowait_locked(fc, req);
+		fuse_request_send_background_locked(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
+		spin_unlock(&fc->lock);
 		req->out.h.error = -ENOTCONN;
-		request_end(fc, req);
+		req->end(fc, req);
+		fuse_put_request(fc, req);
 	}
 }
-
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
-{
-	req->isreply = 1;
-	fuse_request_send_nowait(fc, req);
-}
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
 static int fuse_request_send_notify_reply(struct fuse_conn *fc,
 					  struct fuse_req *req, u64 unique)
 {
 	int err = -ENODEV;
+	struct fuse_iqueue *fiq = &fc->iq;
 
-	req->isreply = 0;
+	__clear_bit(FR_ISREPLY, &req->flags);
 	req->in.h.unique = unique;
-	spin_lock(&fc->lock);
-	if (fc->connected) {
-		queue_request(fc, req);
+	spin_lock(&fiq->waitq.lock);
+	if (fiq->connected) {
+		queue_request(fiq, req);
 		err = 0;
 	}
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 
 	return err;
 }
 
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
-					 struct fuse_req *req)
-{
-	req->isreply = 1;
-	fuse_request_send_nowait_locked(fc, req);
-}
-
 void fuse_force_forget(struct file *file, u64 nodeid)
 {
 	struct inode *inode = file_inode(file);
@@ -665,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	req->isreply = 0;
+	__clear_bit(FR_ISREPLY, &req->flags);
 	__fuse_request_send(fc, req);
 	/* ignore errors */
 	fuse_put_request(fc, req);
@@ -676,38 +661,39 @@ void fuse_force_forget(struct file *file, u64 nodeid)
  * anything that could cause a page-fault.  If the request was already
  * aborted bail out.
  */
-static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int lock_request(struct fuse_req *req)
 {
 	int err = 0;
 	if (req) {
-		spin_lock(&fc->lock);
-		if (req->aborted)
+		spin_lock(&req->waitq.lock);
+		if (test_bit(FR_ABORTED, &req->flags))
 			err = -ENOENT;
 		else
-			req->locked = 1;
-		spin_unlock(&fc->lock);
+			set_bit(FR_LOCKED, &req->flags);
+		spin_unlock(&req->waitq.lock);
 	}
 	return err;
 }
 
 /*
- * Unlock request.  If it was aborted during being locked, the
- * requester thread is currently waiting for it to be unlocked, so
- * wake it up.
+ * Unlock request.  If it was aborted while locked, caller is responsible
+ * for unlocking and ending the request.
  */
-static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int unlock_request(struct fuse_req *req)
 {
+	int err = 0;
 	if (req) {
-		spin_lock(&fc->lock);
-		req->locked = 0;
-		if (req->aborted)
-			wake_up(&req->waitq);
-		spin_unlock(&fc->lock);
+		spin_lock(&req->waitq.lock);
+		if (test_bit(FR_ABORTED, &req->flags))
+			err = -ENOENT;
+		else
+			clear_bit(FR_LOCKED, &req->flags);
+		spin_unlock(&req->waitq.lock);
 	}
+	return err;
 }
 
 struct fuse_copy_state {
-	struct fuse_conn *fc;
 	int write;
 	struct fuse_req *req;
 	struct iov_iter *iter;
@@ -721,13 +707,10 @@ struct fuse_copy_state {
 	unsigned move_pages:1;
 };
 
-static void fuse_copy_init(struct fuse_copy_state *cs,
-			   struct fuse_conn *fc,
-			   int write,
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
 			   struct iov_iter *iter)
 {
 	memset(cs, 0, sizeof(*cs));
-	cs->fc = fc;
 	cs->write = write;
 	cs->iter = iter;
 }
@@ -760,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	struct page *page;
 	int err;
 
-	unlock_request(cs->fc, cs->req);
+	err = unlock_request(cs->req);
+	if (err)
+		return err;
+
 	fuse_copy_finish(cs);
 	if (cs->pipebufs) {
 		struct pipe_buffer *buf = cs->pipebufs;
@@ -809,7 +795,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		iov_iter_advance(cs->iter, err);
 	}
 
-	return lock_request(cs->fc, cs->req);
+	return lock_request(cs->req);
 }
 
 /* Do as much copy to/from userspace buffer as we can */
@@ -860,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 	struct page *newpage;
 	struct pipe_buffer *buf = cs->pipebufs;
 
-	unlock_request(cs->fc, cs->req);
+	err = unlock_request(cs->req);
+	if (err)
+		return err;
+
 	fuse_copy_finish(cs);
 
 	err = buf->ops->confirm(cs->pipe, buf);
@@ -914,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 		lru_cache_add_file(newpage);
 
 	err = 0;
-	spin_lock(&cs->fc->lock);
-	if (cs->req->aborted)
+	spin_lock(&cs->req->waitq.lock);
+	if (test_bit(FR_ABORTED, &cs->req->flags))
 		err = -ENOENT;
 	else
 		*pagep = newpage;
-	spin_unlock(&cs->fc->lock);
+	spin_unlock(&cs->req->waitq.lock);
 
 	if (err) {
 		unlock_page(newpage);
@@ -939,7 +928,7 @@ out_fallback:
 	cs->pg = buf->page;
 	cs->offset = buf->offset;
 
-	err = lock_request(cs->fc, cs->req);
+	err = lock_request(cs->req);
 	if (err)
 		return err;
 
@@ -950,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
 			 unsigned offset, unsigned count)
 {
 	struct pipe_buffer *buf;
+	int err;
 
 	if (cs->nr_segs == cs->pipe->buffers)
 		return -EIO;
 
-	unlock_request(cs->fc, cs->req);
+	err = unlock_request(cs->req);
+	if (err)
+		return err;
+
 	fuse_copy_finish(cs);
 
 	buf = cs->pipebufs;
@@ -1065,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 	return err;
 }
 
-static int forget_pending(struct fuse_conn *fc)
+static int forget_pending(struct fuse_iqueue *fiq)
 {
-	return fc->forget_list_head.next != NULL;
+	return fiq->forget_list_head.next != NULL;
 }
 
-static int request_pending(struct fuse_conn *fc)
+static int request_pending(struct fuse_iqueue *fiq)
 {
-	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
-		forget_pending(fc);
-}
-
-/* Wait until a request is available on the pending list */
-static void request_wait(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue_exclusive(&fc->waitq, &wait);
-	while (fc->connected && !request_pending(fc)) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (signal_pending(current))
-			break;
-
-		spin_unlock(&fc->lock);
-		schedule();
-		spin_lock(&fc->lock);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&fc->waitq, &wait);
+	return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) ||
+		forget_pending(fiq);
 }
 
 /*
@@ -1103,11 +1075,12 @@ __acquires(fc->lock)
  * Unlike other requests this is assembled on demand, without a need
  * to allocate a separate fuse_req structure.
  *
- * Called with fc->lock held, releases it
+ * Called with fiq->waitq.lock held, releases it
  */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_interrupt(struct fuse_iqueue *fiq,
+			       struct fuse_copy_state *cs,
 			       size_t nbytes, struct fuse_req *req)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
 	struct fuse_in_header ih;
 	struct fuse_interrupt_in arg;
@@ -1115,7 +1088,7 @@ __releases(fc->lock)
 	int err;
 
 	list_del_init(&req->intr_entry);
-	req->intr_unique = fuse_get_unique(fc);
+	req->intr_unique = fuse_get_unique(fiq);
 	memset(&ih, 0, sizeof(ih));
 	memset(&arg, 0, sizeof(arg));
 	ih.len = reqsize;
@@ -1123,7 +1096,7 @@ __releases(fc->lock)
 	ih.unique = req->intr_unique;
 	arg.unique = req->in.h.unique;
 
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 	if (nbytes < reqsize)
 		return -EINVAL;
 
@@ -1135,21 +1108,21 @@ __releases(fc->lock)
 	return err ? err : reqsize;
 }
 
-static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
 					       unsigned max,
 					       unsigned *countp)
 {
-	struct fuse_forget_link *head = fc->forget_list_head.next;
+	struct fuse_forget_link *head = fiq->forget_list_head.next;
 	struct fuse_forget_link **newhead = &head;
 	unsigned count;
 
 	for (count = 0; *newhead != NULL && count < max; count++)
 		newhead = &(*newhead)->next;
 
-	fc->forget_list_head.next = *newhead;
+	fiq->forget_list_head.next = *newhead;
 	*newhead = NULL;
-	if (fc->forget_list_head.next == NULL)
-		fc->forget_list_tail = &fc->forget_list_head;
+	if (fiq->forget_list_head.next == NULL)
+		fiq->forget_list_tail = &fiq->forget_list_head;
 
 	if (countp != NULL)
 		*countp = count;
@@ -1157,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
 	return head;
 }
 
-static int fuse_read_single_forget(struct fuse_conn *fc,
+static int fuse_read_single_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs,
 				   size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
 	int err;
-	struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+	struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
 	struct fuse_forget_in arg = {
 		.nlookup = forget->forget_one.nlookup,
 	};
 	struct fuse_in_header ih = {
 		.opcode = FUSE_FORGET,
 		.nodeid = forget->forget_one.nodeid,
-		.unique = fuse_get_unique(fc),
+		.unique = fuse_get_unique(fiq),
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 	kfree(forget);
 	if (nbytes < ih.len)
 		return -EINVAL;
@@ -1190,9 +1163,9 @@ __releases(fc->lock)
 	return ih.len;
 }
 
-static int fuse_read_batch_forget(struct fuse_conn *fc,
+static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs, size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
 	int err;
 	unsigned max_forgets;
@@ -1201,18 +1174,18 @@ __releases(fc->lock)
 	struct fuse_batch_forget_in arg = { .count = 0 };
 	struct fuse_in_header ih = {
 		.opcode = FUSE_BATCH_FORGET,
-		.unique = fuse_get_unique(fc),
+		.unique = fuse_get_unique(fiq),
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
 	if (nbytes < ih.len) {
-		spin_unlock(&fc->lock);
+		spin_unlock(&fiq->waitq.lock);
 		return -EINVAL;
 	}
 
 	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
-	head = dequeue_forget(fc, max_forgets, &count);
-	spin_unlock(&fc->lock);
+	head = dequeue_forget(fiq, max_forgets, &count);
+	spin_unlock(&fiq->waitq.lock);
 
 	arg.count = count;
 	ih.len += count * sizeof(struct fuse_forget_one);
@@ -1239,14 +1212,15 @@ __releases(fc->lock)
 	return ih.len;
 }
 
-static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
+			    struct fuse_copy_state *cs,
 			    size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
-	if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
-		return fuse_read_single_forget(fc, cs, nbytes);
+	if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
+		return fuse_read_single_forget(fiq, cs, nbytes);
 	else
-		return fuse_read_batch_forget(fc, cs, nbytes);
+		return fuse_read_batch_forget(fiq, cs, nbytes);
 }
 
 /*
@@ -1258,46 +1232,51 @@ __releases(fc->lock)
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
 				struct fuse_copy_state *cs, size_t nbytes)
 {
-	int err;
+	ssize_t err;
+	struct fuse_conn *fc = fud->fc;
+	struct fuse_iqueue *fiq = &fc->iq;
+	struct fuse_pqueue *fpq = &fud->pq;
 	struct fuse_req *req;
 	struct fuse_in *in;
 	unsigned reqsize;
 
  restart:
-	spin_lock(&fc->lock);
+	spin_lock(&fiq->waitq.lock);
 	err = -EAGAIN;
-	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-	    !request_pending(fc))
+	if ((file->f_flags & O_NONBLOCK) && fiq->connected &&
+	    !request_pending(fiq))
 		goto err_unlock;
 
-	request_wait(fc);
-	err = -ENODEV;
-	if (!fc->connected)
+	err = wait_event_interruptible_exclusive_locked(fiq->waitq,
+				!fiq->connected || request_pending(fiq));
+	if (err)
 		goto err_unlock;
-	err = -ERESTARTSYS;
-	if (!request_pending(fc))
+
+	err = -ENODEV;
+	if (!fiq->connected)
 		goto err_unlock;
 
-	if (!list_empty(&fc->interrupts)) {
-		req = list_entry(fc->interrupts.next, struct fuse_req,
+	if (!list_empty(&fiq->interrupts)) {
+		req = list_entry(fiq->interrupts.next, struct fuse_req,
 				 intr_entry);
-		return fuse_read_interrupt(fc, cs, nbytes, req);
+		return fuse_read_interrupt(fiq, cs, nbytes, req);
 	}
 
-	if (forget_pending(fc)) {
-		if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
-			return fuse_read_forget(fc, cs, nbytes);
+	if (forget_pending(fiq)) {
+		if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0)
+			return fuse_read_forget(fc, fiq, cs, nbytes);
 
-		if (fc->forget_batch <= -8)
-			fc->forget_batch = 16;
+		if (fiq->forget_batch <= -8)
+			fiq->forget_batch = 16;
 	}
 
-	req = list_entry(fc->pending.next, struct fuse_req, list);
-	req->state = FUSE_REQ_READING;
-	list_move(&req->list, &fc->io);
+	req = list_entry(fiq->pending.next, struct fuse_req, list);
+	clear_bit(FR_PENDING, &req->flags);
+	list_del_init(&req->list);
+	spin_unlock(&fiq->waitq.lock);
 
 	in = &req->in;
 	reqsize = in->h.len;
@@ -1310,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
 		request_end(fc, req);
 		goto restart;
 	}
-	spin_unlock(&fc->lock);
+	spin_lock(&fpq->lock);
+	list_add(&req->list, &fpq->io);
+	spin_unlock(&fpq->lock);
 	cs->req = req;
 	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
 	if (!err)
 		err = fuse_copy_args(cs, in->numargs, in->argpages,
 				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(cs);
-	spin_lock(&fc->lock);
-	req->locked = 0;
-	if (req->aborted) {
-		request_end(fc, req);
-		return -ENODEV;
+	spin_lock(&fpq->lock);
+	clear_bit(FR_LOCKED, &req->flags);
+	if (!fpq->connected) {
+		err = -ENODEV;
+		goto out_end;
 	}
 	if (err) {
 		req->out.h.error = -EIO;
-		request_end(fc, req);
-		return err;
+		goto out_end;
 	}
-	if (!req->isreply)
-		request_end(fc, req);
-	else {
-		req->state = FUSE_REQ_SENT;
-		list_move_tail(&req->list, &fc->processing);
-		if (req->interrupted)
-			queue_interrupt(fc, req);
-		spin_unlock(&fc->lock);
+	if (!test_bit(FR_ISREPLY, &req->flags)) {
+		err = reqsize;
+		goto out_end;
 	}
+	list_move_tail(&req->list, &fpq->processing);
+	spin_unlock(&fpq->lock);
+	set_bit(FR_SENT, &req->flags);
+	/* matches barrier in request_wait_answer() */
+	smp_mb__after_atomic();
+	if (test_bit(FR_INTERRUPTED, &req->flags))
+		queue_interrupt(fiq, req);
+
 	return reqsize;
 
+out_end:
+	if (!test_bit(FR_PRIVATE, &req->flags))
+		list_del_init(&req->list);
+	spin_unlock(&fpq->lock);
+	request_end(fc, req);
+	return err;
+
  err_unlock:
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 	return err;
 }
 
@@ -1359,16 +1349,17 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct fuse_copy_state cs;
 	struct file *file = iocb->ki_filp;
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (!fud)
 		return -EPERM;
 
 	if (!iter_is_iovec(to))
 		return -EINVAL;
 
-	fuse_copy_init(&cs, fc, 1, to);
+	fuse_copy_init(&cs, 1, to);
 
-	return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to));
+	return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
 }
 
 static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
@@ -1380,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	int do_wakeup = 0;
 	struct pipe_buffer *bufs;
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(in);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(in);
+
+	if (!fud)
 		return -EPERM;
 
 	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (!bufs)
 		return -ENOMEM;
 
-	fuse_copy_init(&cs, fc, 1, NULL);
+	fuse_copy_init(&cs, 1, NULL);
 	cs.pipebufs = bufs;
 	cs.pipe = pipe;
-	ret = fuse_dev_do_read(fc, in, &cs, len);
+	ret = fuse_dev_do_read(fud, in, &cs, len);
 	if (ret < 0)
 		goto out;
 
@@ -1830,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 }
 
 /* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
 {
 	struct fuse_req *req;
 
-	list_for_each_entry(req, &fc->processing, list) {
+	list_for_each_entry(req, &fpq->processing, list) {
 		if (req->in.h.unique == unique || req->intr_unique == unique)
 			return req;
 	}
@@ -1871,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 				 struct fuse_copy_state *cs, size_t nbytes)
 {
 	int err;
+	struct fuse_conn *fc = fud->fc;
+	struct fuse_pqueue *fpq = &fud->pq;
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 
@@ -1902,63 +1896,60 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
 	if (oh.error <= -1000 || oh.error > 0)
 		goto err_finish;
 
-	spin_lock(&fc->lock);
+	spin_lock(&fpq->lock);
 	err = -ENOENT;
-	if (!fc->connected)
-		goto err_unlock;
+	if (!fpq->connected)
+		goto err_unlock_pq;
 
-	req = request_find(fc, oh.unique);
+	req = request_find(fpq, oh.unique);
 	if (!req)
-		goto err_unlock;
+		goto err_unlock_pq;
 
-	if (req->aborted) {
-		spin_unlock(&fc->lock);
-		fuse_copy_finish(cs);
-		spin_lock(&fc->lock);
-		request_end(fc, req);
-		return -ENOENT;
-	}
 	/* Is it an interrupt reply? */
 	if (req->intr_unique == oh.unique) {
+		spin_unlock(&fpq->lock);
+
 		err = -EINVAL;
 		if (nbytes != sizeof(struct fuse_out_header))
-			goto err_unlock;
+			goto err_finish;
 
 		if (oh.error == -ENOSYS)
 			fc->no_interrupt = 1;
 		else if (oh.error == -EAGAIN)
-			queue_interrupt(fc, req);
+			queue_interrupt(&fc->iq, req);
 
-		spin_unlock(&fc->lock);
 		fuse_copy_finish(cs);
 		return nbytes;
 	}
 
-	req->state = FUSE_REQ_WRITING;
-	list_move(&req->list, &fc->io);
+	clear_bit(FR_SENT, &req->flags);
+	list_move(&req->list, &fpq->io);
 	req->out.h = oh;
-	req->locked = 1;
+	set_bit(FR_LOCKED, &req->flags);
+	spin_unlock(&fpq->lock);
 	cs->req = req;
 	if (!req->out.page_replace)
 		cs->move_pages = 0;
-	spin_unlock(&fc->lock);
 
 	err = copy_out_args(cs, &req->out, nbytes);
 	fuse_copy_finish(cs);
 
-	spin_lock(&fc->lock);
-	req->locked = 0;
-	if (!err) {
-		if (req->aborted)
-			err = -ENOENT;
-	} else if (!req->aborted)
+	spin_lock(&fpq->lock);
+	clear_bit(FR_LOCKED, &req->flags);
+	if (!fpq->connected)
+		err = -ENOENT;
+	else if (err)
 		req->out.h.error = -EIO;
+	if (!test_bit(FR_PRIVATE, &req->flags))
+		list_del_init(&req->list);
+	spin_unlock(&fpq->lock);
+
 	request_end(fc, req);
 
 	return err ? err : nbytes;
 
- err_unlock:
-	spin_unlock(&fc->lock);
+ err_unlock_pq:
+	spin_unlock(&fpq->lock);
  err_finish:
 	fuse_copy_finish(cs);
 	return err;
@@ -1967,16 +1958,17 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
 static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+
+	if (!fud)
 		return -EPERM;
 
 	if (!iter_is_iovec(from))
 		return -EINVAL;
 
-	fuse_copy_init(&cs, fc, 0, from);
+	fuse_copy_init(&cs, 0, from);
 
-	return fuse_dev_do_write(fc, &cs, iov_iter_count(from));
+	return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
 }
 
 static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
@@ -1987,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	unsigned idx;
 	struct pipe_buffer *bufs;
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc;
+	struct fuse_dev *fud;
 	size_t rem;
 	ssize_t ret;
 
-	fc = fuse_get_conn(out);
-	if (!fc)
+	fud = fuse_get_dev(out);
+	if (!fud)
 		return -EPERM;
 
 	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
@@ -2039,7 +2031,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	}
 	pipe_unlock(pipe);
 
-	fuse_copy_init(&cs, fc, 0, NULL);
+	fuse_copy_init(&cs, 0, NULL);
 	cs.pipebufs = bufs;
 	cs.nr_segs = nbuf;
 	cs.pipe = pipe;
@@ -2047,7 +2039,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	if (flags & SPLICE_F_MOVE)
 		cs.move_pages = 1;
 
-	ret = fuse_dev_do_write(fc, &cs, len);
+	ret = fuse_dev_do_write(fud, &cs, len);
 
 	for (idx = 0; idx < nbuf; idx++) {
 		struct pipe_buffer *buf = &bufs[idx];
@@ -2061,18 +2053,21 @@ out:
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
 	unsigned mask = POLLOUT | POLLWRNORM;
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
+	struct fuse_iqueue *fiq;
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (!fud)
 		return POLLERR;
 
-	poll_wait(file, &fc->waitq, wait);
+	fiq = &fud->fc->iq;
+	poll_wait(file, &fiq->waitq, wait);
 
-	spin_lock(&fc->lock);
-	if (!fc->connected)
+	spin_lock(&fiq->waitq.lock);
+	if (!fiq->connected)
 		mask = POLLERR;
-	else if (request_pending(fc))
+	else if (request_pending(fiq))
 		mask |= POLLIN | POLLRDNORM;
-	spin_unlock(&fc->lock);
+	spin_unlock(&fiq->waitq.lock);
 
 	return mask;
 }
@@ -2083,67 +2078,18 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(fc->lock)
-__acquires(fc->lock)
 {
 	while (!list_empty(head)) {
 		struct fuse_req *req;
 		req = list_entry(head->next, struct fuse_req, list);
 		req->out.h.error = -ECONNABORTED;
-		request_end(fc, req);
-		spin_lock(&fc->lock);
-	}
-}
-
-/*
- * Abort requests under I/O
- *
- * The requests are set to aborted and finished, and the request
- * waiter is woken up.  This will make request_wait_answer() wait
- * until the request is unlocked and then return.
- *
- * If the request is asynchronous, then the end function needs to be
- * called after waiting for the request to be unlocked (if it was
- * locked).
- */
-static void end_io_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	while (!list_empty(&fc->io)) {
-		struct fuse_req *req =
-			list_entry(fc->io.next, struct fuse_req, list);
-		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-
-		req->aborted = 1;
-		req->out.h.error = -ECONNABORTED;
-		req->state = FUSE_REQ_FINISHED;
+		clear_bit(FR_PENDING, &req->flags);
+		clear_bit(FR_SENT, &req->flags);
 		list_del_init(&req->list);
-		wake_up(&req->waitq);
-		if (end) {
-			req->end = NULL;
-			__fuse_get_request(req);
-			spin_unlock(&fc->lock);
-			wait_event(req->waitq, !req->locked);
-			end(fc, req);
-			fuse_put_request(fc, req);
-			spin_lock(&fc->lock);
-		}
+		request_end(fc, req);
 	}
 }
 
-static void end_queued_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-	fc->max_background = UINT_MAX;
-	flush_bg_queue(fc);
-	end_requests(fc, &fc->pending);
-	end_requests(fc, &fc->processing);
-	while (forget_pending(fc))
-		kfree(dequeue_forget(fc, 1, NULL));
-}
-
 static void end_polls(struct fuse_conn *fc)
 {
 	struct rb_node *p;
@@ -2162,67 +2108,164 @@ static void end_polls(struct fuse_conn *fc)
 /*
  * Abort all requests.
  *
- * Emergency exit in case of a malicious or accidental deadlock, or
- * just a hung filesystem.
- *
- * The same effect is usually achievable through killing the
- * filesystem daemon and all users of the filesystem.  The exception
- * is the combination of an asynchronous request and the tricky
- * deadlock (see Documentation/filesystems/fuse.txt).
+ * Emergency exit in case of a malicious or accidental deadlock, or just a hung
+ * filesystem.
  *
- * During the aborting, progression of requests from the pending and
- * processing lists onto the io list, and progression of new requests
- * onto the pending list is prevented by req->connected being false.
+ * The same effect is usually achievable through killing the filesystem daemon
+ * and all users of the filesystem.  The exception is the combination of an
+ * asynchronous request and the tricky deadlock (see
+ * Documentation/filesystems/fuse.txt).
  *
- * Progression of requests under I/O to the processing list is
- * prevented by the req->aborted flag being true for these requests.
- * For this reason requests on the io list must be aborted first.
+ * Aborting requests under I/O goes as follows: 1: Separate out unlocked
+ * requests, they should be finished off immediately.  Locked requests will be
+ * finished after unlock; see unlock_request(). 2: Finish off the unlocked
+ * requests.  It is possible that some request will finish before we can.  This
+ * is OK, the request will in that case be removed from the list before we touch
+ * it.
  */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
+	struct fuse_iqueue *fiq = &fc->iq;
+
 	spin_lock(&fc->lock);
 	if (fc->connected) {
+		struct fuse_dev *fud;
+		struct fuse_req *req, *next;
+		LIST_HEAD(to_end1);
+		LIST_HEAD(to_end2);
+
 		fc->connected = 0;
 		fc->blocked = 0;
 		fuse_set_initialized(fc);
-		end_io_requests(fc);
-		end_queued_requests(fc);
+		list_for_each_entry(fud, &fc->devices, entry) {
+			struct fuse_pqueue *fpq = &fud->pq;
+
+			spin_lock(&fpq->lock);
+			fpq->connected = 0;
+			list_for_each_entry_safe(req, next, &fpq->io, list) {
+				req->out.h.error = -ECONNABORTED;
+				spin_lock(&req->waitq.lock);
+				set_bit(FR_ABORTED, &req->flags);
+				if (!test_bit(FR_LOCKED, &req->flags)) {
+					set_bit(FR_PRIVATE, &req->flags);
+					list_move(&req->list, &to_end1);
+				}
+				spin_unlock(&req->waitq.lock);
+			}
+			list_splice_init(&fpq->processing, &to_end2);
+			spin_unlock(&fpq->lock);
+		}
+		fc->max_background = UINT_MAX;
+		flush_bg_queue(fc);
+
+		spin_lock(&fiq->waitq.lock);
+		fiq->connected = 0;
+		list_splice_init(&fiq->pending, &to_end2);
+		while (forget_pending(fiq))
+			kfree(dequeue_forget(fiq, 1, NULL));
+		wake_up_all_locked(&fiq->waitq);
+		spin_unlock(&fiq->waitq.lock);
+		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 		end_polls(fc);
-		wake_up_all(&fc->waitq);
 		wake_up_all(&fc->blocked_waitq);
-		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+		spin_unlock(&fc->lock);
+
+		while (!list_empty(&to_end1)) {
+			req = list_first_entry(&to_end1, struct fuse_req, list);
+			__fuse_get_request(req);
+			list_del_init(&req->list);
+			request_end(fc, req);
+		}
+		end_requests(fc, &to_end2);
+	} else {
+		spin_unlock(&fc->lock);
 	}
-	spin_unlock(&fc->lock);
 }
 EXPORT_SYMBOL_GPL(fuse_abort_conn);
 
 int fuse_dev_release(struct inode *inode, struct file *file)
 {
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (fc) {
-		spin_lock(&fc->lock);
-		fc->connected = 0;
-		fc->blocked = 0;
-		fuse_set_initialized(fc);
-		end_queued_requests(fc);
-		end_polls(fc);
-		wake_up_all(&fc->blocked_waitq);
-		spin_unlock(&fc->lock);
-		fuse_conn_put(fc);
-	}
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (fud) {
+		struct fuse_conn *fc = fud->fc;
+		struct fuse_pqueue *fpq = &fud->pq;
 
+		WARN_ON(!list_empty(&fpq->io));
+		end_requests(fc, &fpq->processing);
+		/* Are we the last open device? */
+		if (atomic_dec_and_test(&fc->dev_count)) {
+			WARN_ON(fc->iq.fasync != NULL);
+			fuse_abort_conn(fc);
+		}
+		fuse_dev_free(fud);
+	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(fuse_dev_release);
 
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
+	struct fuse_dev *fud = fuse_get_dev(file);
+
+	if (!fud)
 		return -EPERM;
 
 	/* No locking - fasync_helper does its own locking */
-	return fasync_helper(fd, file, on, &fc->fasync);
+	return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
+}
+
+static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
+{
+	struct fuse_dev *fud;
+
+	if (new->private_data)
+		return -EINVAL;
+
+	fud = fuse_dev_alloc(fc);
+	if (!fud)
+		return -ENOMEM;
+
+	new->private_data = fud;
+	atomic_inc(&fc->dev_count);
+
+	return 0;
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	int err = -ENOTTY;
+
+	if (cmd == FUSE_DEV_IOC_CLONE) {
+		int oldfd;
+
+		err = -EFAULT;
+		if (!get_user(oldfd, (__u32 __user *) arg)) {
+			struct file *old = fget(oldfd);
+
+			err = -EINVAL;
+			if (old) {
+				struct fuse_dev *fud = NULL;
+
+				/*
+				 * Check against file->f_op because CUSE
+				 * uses the same ioctl handler.
+				 */
+				if (old->f_op == file->f_op &&
+				    old->f_cred->user_ns == file->f_cred->user_ns)
+					fud = fuse_get_dev(old);
+
+				if (fud) {
+					mutex_lock(&fuse_mutex);
+					err = fuse_device_clone(fud->fc, file);
+					mutex_unlock(&fuse_mutex);
+				}
+				fput(old);
+			}
+		}
+	}
+	return err;
 }
 
 const struct file_operations fuse_dev_operations = {
@@ -2236,6 +2279,8 @@ const struct file_operations fuse_dev_operations = {
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
+	.unlocked_ioctl = fuse_dev_ioctl,
+	.compat_ioctl   = fuse_dev_ioctl,
 };
 EXPORT_SYMBOL_GPL(fuse_dev_operations);
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0572bca49..5e2e08712 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
 	return err;
 }
 
-static char *read_link(struct dentry *dentry)
+static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1389,28 +1389,12 @@ static char *read_link(struct dentry *dentry)
 		link = ERR_PTR(ret);
 	} else {
 		link[ret] = '\0';
+		*cookie = link;
 	}
 	fuse_invalidate_atime(inode);
 	return link;
 }
 
-static void free_link(char *link)
-{
-	if (!IS_ERR(link))
-		free_page((unsigned long) link);
-}
-
-static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	nd_set_link(nd, read_link(dentry));
-	return NULL;
-}
-
-static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
-{
-	free_link(nd_get_link(nd));
-}
-
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
 	return fuse_open_common(inode, file, true);
@@ -1926,7 +1910,7 @@ static const struct inode_operations fuse_common_inode_operations = {
 static const struct inode_operations fuse_symlink_inode_operations = {
 	.setattr	= fuse_setattr,
 	.follow_link	= fuse_follow_link,
-	.put_link	= fuse_put_link,
+	.put_link	= free_page_put_link,
 	.readlink	= generic_readlink,
 	.getattr	= fuse_getattr,
 	.setxattr	= fuse_setxattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ef05b5c4..f523f2f04 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 			 * Drop the release request when client does not
 			 * implement 'open'
 			 */
-			req->background = 0;
+			__clear_bit(FR_BACKGROUND, &req->flags);
 			iput(req->misc.release.inode);
 			fuse_put_request(ff->fc, req);
 		} else if (sync) {
-			req->background = 0;
+			__clear_bit(FR_BACKGROUND, &req->flags);
 			fuse_request_send(ff->fc, req);
 			iput(req->misc.release.inode);
 			fuse_put_request(ff->fc, req);
 		} else {
 			req->end = fuse_release_end;
-			req->background = 1;
+			__set_bit(FR_BACKGROUND, &req->flags);
 			fuse_request_send_background(ff->fc, req);
 		}
 		kfree(ff);
@@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
 {
 	WARN_ON(atomic_read(&ff->count) > 1);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
-	ff->reserved_req->force = 1;
-	ff->reserved_req->background = 0;
+	__set_bit(FR_FORCE, &ff->reserved_req->flags);
+	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
 	fuse_request_send(ff->fc, ff->reserved_req);
 	fuse_put_request(ff->fc, ff->reserved_req);
 	kfree(ff);
@@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	req->force = 1;
+	__set_bit(FR_FORCE, &req->flags);
 	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err <= 0)
 		goto out;
 
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
@@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 	list_del(&req->writepages_entry);
 	for (i = 0; i < req->num_pages; i++) {
-		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
 		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
-		bdi_writeout_inc(bdi);
+		wb_writeout_inc(&bdi->wb);
 	}
 	wake_up(&fi->page_waitq);
 }
@@ -1611,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page)
 	if (!req)
 		goto err;
 
-	req->background = 1; /* writeback always goes to bg_queue */
+	/* writeback always goes to bg_queue */
+	__set_bit(FR_BACKGROUND, &req->flags);
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
 		goto err_free;
@@ -1634,7 +1635,7 @@ static int fuse_writepage_locked(struct page *page)
 	req->end = fuse_writepage_end;
 	req->inode = inode;
 
-	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fc->lock);
@@ -1742,16 +1743,15 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 		}
 	}
 
-	if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
-					old_req->state == FUSE_REQ_PENDING)) {
+	if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
 		struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
 
 		copy_highpage(old_req->pages[0], page);
 		spin_unlock(&fc->lock);
 
-		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK_TEMP);
-		bdi_writeout_inc(bdi);
+		wb_writeout_inc(&bdi->wb);
 		fuse_writepage_free(fc, new_req);
 		fuse_request_free(new_req);
 		goto out;
@@ -1830,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page,
 		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
 		req->misc.write.next = NULL;
 		req->in.argpages = 1;
-		req->background = 1;
+		__set_bit(FR_BACKGROUND, &req->flags);
 		req->num_pages = 0;
 		req->end = fuse_writepage_end;
 		req->inode = inode;
@@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
 	req->page_descs[req->num_pages].offset = 0;
 	req->page_descs[req->num_pages].length = PAGE_SIZE;
 
-	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7354dc142..405113101 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -241,16 +241,6 @@ struct fuse_args {
 
 #define FUSE_ARGS(args) struct fuse_args args = {}
 
-/** The request state */
-enum fuse_req_state {
-	FUSE_REQ_INIT = 0,
-	FUSE_REQ_PENDING,
-	FUSE_REQ_READING,
-	FUSE_REQ_SENT,
-	FUSE_REQ_WRITING,
-	FUSE_REQ_FINISHED
-};
-
 /** The request IO state (for asynchronous processing) */
 struct fuse_io_priv {
 	int async;
@@ -267,7 +257,40 @@ struct fuse_io_priv {
 };
 
 /**
+ * Request flags
+ *
+ * FR_ISREPLY:		set if the request has reply
+ * FR_FORCE:		force sending of the request even if interrupted
+ * FR_BACKGROUND:	request is sent in the background
+ * FR_WAITING:		request is counted as "waiting"
+ * FR_ABORTED:		the request was aborted
+ * FR_INTERRUPTED:	the request has been interrupted
+ * FR_LOCKED:		data is being copied to/from the request
+ * FR_PENDING:		request is not yet in userspace
+ * FR_SENT:		request is in userspace, waiting for an answer
+ * FR_FINISHED:		request is finished
+ * FR_PRIVATE:		request is on private list
+ */
+enum fuse_req_flag {
+	FR_ISREPLY,
+	FR_FORCE,
+	FR_BACKGROUND,
+	FR_WAITING,
+	FR_ABORTED,
+	FR_INTERRUPTED,
+	FR_LOCKED,
+	FR_PENDING,
+	FR_SENT,
+	FR_FINISHED,
+	FR_PRIVATE,
+};
+
+/**
  * A request to the client
+ *
+ * .waitq.lock protects the following fields:
+ *   - FR_ABORTED
+ *   - FR_LOCKED (may also be modified under fc->lock, tested under both)
  */
 struct fuse_req {
 	/** This can be on either pending processing or io lists in
@@ -283,35 +306,8 @@ struct fuse_req {
 	/** Unique ID for the interrupt request */
 	u64 intr_unique;
 
-	/*
-	 * The following bitfields are either set once before the
-	 * request is queued or setting/clearing them is protected by
-	 * fuse_conn->lock
-	 */
-
-	/** True if the request has reply */
-	unsigned isreply:1;
-
-	/** Force sending of the request even if interrupted */
-	unsigned force:1;
-
-	/** The request was aborted */
-	unsigned aborted:1;
-
-	/** Request is sent in the background */
-	unsigned background:1;
-
-	/** The request has been interrupted */
-	unsigned interrupted:1;
-
-	/** Data is being copied to/from the request */
-	unsigned locked:1;
-
-	/** Request is counted as "waiting" */
-	unsigned waiting:1;
-
-	/** State of the request */
-	enum fuse_req_state state;
+	/* Request flags, updated with test/set/clear_bit() */
+	unsigned long flags;
 
 	/** The request input */
 	struct fuse_in in;
@@ -380,6 +376,61 @@ struct fuse_req {
 	struct file *stolen_file;
 };
 
+struct fuse_iqueue {
+	/** Connection established */
+	unsigned connected;
+
+	/** Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/** The next unique request id */
+	u64 reqctr;
+
+	/** The list of pending requests */
+	struct list_head pending;
+
+	/** Pending interrupts */
+	struct list_head interrupts;
+
+	/** Queue of pending forgets */
+	struct fuse_forget_link forget_list_head;
+	struct fuse_forget_link *forget_list_tail;
+
+	/** Batching of FORGET requests (positive indicates FORGET batch) */
+	int forget_batch;
+
+	/** O_ASYNC requests */
+	struct fasync_struct *fasync;
+};
+
+struct fuse_pqueue {
+	/** Connection established */
+	unsigned connected;
+
+	/** Lock protecting accessess to  members of this structure */
+	spinlock_t lock;
+
+	/** The list of requests being processed */
+	struct list_head processing;
+
+	/** The list of requests under I/O */
+	struct list_head io;
+};
+
+/**
+ * Fuse device instance
+ */
+struct fuse_dev {
+	/** Fuse connection for this device */
+	struct fuse_conn *fc;
+
+	/** Processing queue */
+	struct fuse_pqueue pq;
+
+	/** list entry on fc->devices */
+	struct list_head entry;
+};
+
 /**
  * A Fuse connection.
  *
@@ -394,6 +445,9 @@ struct fuse_conn {
 	/** Refcount */
 	atomic_t count;
 
+	/** Number of fuse_dev's */
+	atomic_t dev_count;
+
 	struct rcu_head rcu;
 
 	/** The user id for this mount */
@@ -411,17 +465,8 @@ struct fuse_conn {
 	/** Maximum write size */
 	unsigned max_write;
 
-	/** Readers of the connection are waiting on this */
-	wait_queue_head_t waitq;
-
-	/** The list of pending requests */
-	struct list_head pending;
-
-	/** The list of requests being processed */
-	struct list_head processing;
-
-	/** The list of requests under I/O */
-	struct list_head io;
+	/** Input queue */
+	struct fuse_iqueue iq;
 
 	/** The next unique kernel file handle */
 	u64 khctr;
@@ -444,16 +489,6 @@ struct fuse_conn {
 	/** The list of background requests set aside for later queuing */
 	struct list_head bg_queue;
 
-	/** Pending interrupts */
-	struct list_head interrupts;
-
-	/** Queue of pending forgets */
-	struct fuse_forget_link forget_list_head;
-	struct fuse_forget_link *forget_list_tail;
-
-	/** Batching of FORGET requests (positive indicates FORGET batch) */
-	int forget_batch;
-
 	/** Flag indicating that INIT reply has been received. Allocating
 	 * any fuse request will be suspended until the flag is set */
 	int initialized;
@@ -469,9 +504,6 @@ struct fuse_conn {
 	/** waitq for reserved requests */
 	wait_queue_head_t reserved_req_waitq;
 
-	/** The next unique request id */
-	u64 reqctr;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -594,9 +626,6 @@ struct fuse_conn {
 	/** number of dentries used in the above array */
 	int ctl_ndents;
 
-	/** O_ASYNC requests */
-	struct fasync_struct *fasync;
-
 	/** Key for lock owner ID scrambling */
 	u32 scramble_key[4];
 
@@ -614,6 +643,9 @@ struct fuse_conn {
 
 	/** Read/write semaphore to hold when accessing sb. */
 	struct rw_semaphore killsb;
+
+	/** List of device instances belonging to this connection */
+	struct list_head devices;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -826,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc);
  */
 void fuse_conn_put(struct fuse_conn *fc);
 
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+void fuse_dev_free(struct fuse_dev *fud);
+
 /**
  * Add connection to control filesystem
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 708d69711..2913db2a5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc)
 	if (req && fc->conn_init) {
 		fc->destroy_req = NULL;
 		req->in.h.opcode = FUSE_DESTROY;
-		req->force = 1;
-		req->background = 0;
+		__set_bit(FR_FORCE, &req->flags);
+		__clear_bit(FR_BACKGROUND, &req->flags);
 		fuse_request_send(fc, req);
 		fuse_put_request(fc, req);
 	}
@@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
+static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+{
+	memset(fiq, 0, sizeof(struct fuse_iqueue));
+	init_waitqueue_head(&fiq->waitq);
+	INIT_LIST_HEAD(&fiq->pending);
+	INIT_LIST_HEAD(&fiq->interrupts);
+	fiq->forget_list_tail = &fiq->forget_list_head;
+	fiq->connected = 1;
+}
+
+static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+{
+	memset(fpq, 0, sizeof(struct fuse_pqueue));
+	spin_lock_init(&fpq->lock);
+	INIT_LIST_HEAD(&fpq->processing);
+	INIT_LIST_HEAD(&fpq->io);
+	fpq->connected = 1;
+}
+
 void fuse_conn_init(struct fuse_conn *fc)
 {
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
 	init_rwsem(&fc->killsb);
 	atomic_set(&fc->count, 1);
-	init_waitqueue_head(&fc->waitq);
+	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
 	init_waitqueue_head(&fc->reserved_req_waitq);
-	INIT_LIST_HEAD(&fc->pending);
-	INIT_LIST_HEAD(&fc->processing);
-	INIT_LIST_HEAD(&fc->io);
-	INIT_LIST_HEAD(&fc->interrupts);
+	fuse_iqueue_init(&fc->iq);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
-	fc->forget_list_tail = &fc->forget_list_head;
+	INIT_LIST_HEAD(&fc->devices);
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
 	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
-	fc->reqctr = 0;
 	fc->blocked = 0;
 	fc->initialized = 0;
+	fc->connected = 1;
 	fc->attr_version = 1;
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 }
@@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 
 static void fuse_free_conn(struct fuse_conn *fc)
 {
+	WARN_ON(!list_empty(&fc->devices));
 	kfree_rcu(fc, rcu);
 }
 
@@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	return 0;
 }
 
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+{
+	struct fuse_dev *fud;
+
+	fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
+	if (fud) {
+		fud->fc = fuse_conn_get(fc);
+		fuse_pqueue_init(&fud->pq);
+
+		spin_lock(&fc->lock);
+		list_add_tail(&fud->entry, &fc->devices);
+		spin_unlock(&fc->lock);
+	}
+
+	return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+
+void fuse_dev_free(struct fuse_dev *fud)
+{
+	struct fuse_conn *fc = fud->fc;
+
+	if (fc) {
+		spin_lock(&fc->lock);
+		list_del(&fud->entry);
+		spin_unlock(&fc->lock);
+
+		fuse_conn_put(fc);
+	}
+	kfree(fud);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_free);
+
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
+	struct fuse_dev *fud;
 	struct fuse_conn *fc;
 	struct inode *root;
 	struct fuse_mount_data d;
@@ -1028,11 +1079,15 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fuse_conn_init(fc);
 	fc->release = fuse_free_conn;
 
+	fud = fuse_dev_alloc(fc);
+	if (!fud)
+		goto err_put_conn;
+
 	fc->dev = sb->s_dev;
 	fc->sb = sb;
 	err = fuse_bdi_init(fc, sb);
 	if (err)
-		goto err_put_conn;
+		goto err_dev_free;
 
 	sb->s_bdi = &fc->bdi;
 
@@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	root = fuse_get_root_inode(sb, d.rootmode);
 	root_dentry = d_make_root(root);
 	if (!root_dentry)
-		goto err_put_conn;
+		goto err_dev_free;
 	/* only now - we want root dentry with NULL ->d_op */
 	sb->s_d_op = &fuse_dentry_operations;
 
 	init_req = fuse_request_alloc(0);
 	if (!init_req)
 		goto err_put_root;
-	init_req->background = 1;
+	__set_bit(FR_BACKGROUND, &init_req->flags);
 
 	if (is_bdev) {
 		fc->destroy_req = fuse_request_alloc(0);
@@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	list_add_tail(&fc->entry, &fuse_conn_list);
 	sb->s_root = root_dentry;
-	fc->connected = 1;
-	file->private_data = fuse_conn_get(fc);
+	file->private_data = fud;
 	mutex_unlock(&fuse_mutex);
 	/*
 	 * atomic_dec_and_test() in fput() provides the necessary
@@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
+ err_dev_free:
+	fuse_dev_free(fud);
  err_put_conn:
 	fuse_bdi_destroy(fc);
 	fuse_conn_put(fc);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 5551fea0a..1caee0534 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -171,6 +171,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
 /**
  * gfs2_jdata_writepage - Write complete page
  * @page: Page to write
+ * @wbc: The writeback control
  *
  * Returns: errno
  *
@@ -221,9 +222,10 @@ static int gfs2_writepages(struct address_space *mapping,
  * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
  * @mapping: The mapping
  * @wbc: The writeback control
- * @writepage: The writepage function to call for each page
  * @pvec: The vector of pages
  * @nr_pages: The number of pages to write
+ * @end: End position
+ * @done_index: Page index
  *
  * Returns: non-zero if loop should terminate, zero otherwise
  */
@@ -333,8 +335,6 @@ continue_unlock:
  * gfs2_write_cache_jdata - Like write_cache_pages but different
  * @mapping: The mapping to write
  * @wbc: The writeback control
- * @writepage: The writepage function to call
- * @data: The data to pass to writepage
  *
  * The reason that we use our own function here is that we need to
  * start transactions before we grab page locks. This allows us
@@ -588,6 +588,10 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
 
 /**
  * gfs2_readpages - Read a bunch of pages at once
+ * @file: The file to read from
+ * @mapping: Address space info
+ * @pages: List of pages to read
+ * @nr_pages: Number of pages to read
  *
  * Some notes:
  * 1. This is only for readahead, so we can simply ignore any things
@@ -853,7 +857,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
  * @mapping: The address space to write to
  * @pos: The file position
  * @len: The length of the data
- * @copied:
+ * @copied: How much was actually copied by the VFS
  * @page: The page that has been written
  * @fsdata: The fsdata (unused in GFS2)
  *
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 31892871e..cf4ab8915 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -180,7 +180,7 @@ void gfs2_set_inode_flags(struct inode *inode)
 
 	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC);
 	if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode))
-		inode->i_flags |= S_NOSEC;
+		flags |= S_NOSEC;
 	if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
 		flags |= S_IMMUTABLE;
 	if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
@@ -917,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
 	struct gfs2_holder gh;
 	int ret;
 
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
 		return -EOPNOTSUPP;
 
 	mutex_lock(&inode->i_mutex);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0fa8062f8..a38e38f7b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1076,7 +1076,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
 			fast_path = 1;
 	}
-	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) &&
+	    (glops->go_flags & GLOF_LRU))
 		gfs2_glock_add_to_lru(gl);
 
 	trace_gfs2_glock_queue(gh, 0);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fe91951c3..fa3fa5e94 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -144,6 +144,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 	struct gfs2_rgrpd *rgd;
 	int error;
 
+	spin_lock(&gl->gl_spin);
+	rgd = gl->gl_object;
+	if (rgd)
+		gfs2_rgrp_brelse(rgd);
+	spin_unlock(&gl->gl_spin);
+
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
@@ -175,15 +181,17 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct address_space *mapping = &sdp->sd_aspace;
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+
+	if (rgd)
+		gfs2_rgrp_brelse(rgd);
 
 	WARN_ON_ONCE(!(flags & DIO_METADATA));
 	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
 	truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 
-	if (gl->gl_object) {
-		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+	if (rgd)
 		rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
-	}
 }
 
 /**
@@ -561,7 +569,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_lock = inode_go_lock,
 	.go_dump = inode_go_dump,
 	.go_type = LM_TYPE_INODE,
-	.go_flags = GLOF_ASPACE,
+	.go_flags = GLOF_ASPACE | GLOF_LRU,
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -584,10 +592,12 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
 const struct gfs2_glock_operations gfs2_iopen_glops = {
 	.go_type = LM_TYPE_IOPEN,
 	.go_callback = iopen_go_callback,
+	.go_flags = GLOF_LRU,
 };
 
 const struct gfs2_glock_operations gfs2_flock_glops = {
 	.go_type = LM_TYPE_FLOCK,
+	.go_flags = GLOF_LRU,
 };
 
 const struct gfs2_glock_operations gfs2_nondisk_glops = {
@@ -596,7 +606,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
 
 const struct gfs2_glock_operations gfs2_quota_glops = {
 	.go_type = LM_TYPE_QUOTA,
-	.go_flags = GLOF_LVB,
+	.go_flags = GLOF_LVB | GLOF_LRU,
 };
 
 const struct gfs2_glock_operations gfs2_journal_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 58b75abf6..a1ec7c20e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -225,6 +225,7 @@ struct gfs2_glock_operations {
 	const unsigned long go_flags;
 #define GLOF_ASPACE 1
 #define GLOF_LVB    2
+#define GLOF_LRU    4
 };
 
 enum {
@@ -432,6 +433,7 @@ enum {
 	QDF_CHANGE		= 1,
 	QDF_LOCKED		= 2,
 	QDF_REFRESH		= 3,
+	QDF_QMSG_QUIET          = 4,
 };
 
 struct gfs2_quota_data {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 1b3ca7a2e..063fdfcf8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1227,8 +1227,8 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
  */
 
 static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
-                            struct file *file, unsigned flags,
-                            umode_t mode, int *opened)
+			    struct file *file, unsigned flags,
+			    umode_t mode, int *opened)
 {
 	struct dentry *d;
 	bool excl = !!(flags & O_EXCL);
@@ -1307,6 +1307,35 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
 }
 
 /**
+ * update_moved_ino - Update an inode that's being moved
+ * @ip: The inode being moved
+ * @ndip: The parent directory of the new filename
+ * @dir_rename: True of ip is a directory
+ *
+ * Returns: errno
+ */
+
+static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip,
+			    int dir_rename)
+{
+	int error;
+	struct buffer_head *dibh;
+
+	if (dir_rename)
+		return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+	ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+	return 0;
+}
+
+
+/**
  * gfs2_rename - Rename a file
  * @odir: Parent directory of old file name
  * @odentry: The old dentry of the file
@@ -1354,7 +1383,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 		if (S_ISDIR(ip->i_inode.i_mode)) {
 			dir_rename = 1;
-			/* don't move a dirctory into it's subdir */
+			/* don't move a directory into its subdir */
 			error = gfs2_ok_to_move(ip, ndip);
 			if (error)
 				goto out_gunlock_r;
@@ -1494,20 +1523,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	if (nip)
 		error = gfs2_unlink_inode(ndip, ndentry);
 
-	if (dir_rename) {
-		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
-		if (error)
-			goto out_end_trans;
-	} else {
-		struct buffer_head *dibh;
-		error = gfs2_meta_inode_buffer(ip, &dibh);
-		if (error)
-			goto out_end_trans;
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_meta(ip->i_gl, dibh);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
+	error = update_moved_ino(ip, ndip, dir_rename);
+	if (error)
+		goto out_end_trans;
 
 	error = gfs2_dir_del(odip, odentry);
 	if (error)
@@ -1539,6 +1557,161 @@ out:
 }
 
 /**
+ * gfs2_exchange - exchange two files
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ * @flags: The rename flags
+ *
+ * Returns: errno
+ */
+
+static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
+			 struct inode *ndir, struct dentry *ndentry,
+			 unsigned int flags)
+{
+	struct gfs2_inode *odip = GFS2_I(odir);
+	struct gfs2_inode *ndip = GFS2_I(ndir);
+	struct gfs2_inode *oip = GFS2_I(odentry->d_inode);
+	struct gfs2_inode *nip = GFS2_I(ndentry->d_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(odir);
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+	unsigned int num_gh;
+	unsigned int x;
+	umode_t old_mode = oip->i_inode.i_mode;
+	umode_t new_mode = nip->i_inode.i_mode;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	if (odip != ndip) {
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+					   0, &r_gh);
+		if (error)
+			goto out;
+
+		if (S_ISDIR(old_mode)) {
+			/* don't move a directory into its subdir */
+			error = gfs2_ok_to_move(oip, ndip);
+			if (error)
+				goto out_gunlock_r;
+		}
+
+		if (S_ISDIR(new_mode)) {
+			/* don't move a directory into its subdir */
+			error = gfs2_ok_to_move(nip, odip);
+			if (error)
+				goto out_gunlock_r;
+		}
+	}
+
+	num_gh = 1;
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	if (odip != ndip) {
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+	}
+	gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	num_gh++;
+
+	gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	num_gh++;
+
+	for (x = 0; x < num_gh; x++) {
+		error = gfs2_glock_nq(ghs + x);
+		if (error)
+			goto out_gunlock;
+	}
+
+	error = -ENOENT;
+	if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0)
+		goto out_gunlock;
+
+	error = gfs2_unlink_ok(odip, &odentry->d_name, oip);
+	if (error)
+		goto out_gunlock;
+	error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+	if (error)
+		goto out_gunlock;
+
+	if (S_ISDIR(old_mode)) {
+		error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+		if (error)
+			goto out_gunlock;
+	}
+	if (S_ISDIR(new_mode)) {
+		error = gfs2_permission(ndentry->d_inode, MAY_WRITE);
+		if (error)
+			goto out_gunlock;
+	}
+	error = gfs2_trans_begin(sdp, 4 * RES_DINODE + 4 * RES_LEAF, 0);
+	if (error)
+		goto out_gunlock;
+
+	error = update_moved_ino(oip, ndip, S_ISDIR(old_mode));
+	if (error)
+		goto out_end_trans;
+
+	error = update_moved_ino(nip, odip, S_ISDIR(new_mode));
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_mvino(ndip, &ndentry->d_name, oip,
+			       IF2DT(old_mode));
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_mvino(odip, &odentry->d_name, nip,
+			       IF2DT(new_mode));
+	if (error)
+		goto out_end_trans;
+
+	if (odip != ndip) {
+		if (S_ISDIR(new_mode) && !S_ISDIR(old_mode)) {
+			inc_nlink(&odip->i_inode);
+			drop_nlink(&ndip->i_inode);
+		} else if (S_ISDIR(old_mode) && !S_ISDIR(new_mode)) {
+			inc_nlink(&ndip->i_inode);
+			drop_nlink(&odip->i_inode);
+		}
+	}
+	mark_inode_dirty(&ndip->i_inode);
+	if (odip != ndip)
+		mark_inode_dirty(&odip->i_inode);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock:
+	while (x--) {
+		gfs2_glock_dq(ghs + x);
+		gfs2_holder_uninit(ghs + x);
+	}
+out_gunlock_r:
+	if (r_gh.gh_gl)
+		gfs2_glock_dq_uninit(&r_gh);
+out:
+	return error;
+}
+
+static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
+			struct inode *ndir, struct dentry *ndentry,
+			unsigned int flags)
+{
+	flags &= ~RENAME_NOREPLACE;
+
+	if (flags & ~RENAME_EXCHANGE)
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE)
+		return gfs2_exchange(odir, odentry, ndir, ndentry, flags);
+
+	return gfs2_rename(odir, odentry, ndir, ndentry);
+}
+
+/**
  * gfs2_follow_link - Follow a symbolic link
  * @dentry: The dentry of the link
  * @nd: Data that we pass to vfs_follow_link()
@@ -1548,7 +1721,7 @@ out:
  * Returns: 0 on success or error code
  */
 
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
 	struct gfs2_holder i_gh;
@@ -1561,8 +1734,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
-		nd_set_link(nd, ERR_PTR(error));
-		return NULL;
+		return ERR_PTR(error);
 	}
 
 	size = (unsigned int)i_size_read(&ip->i_inode);
@@ -1586,8 +1758,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	brelse(dibh);
 out:
 	gfs2_glock_dq_uninit(&i_gh);
-	nd_set_link(nd, buf);
-	return NULL;
+	if (!IS_ERR(buf))
+		*cookie = buf;
+	return buf;
 }
 
 /**
@@ -1716,7 +1889,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 
 	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
 	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
-		gfs2_quota_change(ip, -ap.target, ouid, ogid);
+		gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid);
 		gfs2_quota_change(ip, ap.target, nuid, ngid);
 	}
 
@@ -1943,7 +2116,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.mkdir = gfs2_mkdir,
 	.rmdir = gfs2_unlink,
 	.mknod = gfs2_mknod,
-	.rename = gfs2_rename,
+	.rename2 = gfs2_rename2,
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 35b49f44c..1e3a93f2f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -756,6 +756,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		}
 	}
 
+	sdp->sd_log_idle = 1;
 	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
 	gfs2_glock_dq_uninit(&ji_gh);
 	jindex = 0;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3065cb9a..9b61f92fc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -649,9 +649,117 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 		slot_hold(qd);
 	}
 
+	if (change < 0) /* Reset quiet flag if we freed some blocks */
+		clear_bit(QDF_QMSG_QUIET, &qd->qd_flags);
 	mutex_unlock(&sdp->sd_quota_mutex);
 }
 
+static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
+				  unsigned off, void *buf, unsigned bytes)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	struct buffer_head *bh;
+	void *kaddr;
+	u64 blk;
+	unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0;
+	unsigned to_write = bytes, pg_off = off;
+	int done = 0;
+
+	blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
+	boff = off % bsize;
+
+	page = find_or_create_page(mapping, index, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, bsize, 0);
+
+	bh = page_buffers(page);
+	while (!done) {
+		/* Find the beginning block within the page */
+		if (pg_off >= ((bnum * bsize) + bsize)) {
+			bh = bh->b_this_page;
+			bnum++;
+			blk++;
+			continue;
+		}
+		if (!buffer_mapped(bh)) {
+			gfs2_block_map(inode, blk, bh, 1);
+			if (!buffer_mapped(bh))
+				goto unlock_out;
+			/* If it's a newly allocated disk block, zero it */
+			if (buffer_new(bh))
+				zero_user(page, bnum * bsize, bh->b_size);
+		}
+		if (PageUptodate(page))
+			set_buffer_uptodate(bh);
+		if (!buffer_uptodate(bh)) {
+			ll_rw_block(READ | REQ_META, 1, &bh);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				goto unlock_out;
+		}
+		gfs2_trans_add_data(ip->i_gl, bh);
+
+		/* If we need to write to the next block as well */
+		if (to_write > (bsize - boff)) {
+			pg_off += (bsize - boff);
+			to_write -= (bsize - boff);
+			boff = pg_off % bsize;
+			continue;
+		}
+		done = 1;
+	}
+
+	/* Write to the page, now that we have setup the buffer(s) */
+	kaddr = kmap_atomic(page);
+	memcpy(kaddr + off, buf, bytes);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return 0;
+
+unlock_out:
+	unlock_page(page);
+	page_cache_release(page);
+	return -EIO;
+}
+
+static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
+				 loff_t loc)
+{
+	unsigned long pg_beg;
+	unsigned pg_off, nbytes, overflow = 0;
+	int pg_oflow = 0, error;
+	void *ptr;
+
+	nbytes = sizeof(struct gfs2_quota);
+
+	pg_beg = loc >> PAGE_CACHE_SHIFT;
+	pg_off = loc % PAGE_CACHE_SIZE;
+
+	/* If the quota straddles a page boundary, split the write in two */
+	if ((pg_off + nbytes) > PAGE_CACHE_SIZE) {
+		pg_oflow = 1;
+		overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE;
+	}
+
+	ptr = qp;
+	error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr,
+				       nbytes - overflow);
+	/* If there's an overflow, write the remaining bytes to the next page */
+	if (!error && pg_oflow)
+		error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0,
+					       ptr + nbytes - overflow,
+					       overflow);
+	return error;
+}
+
 /**
  * gfs2_adjust_quota - adjust record of current block usage
  * @ip: The quota inode
@@ -672,15 +780,8 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 {
 	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long index = loc >> PAGE_CACHE_SHIFT;
-	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
-	unsigned blocksize, iblock, pos;
-	struct buffer_head *bh;
-	struct page *page;
-	void *kaddr, *ptr;
 	struct gfs2_quota q;
-	int err, nbytes;
+	int err;
 	u64 size;
 
 	if (gfs2_is_stuffed(ip)) {
@@ -694,8 +795,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	if (err < 0)
 		return err;
 
+	loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */
 	err = -EIO;
 	be64_add_cpu(&q.qu_value, change);
+	if (((s64)be64_to_cpu(q.qu_value)) < 0)
+		q.qu_value = 0; /* Never go negative on quota usage */
 	qd->qd_qb.qb_value = q.qu_value;
 	if (fdq) {
 		if (fdq->d_fieldmask & QC_SPC_SOFT) {
@@ -712,79 +816,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 		}
 	}
 
-	/* Write the quota into the quota file on disk */
-	ptr = &q;
-	nbytes = sizeof(struct gfs2_quota);
-get_a_page:
-	page = find_or_create_page(mapping, index, GFP_NOFS);
-	if (!page)
-		return -ENOMEM;
-
-	blocksize = inode->i_sb->s_blocksize;
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-
-	bh = page_buffers(page);
-	pos = blocksize;
-	while (offset >= pos) {
-		bh = bh->b_this_page;
-		iblock++;
-		pos += blocksize;
-	}
-
-	if (!buffer_mapped(bh)) {
-		gfs2_block_map(inode, iblock, bh, 1);
-		if (!buffer_mapped(bh))
-			goto unlock_out;
-		/* If it's a newly allocated disk block for quota, zero it */
-		if (buffer_new(bh))
-			zero_user(page, pos - blocksize, bh->b_size);
-	}
-
-	if (PageUptodate(page))
-		set_buffer_uptodate(bh);
-
-	if (!buffer_uptodate(bh)) {
-		ll_rw_block(READ | REQ_META, 1, &bh);
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh))
-			goto unlock_out;
-	}
-
-	gfs2_trans_add_data(ip->i_gl, bh);
-
-	kaddr = kmap_atomic(page);
-	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
-		nbytes = PAGE_CACHE_SIZE - offset;
-	memcpy(kaddr + offset, ptr, nbytes);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr);
-	unlock_page(page);
-	page_cache_release(page);
-
-	/* If quota straddles page boundary, we need to update the rest of the
-	 * quota at the beginning of the next page */
-	if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
-		ptr = ptr + nbytes;
-		nbytes = sizeof(struct gfs2_quota) - nbytes;
-		offset = 0;
-		index++;
-		goto get_a_page;
+	err = gfs2_write_disk_quota(ip, &q, loc);
+	if (!err) {
+		size = loc + sizeof(struct gfs2_quota);
+		if (size > inode->i_size)
+			i_size_write(inode, size);
+		inode->i_mtime = inode->i_atime = CURRENT_TIME;
+		mark_inode_dirty(inode);
+		set_bit(QDF_REFRESH, &qd->qd_flags);
 	}
 
-	size = loc + sizeof(struct gfs2_quota);
-	if (size > inode->i_size)
-		i_size_write(inode, size);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME;
-	mark_inode_dirty(inode);
-	set_bit(QDF_REFRESH, &qd->qd_flags);
-	return 0;
-
-unlock_out:
-	unlock_page(page);
-	page_cache_release(page);
 	return err;
 }
 
@@ -1148,10 +1189,13 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
 			/* If no min_target specified or we don't meet
 			 * min_target, return -EDQUOT */
 			if (!ap->min_target || ap->min_target > ap->allowed) {
-				print_message(qd, "exceeded");
-				quota_send_warning(qd->qd_id,
-						   sdp->sd_vfs->s_dev,
-						   QUOTA_NL_BHARDWARN);
+				if (!test_and_set_bit(QDF_QMSG_QUIET,
+						      &qd->qd_flags)) {
+					print_message(qd, "exceeded");
+					quota_send_warning(qd->qd_id,
+							   sdp->sd_vfs->s_dev,
+							   QUOTA_NL_BHARDWARN);
+				}
 				error = -EDQUOT;
 				break;
 			}
@@ -1648,6 +1692,8 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
 
 	/* Apply changes */
 	error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+	if (!error)
+		clear_bit(QDF_QMSG_QUIET, &qd->qd_flags);
 
 	gfs2_trans_end(sdp);
 out_release:
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6af2396a3..c6c62321d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -978,10 +978,10 @@ static void set_rgrp_preferences(struct gfs2_sbd *sdp)
 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
 		for (i = 0; i < sdp->sd_journals; i++) {
 			rgd = gfs2_rgrpd_get_next(rgd);
-			if (rgd == first)
+			if (!rgd || rgd == first)
 				break;
 		}
-	} while (rgd != first);
+	} while (rgd && rgd != first);
 }
 
 /**
@@ -1244,14 +1244,13 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
 }
 
 /**
- * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
- * @gh: The glock holder for the resource group
+ * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @rgd: The resource group
  *
  */
 
-void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
 {
-	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
 	int x, length = rgd->rd_length;
 
 	for (x = 0; x < length; x++) {
@@ -1264,6 +1263,22 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
 
 }
 
+/**
+ * gfs2_rgrp_go_unlock - Unlock a rgrp glock
+ * @gh: The glock holder for the resource group
+ *
+ */
+
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
+{
+	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
+	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
+		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
+
+	if (rgd && demote_requested)
+		gfs2_rgrp_brelse(rgd);
+}
+
 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 			     struct buffer_head *bh,
 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
@@ -1711,10 +1726,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
 		return ret;
 
 bitmap_full:	/* Mark bitmap as full and fall through */
-		if ((state == GFS2_BLKST_FREE) && initial_offset == 0) {
-			struct gfs2_bitmap *bi = rbm_bi(rbm);
+		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
 			set_bit(GBF_FULL, &bi->bi_flags);
-		}
 
 next_bitmap:	/* Find next bitmap in the rgrp */
 		rbm->offset = 0;
@@ -1850,14 +1863,23 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
 	const struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_lkstats *st;
 	s64 r_dcount, l_dcount;
-	s64 r_srttb, l_srttb;
+	s64 l_srttb, a_srttb = 0;
 	s64 srttb_diff;
 	s64 sqr_diff;
 	s64 var;
+	int cpu, nonzero = 0;
 
 	preempt_disable();
+	for_each_present_cpu(cpu) {
+		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
+		if (st->stats[GFS2_LKS_SRTTB]) {
+			a_srttb += st->stats[GFS2_LKS_SRTTB];
+			nonzero++;
+		}
+	}
 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
-	r_srttb = st->stats[GFS2_LKS_SRTTB];
+	if (nonzero)
+		do_div(a_srttb, nonzero);
 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
 	var = st->stats[GFS2_LKS_SRTTVARB] +
 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
@@ -1866,10 +1888,10 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
 
-	if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))
+	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
 		return false;
 
-	srttb_diff = r_srttb - l_srttb;
+	srttb_diff = a_srttb - l_srttb;
 	sqr_diff = srttb_diff * srttb_diff;
 
 	var *= 2;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 68972ecfb..c0ab33fa3 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -36,6 +36,7 @@ extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
 extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
 extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
 extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
 extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
 
 extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 859c6edbf..298244594 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
-	if (bdi->dirty_exceeded)
+	if (bdi->wb.dirty_exceeded)
 		gfs2_ail1_flush(sdp, wbc);
 	else
 		filemap_fdatawrite(metamapping);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ae8e8811f..c9ff1cf7d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -101,8 +101,11 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-	int error;
-	int n = simple_strtol(buf, NULL, 0);
+	int error, n;
+
+	error = kstrtoint(buf, 0, &n);
+	if (error)
+		return error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -134,10 +137,16 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
+	int error, val;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (simple_strtol(buf, NULL, 0) != 1)
+	error = kstrtoint(buf, 0, &val);
+	if (error)
+		return error;
+
+	if (val != 1)
 		return -EINVAL;
 
 	gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
@@ -148,10 +157,16 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
 				 size_t len)
 {
+	int error, val;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (simple_strtol(buf, NULL, 0) != 1)
+	error = kstrtoint(buf, 0, &val);
+	if (error)
+		return error;
+
+	if (val != 1)
 		return -EINVAL;
 
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -161,10 +176,16 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
 static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
 				size_t len)
 {
+	int error, val;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (simple_strtol(buf, NULL, 0) != 1)
+	error = kstrtoint(buf, 0, &val);
+	if (error)
+		return error;
+
+	if (val != 1)
 		return -EINVAL;
 
 	gfs2_quota_sync(sdp->sd_vfs, 0);
@@ -181,7 +202,9 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	id = simple_strtoul(buf, NULL, 0);
+	error = kstrtou32(buf, 0, &id);
+	if (error)
+		return error;
 
 	qid = make_kqid(current_user_ns(), USRQUOTA, id);
 	if (!qid_valid(qid))
@@ -201,7 +224,9 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	id = simple_strtoul(buf, NULL, 0);
+	error = kstrtou32(buf, 0, &id);
+	if (error)
+		return error;
 
 	qid = make_kqid(current_user_ns(), GRPQUOTA, id);
 	if (!qid_valid(qid))
@@ -324,10 +349,11 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
 static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ssize_t ret = len;
-	int val;
+	int ret, val;
 
-	val = simple_strtol(buf, NULL, 0);
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
 
 	if (val == 1)
 		set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
@@ -336,9 +362,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 		smp_mb__after_atomic();
 		gfs2_glock_thaw(sdp);
 	} else {
-		ret = -EINVAL;
+		return -EINVAL;
 	}
-	return ret;
+	return len;
 }
 
 static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
@@ -350,17 +376,18 @@ static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-	ssize_t ret = len;
-	int val;
+	int ret, val;
 
-	val = simple_strtol(buf, NULL, 0);
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
 
 	if ((val == 1) &&
 	    !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
 		complete(&sdp->sd_wdack);
 	else
-		ret = -EINVAL;
-	return ret;
+		return -EINVAL;
+	return len;
 }
 
 static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
@@ -553,11 +580,14 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
 {
 	struct gfs2_tune *gt = &sdp->sd_tune;
 	unsigned int x;
+	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	x = simple_strtoul(buf, NULL, 0);
+	error = kstrtouint(buf, 0, &x);
+	if (error)
+		return error;
 
 	if (check_zero && !x)
 		return -EINVAL;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 95d255219..1f1c7dcbc 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb);
 #define __hfs_u_to_mtime(sec)	cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60)
 #define __hfs_m_to_utime(sec)	(be32_to_cpu(sec) - 2082844800U  + sys_tz.tz_minuteswest * 60)
 
-#define HFS_I(inode)	(list_entry(inode, struct hfs_inode_info, vfs_inode))
+#define HFS_I(inode)	(container_of(inode, struct hfs_inode_info, vfs_inode))
 #define HFS_SB(sb)	((struct hfs_sb_info *)(sb)->s_fs_info)
 
 #define hfs_m_to_utime(time)	(struct timespec){ .tv_sec = __hfs_m_to_utime(time) }
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eee7206c3..55c03b9e9 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/nls.h>
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index b0441d65f..f91a1faf8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -263,7 +263,7 @@ struct hfsplus_inode_info {
 
 static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 {
-	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
+	return container_of(inode, struct hfsplus_inode_info, vfs_inode);
 }
 
 /*
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 593af2fdc..7302d96ae 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 07d8d8f52..059597b23 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = {
 	.setattr	= hostfs_setattr,
 };
 
-static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	char *link = __getname();
 	if (link) {
@@ -906,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		}
 		if (err < 0) {
 			__putname(link);
-			link = ERR_PTR(err);
+			return ERR_PTR(err);
 		}
 	} else {
-		link = ERR_PTR(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
 	}
 
-	nd_set_link(nd, link);
-	return NULL;
+	return *cookie = link;
 }
 
-static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void hostfs_put_link(struct inode *unused, void *cookie)
 {
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		__putname(s);
+	__putname(cookie);
 }
 
 static const struct inode_operations hostfs_link_iops = {
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index f005046e1..d6a4b55d2 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -484,3 +484,98 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a
 	a->btree.first_free = cpu_to_le16(8);
 	return a;
 }
+
+static unsigned find_run(__le32 *bmp, unsigned *idx)
+{
+	unsigned len;
+	while (tstbits(bmp, *idx, 1)) {
+		(*idx)++;
+		if (unlikely(*idx >= 0x4000))
+			return 0;
+	}
+	len = 1;
+	while (!tstbits(bmp, *idx + len, 1))
+		len++;
+	return len;
+}
+
+static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result)
+{
+	int err;
+	secno end;
+	if (fatal_signal_pending(current))
+		return -EINTR;
+	end = start + len;
+	if (start < limit_start)
+		start = limit_start;
+	if (end > limit_end)
+		end = limit_end;
+	if (start >= end)
+		return 0;
+	if (end - start < minlen)
+		return 0;
+	err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0);
+	if (err)
+		return err;
+	*result += end - start;
+	return 0;
+}
+
+int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result)
+{
+	int err = 0;
+	struct hpfs_sb_info *sbi = hpfs_sb(s);
+	unsigned idx, len, start_bmp, end_bmp;
+	__le32 *bmp;
+	struct quad_buffer_head qbh;
+
+	*result = 0;
+	if (!end || end > sbi->sb_fs_size)
+		end = sbi->sb_fs_size;
+	if (start >= sbi->sb_fs_size)
+		return 0;
+	if (minlen > 0x4000)
+		return 0;
+	if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) {
+		hpfs_lock(s);
+		if (s->s_flags & MS_RDONLY) {
+			err = -EROFS;
+			goto unlock_1;
+		}
+		if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
+			err = -EIO;
+			goto unlock_1;
+		}
+		idx = 0;
+		while ((len = find_run(bmp, &idx)) && !err) {
+			err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result);
+			idx += len;
+		}
+		hpfs_brelse4(&qbh);
+unlock_1:
+		hpfs_unlock(s);
+	}
+	start_bmp = start >> 14;
+	end_bmp = (end + 0x3fff) >> 14;
+	while (start_bmp < end_bmp && !err) {
+		hpfs_lock(s);
+		if (s->s_flags & MS_RDONLY) {
+			err = -EROFS;
+			goto unlock_2;
+		}
+		if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) {
+			err = -EIO;
+			goto unlock_2;
+		}
+		idx = 0;
+		while ((len = find_run(bmp, &idx)) && !err) {
+			err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result);
+			idx += len;
+		}
+		hpfs_brelse4(&qbh);
+unlock_2:
+		hpfs_unlock(s);
+		start_bmp++;
+	}
+	return err;
+}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2a8e07425..dc540bfce 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -327,4 +327,5 @@ const struct file_operations hpfs_dir_ops =
 	.iterate	= hpfs_readdir,
 	.release	= hpfs_dir_release,
 	.fsync		= hpfs_file_fsync,
+	.unlocked_ioctl	= hpfs_ioctl,
 };
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 6d8cfe9b5..7ca28d604 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -203,6 +203,7 @@ const struct file_operations hpfs_file_ops =
 	.release	= hpfs_file_release,
 	.fsync		= hpfs_file_fsync,
 	.splice_read	= generic_file_splice_read,
+	.unlocked_ioctl	= hpfs_ioctl,
 };
 
 const struct inode_operations hpfs_file_iops =
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b63b75fa0..c4867b511 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -18,6 +18,8 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
 #include <asm/unaligned.h>
 
 #include "hpfs.h"
@@ -200,6 +202,7 @@ void hpfs_free_dnode(struct super_block *, secno);
 struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *);
 struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **);
 struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **);
+int hpfs_trim_fs(struct super_block *, u64, u64, u64, unsigned *);
 
 /* anode.c */
 
@@ -304,7 +307,7 @@ extern const struct address_space_operations hpfs_symlink_aops;
 
 static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
 {
-	return list_entry(inode, struct hpfs_inode_info, vfs_inode);
+	return container_of(inode, struct hpfs_inode_info, vfs_inode);
 }
 
 static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
@@ -318,6 +321,7 @@ __printf(2, 3)
 void hpfs_error(struct super_block *, const char *, ...);
 int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
 unsigned hpfs_get_free_dnodes(struct super_block *);
+long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
 /*
  * local time (HPFS) to GMT (Unix)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 8685c6557..68a9bed05 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -199,12 +199,39 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+
+long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	switch (cmd) {
+		case FITRIM: {
+			struct fstrim_range range;
+			secno n_trimmed;
+			int r;
+			if (!capable(CAP_SYS_ADMIN))
+				return -EPERM;
+			if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range)))
+				return -EFAULT;
+			r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed);
+			if (r)
+				return r;
+			range.len = (u64)n_trimmed << 9;
+			if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range)))
+				return -EFAULT;
+			return 0;
+		}
+		default: {
+			return -ENOIOCTLCMD;
+		}
+	}
+}
+
+
 static struct kmem_cache * hpfs_inode_cachep;
 
 static struct inode *hpfs_alloc_inode(struct super_block *sb)
 {
 	struct hpfs_inode_info *ei;
-	ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
+	ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
 	ei->vfs_inode.i_version = 1;
diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile
deleted file mode 100644
index 3a982bd97..000000000
--- a/fs/hppfs/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
-# Licensed under the GPL
-#
-
-obj-$(CONFIG_HPPFS) += hppfs.o
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
deleted file mode 100644
index fa2bd5366..000000000
--- a/fs/hppfs/hppfs.c
+++ /dev/null
@@ -1,766 +0,0 @@
-/*
- * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/ctype.h>
-#include <linux/dcache.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/statfs.h>
-#include <linux/types.h>
-#include <linux/pid_namespace.h>
-#include <linux/namei.h>
-#include <asm/uaccess.h>
-#include <os.h>
-
-static struct inode *get_inode(struct super_block *, struct dentry *);
-
-struct hppfs_data {
-	struct list_head list;
-	char contents[PAGE_SIZE - sizeof(struct list_head)];
-};
-
-struct hppfs_private {
-	struct file *proc_file;
-	int host_fd;
-	loff_t len;
-	struct hppfs_data *contents;
-};
-
-struct hppfs_inode_info {
-	struct dentry *proc_dentry;
-	struct inode vfs_inode;
-};
-
-static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
-{
-	return container_of(inode, struct hppfs_inode_info, vfs_inode);
-}
-
-#define HPPFS_SUPER_MAGIC 0xb00000ee
-
-static const struct super_operations hppfs_sbops;
-
-static int is_pid(struct dentry *dentry)
-{
-	struct super_block *sb;
-	int i;
-
-	sb = dentry->d_sb;
-	if (dentry->d_parent != sb->s_root)
-		return 0;
-
-	for (i = 0; i < dentry->d_name.len; i++) {
-		if (!isdigit(dentry->d_name.name[i]))
-			return 0;
-	}
-	return 1;
-}
-
-static char *dentry_name(struct dentry *dentry, int extra)
-{
-	struct dentry *parent;
-	char *root, *name;
-	const char *seg_name;
-	int len, seg_len, root_len;
-
-	len = 0;
-	parent = dentry;
-	while (parent->d_parent != parent) {
-		if (is_pid(parent))
-			len += strlen("pid") + 1;
-		else len += parent->d_name.len + 1;
-		parent = parent->d_parent;
-	}
-
-	root = "proc";
-	root_len = strlen(root);
-	len += root_len;
-	name = kmalloc(len + extra + 1, GFP_KERNEL);
-	if (name == NULL)
-		return NULL;
-
-	name[len] = '\0';
-	parent = dentry;
-	while (parent->d_parent != parent) {
-		if (is_pid(parent)) {
-			seg_name = "pid";
-			seg_len = strlen(seg_name);
-		}
-		else {
-			seg_name = parent->d_name.name;
-			seg_len = parent->d_name.len;
-		}
-
-		len -= seg_len + 1;
-		name[len] = '/';
-		memcpy(&name[len + 1], seg_name, seg_len);
-		parent = parent->d_parent;
-	}
-	memcpy(name, root, root_len);
-	return name;
-}
-
-static int file_removed(struct dentry *dentry, const char *file)
-{
-	char *host_file;
-	int extra, fd;
-
-	extra = 0;
-	if (file != NULL)
-		extra += strlen(file) + 1;
-
-	host_file = dentry_name(dentry, extra + strlen("/remove"));
-	if (host_file == NULL) {
-		printk(KERN_ERR "file_removed : allocation failed\n");
-		return -ENOMEM;
-	}
-
-	if (file != NULL) {
-		strcat(host_file, "/");
-		strcat(host_file, file);
-	}
-	strcat(host_file, "/remove");
-
-	fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
-	kfree(host_file);
-	if (fd > 0) {
-		os_close_file(fd);
-		return 1;
-	}
-	return 0;
-}
-
-static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
-				   unsigned int flags)
-{
-	struct dentry *proc_dentry, *parent;
-	struct qstr *name = &dentry->d_name;
-	struct inode *inode;
-	int err, deleted;
-
-	deleted = file_removed(dentry, NULL);
-	if (deleted < 0)
-		return ERR_PTR(deleted);
-	else if (deleted)
-		return ERR_PTR(-ENOENT);
-
-	parent = HPPFS_I(ino)->proc_dentry;
-	mutex_lock(&d_inode(parent)->i_mutex);
-	proc_dentry = lookup_one_len(name->name, parent, name->len);
-	mutex_unlock(&d_inode(parent)->i_mutex);
-
-	if (IS_ERR(proc_dentry))
-		return proc_dentry;
-
-	err = -ENOMEM;
-	inode = get_inode(ino->i_sb, proc_dentry);
-	if (!inode)
-		goto out;
-
- 	d_add(dentry, inode);
-	return NULL;
-
- out:
-	return ERR_PTR(err);
-}
-
-static const struct inode_operations hppfs_file_iops = {
-};
-
-static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
-			 loff_t *ppos, int is_user)
-{
-	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
-	ssize_t n;
-
-	read = file_inode(file)->i_fop->read;
-
-	if (!is_user)
-		set_fs(KERNEL_DS);
-
-	n = (*read)(file, buf, count, &file->f_pos);
-
-	if (!is_user)
-		set_fs(USER_DS);
-
-	if (ppos)
-		*ppos = file->f_pos;
-	return n;
-}
-
-static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count)
-{
-	ssize_t n;
-	int cur, err;
-	char *new_buf;
-
-	n = -ENOMEM;
-	new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (new_buf == NULL) {
-		printk(KERN_ERR "hppfs_read_file : kmalloc failed\n");
-		goto out;
-	}
-	n = 0;
-	while (count > 0) {
-		cur = min_t(ssize_t, count, PAGE_SIZE);
-		err = os_read_file(fd, new_buf, cur);
-		if (err < 0) {
-			printk(KERN_ERR "hppfs_read : read failed, "
-			       "errno = %d\n", err);
-			n = err;
-			goto out_free;
-		} else if (err == 0)
-			break;
-
-		if (copy_to_user(buf, new_buf, err)) {
-			n = -EFAULT;
-			goto out_free;
-		}
-		n += err;
-		count -= err;
-	}
- out_free:
-	kfree(new_buf);
- out:
-	return n;
-}
-
-static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
-			  loff_t *ppos)
-{
-	struct hppfs_private *hppfs = file->private_data;
-	struct hppfs_data *data;
-	loff_t off;
-	int err;
-
-	if (hppfs->contents != NULL) {
-		int rem;
-
-		if (*ppos >= hppfs->len)
-			return 0;
-
-		data = hppfs->contents;
-		off = *ppos;
-		while (off >= sizeof(data->contents)) {
-			data = list_entry(data->list.next, struct hppfs_data,
-					  list);
-			off -= sizeof(data->contents);
-		}
-
-		if (off + count > hppfs->len)
-			count = hppfs->len - off;
-		rem = copy_to_user(buf, &data->contents[off], count);
-		*ppos += count - rem;
-		if (rem > 0)
-			return -EFAULT;
-	} else if (hppfs->host_fd != -1) {
-		err = os_seek_file(hppfs->host_fd, *ppos);
-		if (err) {
-			printk(KERN_ERR "hppfs_read : seek failed, "
-			       "errno = %d\n", err);
-			return err;
-		}
-		err = hppfs_read_file(hppfs->host_fd, buf, count);
-		if (err < 0) {
-			printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
-			return err;
-		}
-		count = err;
-		if (count > 0)
-			*ppos += count;
-	}
-	else count = read_proc(hppfs->proc_file, buf, count, ppos, 1);
-
-	return count;
-}
-
-static ssize_t hppfs_write(struct file *file, const char __user *buf,
-			   size_t len, loff_t *ppos)
-{
-	struct hppfs_private *data = file->private_data;
-	struct file *proc_file = data->proc_file;
-	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
-
-	write = file_inode(proc_file)->i_fop->write;
-	return (*write)(proc_file, buf, len, ppos);
-}
-
-static int open_host_sock(char *host_file, int *filter_out)
-{
-	char *end;
-	int fd;
-
-	end = &host_file[strlen(host_file)];
-	strcpy(end, "/rw");
-	*filter_out = 1;
-	fd = os_connect_socket(host_file);
-	if (fd > 0)
-		return fd;
-
-	strcpy(end, "/r");
-	*filter_out = 0;
-	fd = os_connect_socket(host_file);
-	return fd;
-}
-
-static void free_contents(struct hppfs_data *head)
-{
-	struct hppfs_data *data;
-	struct list_head *ele, *next;
-
-	if (head == NULL)
-		return;
-
-	list_for_each_safe(ele, next, &head->list) {
-		data = list_entry(ele, struct hppfs_data, list);
-		kfree(data);
-	}
-	kfree(head);
-}
-
-static struct hppfs_data *hppfs_get_data(int fd, int filter,
-					 struct file *proc_file,
-					 struct file *hppfs_file,
-					 loff_t *size_out)
-{
-	struct hppfs_data *data, *new, *head;
-	int n, err;
-
-	err = -ENOMEM;
-	data = kmalloc(sizeof(*data), GFP_KERNEL);
-	if (data == NULL) {
-		printk(KERN_ERR "hppfs_get_data : head allocation failed\n");
-		goto failed;
-	}
-
-	INIT_LIST_HEAD(&data->list);
-
-	head = data;
-	*size_out = 0;
-
-	if (filter) {
-		while ((n = read_proc(proc_file, data->contents,
-				      sizeof(data->contents), NULL, 0)) > 0)
-			os_write_file(fd, data->contents, n);
-		err = os_shutdown_socket(fd, 0, 1);
-		if (err) {
-			printk(KERN_ERR "hppfs_get_data : failed to shut down "
-			       "socket\n");
-			goto failed_free;
-		}
-	}
-	while (1) {
-		n = os_read_file(fd, data->contents, sizeof(data->contents));
-		if (n < 0) {
-			err = n;
-			printk(KERN_ERR "hppfs_get_data : read failed, "
-			       "errno = %d\n", err);
-			goto failed_free;
-		} else if (n == 0)
-			break;
-
-		*size_out += n;
-
-		if (n < sizeof(data->contents))
-			break;
-
-		new = kmalloc(sizeof(*data), GFP_KERNEL);
-		if (new == 0) {
-			printk(KERN_ERR "hppfs_get_data : data allocation "
-			       "failed\n");
-			err = -ENOMEM;
-			goto failed_free;
-		}
-
-		INIT_LIST_HEAD(&new->list);
-		list_add(&new->list, &data->list);
-		data = new;
-	}
-	return head;
-
- failed_free:
-	free_contents(head);
- failed:
-	return ERR_PTR(err);
-}
-
-static struct hppfs_private *hppfs_data(void)
-{
-	struct hppfs_private *data;
-
-	data = kmalloc(sizeof(*data), GFP_KERNEL);
-	if (data == NULL)
-		return data;
-
-	*data = ((struct hppfs_private ) { .host_fd  		= -1,
-					   .len  		= -1,
-					   .contents 		= NULL } );
-	return data;
-}
-
-static int file_mode(int fmode)
-{
-	if (fmode == (FMODE_READ | FMODE_WRITE))
-		return O_RDWR;
-	if (fmode == FMODE_READ)
-		return O_RDONLY;
-	if (fmode == FMODE_WRITE)
-		return O_WRONLY;
-	return 0;
-}
-
-static int hppfs_open(struct inode *inode, struct file *file)
-{
-	const struct cred *cred = file->f_cred;
-	struct hppfs_private *data;
-	struct path path;
-	char *host_file;
-	int err, fd, type, filter;
-
-	err = -ENOMEM;
-	data = hppfs_data();
-	if (data == NULL)
-		goto out;
-
-	host_file = dentry_name(file->f_path.dentry, strlen("/rw"));
-	if (host_file == NULL)
-		goto out_free2;
-
-	path.mnt = inode->i_sb->s_fs_info;
-	path.dentry = HPPFS_I(inode)->proc_dentry;
-
-	data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
-	err = PTR_ERR(data->proc_file);
-	if (IS_ERR(data->proc_file))
-		goto out_free1;
-
-	type = os_file_type(host_file);
-	if (type == OS_TYPE_FILE) {
-		fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
-		if (fd >= 0)
-			data->host_fd = fd;
-		else
-			printk(KERN_ERR "hppfs_open : failed to open '%s', "
-			       "errno = %d\n", host_file, -fd);
-
-		data->contents = NULL;
-	} else if (type == OS_TYPE_DIR) {
-		fd = open_host_sock(host_file, &filter);
-		if (fd > 0) {
-			data->contents = hppfs_get_data(fd, filter,
-							data->proc_file,
-							file, &data->len);
-			if (!IS_ERR(data->contents))
-				data->host_fd = fd;
-		} else
-			printk(KERN_ERR "hppfs_open : failed to open a socket "
-			       "in '%s', errno = %d\n", host_file, -fd);
-	}
-	kfree(host_file);
-
-	file->private_data = data;
-	return 0;
-
- out_free1:
-	kfree(host_file);
- out_free2:
-	free_contents(data->contents);
-	kfree(data);
- out:
-	return err;
-}
-
-static int hppfs_dir_open(struct inode *inode, struct file *file)
-{
-	const struct cred *cred = file->f_cred;
-	struct hppfs_private *data;
-	struct path path;
-	int err;
-
-	err = -ENOMEM;
-	data = hppfs_data();
-	if (data == NULL)
-		goto out;
-
-	path.mnt = inode->i_sb->s_fs_info;
-	path.dentry = HPPFS_I(inode)->proc_dentry;
-	data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
-	err = PTR_ERR(data->proc_file);
-	if (IS_ERR(data->proc_file))
-		goto out_free;
-
-	file->private_data = data;
-	return 0;
-
- out_free:
-	kfree(data);
- out:
-	return err;
-}
-
-static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
-{
-	struct hppfs_private *data = file->private_data;
-	struct file *proc_file = data->proc_file;
-	loff_t (*llseek)(struct file *, loff_t, int);
-	loff_t ret;
-
-	llseek = file_inode(proc_file)->i_fop->llseek;
-	if (llseek != NULL) {
-		ret = (*llseek)(proc_file, off, where);
-		if (ret < 0)
-			return ret;
-	}
-
-	return default_llseek(file, off, where);
-}
-
-static int hppfs_release(struct inode *inode, struct file *file)
-{
-	struct hppfs_private *data = file->private_data;
-	struct file *proc_file = data->proc_file;
-	if (proc_file)
-		fput(proc_file);
-	kfree(data);
-	return 0;
-}
-
-static const struct file_operations hppfs_file_fops = {
-	.owner		= NULL,
-	.llseek		= hppfs_llseek,
-	.read		= hppfs_read,
-	.write		= hppfs_write,
-	.open		= hppfs_open,
-	.release	= hppfs_release,
-};
-
-struct hppfs_dirent {
-	struct dir_context ctx;
-	struct dir_context *caller;
-	struct dentry *dentry;
-};
-
-static int hppfs_filldir(struct dir_context *ctx, const char *name, int size,
-			 loff_t offset, u64 inode, unsigned int type)
-{
-	struct hppfs_dirent *dirent =
-		container_of(ctx, struct hppfs_dirent, ctx);
-
-	if (file_removed(dirent->dentry, name))
-		return 0;
-
-	dirent->caller->pos = dirent->ctx.pos;
-	return !dir_emit(dirent->caller, name, size, inode, type);
-}
-
-static int hppfs_readdir(struct file *file, struct dir_context *ctx)
-{
-	struct hppfs_private *data = file->private_data;
-	struct file *proc_file = data->proc_file;
-	struct hppfs_dirent d = {
-		.ctx.actor	= hppfs_filldir,
-		.caller		= ctx,
-		.dentry  	= file->f_path.dentry
-	};
-	int err;
-	proc_file->f_pos = ctx->pos;
-	err = iterate_dir(proc_file, &d.ctx);
-	ctx->pos = d.ctx.pos;
-	return err;
-}
-
-static const struct file_operations hppfs_dir_fops = {
-	.owner		= NULL,
-	.iterate	= hppfs_readdir,
-	.open		= hppfs_dir_open,
-	.llseek		= default_llseek,
-	.release	= hppfs_release,
-};
-
-static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
-{
-	sf->f_blocks = 0;
-	sf->f_bfree = 0;
-	sf->f_bavail = 0;
-	sf->f_files = 0;
-	sf->f_ffree = 0;
-	sf->f_type = HPPFS_SUPER_MAGIC;
-	return 0;
-}
-
-static struct inode *hppfs_alloc_inode(struct super_block *sb)
-{
-	struct hppfs_inode_info *hi;
-
-	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
-	if (!hi)
-		return NULL;
-
-	hi->proc_dentry = NULL;
-	inode_init_once(&hi->vfs_inode);
-	return &hi->vfs_inode;
-}
-
-void hppfs_evict_inode(struct inode *ino)
-{
-	clear_inode(ino);
-	dput(HPPFS_I(ino)->proc_dentry);
-	mntput(ino->i_sb->s_fs_info);
-}
-
-static void hppfs_i_callback(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kfree(HPPFS_I(inode));
-}
-
-static void hppfs_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, hppfs_i_callback);
-}
-
-static const struct super_operations hppfs_sbops = {
-	.alloc_inode	= hppfs_alloc_inode,
-	.destroy_inode	= hppfs_destroy_inode,
-	.evict_inode	= hppfs_evict_inode,
-	.statfs		= hppfs_statfs,
-};
-
-static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
-			  int buflen)
-{
-	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
-	return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer,
-						    buflen);
-}
-
-static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
-
-	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd);
-}
-
-static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			   void *cookie)
-{
-	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
-
-	if (d_inode(proc_dentry)->i_op->put_link)
-		d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie);
-}
-
-static const struct inode_operations hppfs_dir_iops = {
-	.lookup		= hppfs_lookup,
-};
-
-static const struct inode_operations hppfs_link_iops = {
-	.readlink	= hppfs_readlink,
-	.follow_link	= hppfs_follow_link,
-	.put_link	= hppfs_put_link,
-};
-
-static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
-{
-	struct inode *proc_ino = d_inode(dentry);
-	struct inode *inode = new_inode(sb);
-
-	if (!inode) {
-		dput(dentry);
-		return NULL;
-	}
-
-	if (d_is_dir(dentry)) {
-		inode->i_op = &hppfs_dir_iops;
-		inode->i_fop = &hppfs_dir_fops;
-	} else if (d_is_symlink(dentry)) {
-		inode->i_op = &hppfs_link_iops;
-		inode->i_fop = &hppfs_file_fops;
-	} else {
-		inode->i_op = &hppfs_file_iops;
-		inode->i_fop = &hppfs_file_fops;
-	}
-
-	HPPFS_I(inode)->proc_dentry = dentry;
-
-	inode->i_uid = proc_ino->i_uid;
-	inode->i_gid = proc_ino->i_gid;
-	inode->i_atime = proc_ino->i_atime;
-	inode->i_mtime = proc_ino->i_mtime;
-	inode->i_ctime = proc_ino->i_ctime;
-	inode->i_ino = proc_ino->i_ino;
-	inode->i_mode = proc_ino->i_mode;
-	set_nlink(inode, proc_ino->i_nlink);
-	inode->i_size = proc_ino->i_size;
-	inode->i_blocks = proc_ino->i_blocks;
-
-	return inode;
-}
-
-static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
-{
-	struct inode *root_inode;
-	struct vfsmount *proc_mnt;
-	int err = -ENOENT;
-
-	proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
-	if (IS_ERR(proc_mnt))
-		goto out;
-
-	sb->s_blocksize = 1024;
-	sb->s_blocksize_bits = 10;
-	sb->s_magic = HPPFS_SUPER_MAGIC;
-	sb->s_op = &hppfs_sbops;
-	sb->s_fs_info = proc_mnt;
-
-	err = -ENOMEM;
-	root_inode = get_inode(sb, dget(proc_mnt->mnt_root));
-	sb->s_root = d_make_root(root_inode);
-	if (!sb->s_root)
-		goto out_mntput;
-
-	return 0;
-
- out_mntput:
-	mntput(proc_mnt);
- out:
-	return(err);
-}
-
-static struct dentry *hppfs_read_super(struct file_system_type *type,
-			    int flags, const char *dev_name,
-			    void *data)
-{
-	return mount_nodev(type, flags, data, hppfs_fill_super);
-}
-
-static struct file_system_type hppfs_type = {
-	.owner 		= THIS_MODULE,
-	.name 		= "hppfs",
-	.mount 		= hppfs_read_super,
-	.kill_sb	= kill_anon_super,
-	.fs_flags 	= 0,
-};
-MODULE_ALIAS_FS("hppfs");
-
-static int __init init_hppfs(void)
-{
-	return register_filesystem(&hppfs_type);
-}
-
-static void __exit exit_hppfs(void)
-{
-	unregister_filesystem(&hppfs_type);
-}
-
-module_init(init_hppfs)
-module_exit(exit_hppfs)
-MODULE_LICENSE("GPL");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87724c1d7..973c24ce5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out;
 
 	ret = 0;
-	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
 		inode->i_size = len;
 out:
@@ -1011,6 +1010,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
 		goto out_dentry;
+	if (creat_flags == HUGETLB_SHMFS_INODE)
+		inode->i_flags |= S_PRIVATE;
 
 	file = ERR_PTR(-ENOMEM);
 	if (hugetlb_reserve_pages(inode, 0,
diff --git a/fs/inode.c b/fs/inode.c
index 205733701..d30640f7a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -58,7 +58,6 @@ static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
-EXPORT_SYMBOL(inode_sb_list_lock);
 
 /*
  * Empty aops. Can be used for the cases where the user does not
@@ -153,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_pipe = NULL;
 	inode->i_bdev = NULL;
 	inode->i_cdev = NULL;
+	inode->i_link = NULL;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
 
@@ -224,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	inode_detach_wb(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 	locks_free_lock_context(inode->i_flctx);
@@ -831,8 +832,6 @@ unsigned int get_next_ino(void)
 	unsigned int *p = &get_cpu_var(last_ino);
 	unsigned int res = *p;
 
-start:
-
 #ifdef CONFIG_SMP
 	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
 		static atomic_t shared_last_ino;
@@ -842,8 +841,10 @@ start:
 	}
 #endif
 
-	if (unlikely(!++res))
-		goto start;	/* never zero */
+	res++;
+	/* get_next_ino should not provide a 0 inode number */
+	if (unlikely(!res))
+		res++;
 	*p = res;
 	put_cpu_var(last_ino);
 	return res;
@@ -1589,36 +1590,47 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
  *	This function automatically handles read only file systems and media,
  *	as well as the "noatime" flag and inode specific "noatime" markers.
  */
-void touch_atime(const struct path *path)
+bool atime_needs_update(const struct path *path, struct inode *inode)
 {
 	struct vfsmount *mnt = path->mnt;
-	struct inode *inode = d_inode(path->dentry);
 	struct timespec now;
 
 	if (inode->i_flags & S_NOATIME)
-		return;
+		return false;
 	if (IS_NOATIME(inode))
-		return;
+		return false;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-		return;
+		return false;
 
 	if (mnt->mnt_flags & MNT_NOATIME)
-		return;
+		return false;
 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-		return;
+		return false;
 
 	now = current_fs_time(inode->i_sb);
 
 	if (!relatime_need_update(mnt, inode, now))
-		return;
+		return false;
 
 	if (timespec_equal(&inode->i_atime, &now))
+		return false;
+
+	return true;
+}
+
+void touch_atime(const struct path *path)
+{
+	struct vfsmount *mnt = path->mnt;
+	struct inode *inode = d_inode(path->dentry);
+	struct timespec now;
+
+	if (!atime_needs_update(path, inode))
 		return;
 
 	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	if (__mnt_want_write(mnt))
+	if (__mnt_want_write(mnt) != 0)
 		goto skip_update;
 	/*
 	 * File systems can error out when updating inodes if they need to
@@ -1629,6 +1641,7 @@ void touch_atime(const struct path *path)
 	 * We may also fail on filesystems that have the ability to make parts
 	 * of the fs read only, e.g. subvolumes in Btrfs.
 	 */
+	now = current_fs_time(inode->i_sb);
 	update_time(inode, &now, S_ATIME);
 	__mnt_drop_write(mnt);
 skip_update:
@@ -1665,7 +1678,31 @@ int should_remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(should_remove_suid);
 
-static int __remove_suid(struct dentry *dentry, int kill)
+/*
+ * Return mask of changes for notify_change() that need to be done as a
+ * response to write or truncate. Return 0 if nothing has to be changed.
+ * Negative value on error (change should be denied).
+ */
+int dentry_needs_remove_privs(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	int mask = 0;
+	int ret;
+
+	if (IS_NOSEC(inode))
+		return 0;
+
+	mask = should_remove_suid(dentry);
+	ret = security_inode_need_killpriv(dentry);
+	if (ret < 0)
+		return ret;
+	if (ret)
+		mask |= ATTR_KILL_PRIV;
+	return mask;
+}
+EXPORT_SYMBOL(dentry_needs_remove_privs);
+
+static int __remove_privs(struct dentry *dentry, int kill)
 {
 	struct iattr newattrs;
 
@@ -1677,33 +1714,32 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs, NULL);
 }
 
-int file_remove_suid(struct file *file)
+/*
+ * Remove special file priviledges (suid, capabilities) when file is written
+ * to or truncated.
+ */
+int file_remove_privs(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
-	int killsuid;
-	int killpriv;
+	int kill;
 	int error = 0;
 
 	/* Fast path for nothing security related */
 	if (IS_NOSEC(inode))
 		return 0;
 
-	killsuid = should_remove_suid(dentry);
-	killpriv = security_inode_need_killpriv(dentry);
-
-	if (killpriv < 0)
-		return killpriv;
-	if (killpriv)
-		error = security_inode_killpriv(dentry);
-	if (!error && killsuid)
-		error = __remove_suid(dentry, killsuid);
+	kill = file_needs_remove_privs(file);
+	if (kill < 0)
+		return kill;
+	if (kill)
+		error = __remove_privs(dentry, kill);
 	if (!error)
 		inode_has_no_xattr(inode);
 
 	return error;
 }
-EXPORT_SYMBOL(file_remove_suid);
+EXPORT_SYMBOL(file_remove_privs);
 
 /**
  *	file_update_time	-	update mtime and ctime time
@@ -1958,9 +1994,8 @@ EXPORT_SYMBOL(inode_dio_wait);
  * inode is being instantiated).  The reason for the cmpxchg() loop
  * --- which wouldn't be necessary if all code paths which modify
  * i_flags actually followed this rule, is that there is at least one
- * code path which doesn't today --- for example,
- * __generic_file_aio_write() calls file_remove_suid() without holding
- * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ * code path which doesn't today so we use cmpxchg() out of an abundance
+ * of caution.
  *
  * In the long run, i_mutex is overkill, and we should probably look
  * at using the i_lock spinlock to protect i_flags, and then make sure
diff --git a/fs/internal.h b/fs/internal.h
index 01dce1d14..4d5af583a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 extern long do_handle_open(int mountdirfd,
 			   struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);
+extern int vfs_open(const struct path *, struct file *, const struct cred *);
 
 /*
  * inode.c
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 112fad9e1..4ff3fad4e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	 */
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
-retry_alloc:
-	new_bh = alloc_buffer_head(GFP_NOFS);
-	if (!new_bh) {
-		/*
-		 * Failure is not an option, but __GFP_NOFAIL is going
-		 * away; so we retry ourselves here.
-		 */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
-		goto retry_alloc;
-	}
+	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 
 	/* keep subsequent assertions sane */
 	atomic_set(&new_bh->b_count, 1);
@@ -1144,7 +1135,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 {
 	journal_t *journal = journal_init_common();
 	struct buffer_head *bh;
-	char *p;
 	int n;
 
 	if (!journal)
@@ -1157,9 +1147,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
 	bdevname(journal->j_dev, journal->j_devname);
-	p = journal->j_devname;
-	while ((p = strchr(p, '/')))
-		*p = '!';
+	strreplace(journal->j_devname, '/', '!');
 	jbd2_stats_proc_init(journal);
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
 	journal->j_wbufsize = n;
@@ -1211,10 +1199,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
 	bdevname(journal->j_dev, journal->j_devname);
-	p = journal->j_devname;
-	while ((p = strchr(p, '/')))
-		*p = '!';
-	p = journal->j_devname + strlen(journal->j_devname);
+	p = strreplace(journal->j_devname, '/', '!');
 	sprintf(p, "-%lu", journal->j_inode->i_ino);
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
@@ -2354,7 +2339,7 @@ static int jbd2_journal_init_journal_head_cache(void)
 	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
 				sizeof(struct journal_head),
 				0,		/* offset */
-				SLAB_TEMPORARY,	/* flags */
+				SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
 				NULL);		/* ctor */
 	retval = 0;
 	if (!jbd2_journal_head_cache) {
@@ -2386,10 +2371,8 @@ static struct journal_head *journal_alloc_journal_head(void)
 	if (!ret) {
 		jbd_debug(1, "out of memory for journal_head\n");
 		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
-		while (!ret) {
-			yield();
-			ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
-		}
+		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
+				GFP_NOFS | __GFP_NOFAIL);
 	}
 	return ret;
 }
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 14214da80..0abf2e7f7 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
 {
 	struct list_head *hash_list;
 	struct jbd2_revoke_record_s *record;
+	gfp_t gfp_mask = GFP_NOFS;
 
-repeat:
-	record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+	if (journal_oom_retry)
+		gfp_mask |= __GFP_NOFAIL;
+	record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask);
 	if (!record)
-		goto oom;
+		return -ENOMEM;
 
 	record->sequence = seq;
 	record->blocknr = blocknr;
@@ -154,13 +156,6 @@ repeat:
 	list_add(&record->hash, hash_list);
 	spin_unlock(&journal->j_revoke_lock);
 	return 0;
-
-oom:
-	if (!journal_oom_retry)
-		return -ENOMEM;
-	jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
-	yield();
-	goto repeat;
 }
 
 /* Find a revoke record in the journal's hash table. */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ff2f2e6ad..f3d06174b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 
 alloc_transaction:
 	if (!journal->j_running_transaction) {
+		/*
+		 * If __GFP_FS is not present, then we may be being called from
+		 * inside the fs writeback layer, so we MUST NOT fail.
+		 */
+		if ((gfp_mask & __GFP_FS) == 0)
+			gfp_mask |= __GFP_NOFAIL;
 		new_transaction = kmem_cache_zalloc(transaction_cache,
 						    gfp_mask);
-		if (!new_transaction) {
-			/*
-			 * If __GFP_FS is not present, then we may be
-			 * being called from inside the fs writeback
-			 * layer, so we MUST NOT fail.  Since
-			 * __GFP_NOFAIL is going away, we will arrange
-			 * to retry the allocation ourselves.
-			 */
-			if ((gfp_mask & __GFP_FS) == 0) {
-				congestion_wait(BLK_RW_ASYNC, HZ/50);
-				goto alloc_transaction;
-			}
+		if (!new_transaction)
 			return -ENOMEM;
-		}
 	}
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
@@ -761,6 +755,30 @@ static void warn_dirty_buffer(struct buffer_head *bh)
 	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
+static void jbd2_freeze_jh_data(struct journal_head *jh)
+{
+	struct page *page;
+	int offset;
+	char *source;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
+	page = bh->b_page;
+	offset = offset_in_page(bh->b_data);
+	source = kmap_atomic(page);
+	/* Fire data frozen trigger just before we copy the data */
+	jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
+	memcpy(jh->b_frozen_data, source + offset, bh->b_size);
+	kunmap_atomic(source);
+
+	/*
+	 * Now that the frozen data is saved off, we need to store any matching
+	 * triggers.
+	 */
+	jh->b_frozen_triggers = jh->b_triggers;
+}
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do.  If it is already part of a prior
@@ -780,7 +798,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	journal_t *journal;
 	int error;
 	char *frozen_buffer = NULL;
-	int need_copy = 0;
 	unsigned long start_lock, time_lock;
 
 	if (is_handle_aborted(handle))
@@ -867,119 +884,96 @@ repeat:
        jh->b_modified = 0;
 
 	/*
+	 * If the buffer is not journaled right now, we need to make sure it
+	 * doesn't get written to disk before the caller actually commits the
+	 * new data
+	 */
+	if (!jh->b_transaction) {
+		JBUFFER_TRACE(jh, "no transaction");
+		J_ASSERT_JH(jh, !jh->b_next_transaction);
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		/*
+		 * Make sure all stores to jh (b_modified, b_frozen_data) are
+		 * visible before attaching it to the running transaction.
+		 * Paired with barrier in jbd2_write_access_granted()
+		 */
+		smp_wmb();
+		spin_lock(&journal->j_list_lock);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
+		goto done;
+	}
+	/*
 	 * If there is already a copy-out version of this buffer, then we don't
 	 * need to make another one
 	 */
 	if (jh->b_frozen_data) {
 		JBUFFER_TRACE(jh, "has frozen data");
 		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-		jh->b_next_transaction = transaction;
-		goto done;
+		goto attach_next;
 	}
 
-	/* Is there data here we need to preserve? */
+	JBUFFER_TRACE(jh, "owned by older transaction");
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
 
-	if (jh->b_transaction && jh->b_transaction != transaction) {
-		JBUFFER_TRACE(jh, "owned by older transaction");
-		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-		J_ASSERT_JH(jh, jh->b_transaction ==
-					journal->j_committing_transaction);
+	/*
+	 * There is one case we have to be very careful about.  If the
+	 * committing transaction is currently writing this buffer out to disk
+	 * and has NOT made a copy-out, then we cannot modify the buffer
+	 * contents at all right now.  The essence of copy-out is that it is
+	 * the extra copy, not the primary copy, which gets journaled.  If the
+	 * primary copy is already going to disk then we cannot do copy-out
+	 * here.
+	 */
+	if (buffer_shadow(bh)) {
+		JBUFFER_TRACE(jh, "on shadow: sleep");
+		jbd_unlock_bh_state(bh);
+		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
+		goto repeat;
+	}
 
-		/* There is one case we have to be very careful about.
-		 * If the committing transaction is currently writing
-		 * this buffer out to disk and has NOT made a copy-out,
-		 * then we cannot modify the buffer contents at all
-		 * right now.  The essence of copy-out is that it is the
-		 * extra copy, not the primary copy, which gets
-		 * journaled.  If the primary copy is already going to
-		 * disk then we cannot do copy-out here. */
-
-		if (buffer_shadow(bh)) {
-			JBUFFER_TRACE(jh, "on shadow: sleep");
+	/*
+	 * Only do the copy if the currently-owning transaction still needs it.
+	 * If buffer isn't on BJ_Metadata list, the committing transaction is
+	 * past that stage (here we use the fact that BH_Shadow is set under
+	 * bh_state lock together with refiling to BJ_Shadow list and at this
+	 * point we know the buffer doesn't have BH_Shadow set).
+	 *
+	 * Subtle point, though: if this is a get_undo_access, then we will be
+	 * relying on the frozen_data to contain the new value of the
+	 * committed_data record after the transaction, so we HAVE to force the
+	 * frozen_data copy in that case.
+	 */
+	if (jh->b_jlist == BJ_Metadata || force_copy) {
+		JBUFFER_TRACE(jh, "generate frozen data");
+		if (!frozen_buffer) {
+			JBUFFER_TRACE(jh, "allocate memory for buffer");
 			jbd_unlock_bh_state(bh);
-			wait_on_bit_io(&bh->b_state, BH_Shadow,
-				       TASK_UNINTERRUPTIBLE);
-			goto repeat;
-		}
-
-		/*
-		 * Only do the copy if the currently-owning transaction still
-		 * needs it. If buffer isn't on BJ_Metadata list, the
-		 * committing transaction is past that stage (here we use the
-		 * fact that BH_Shadow is set under bh_state lock together with
-		 * refiling to BJ_Shadow list and at this point we know the
-		 * buffer doesn't have BH_Shadow set).
-		 *
-		 * Subtle point, though: if this is a get_undo_access,
-		 * then we will be relying on the frozen_data to contain
-		 * the new value of the committed_data record after the
-		 * transaction, so we HAVE to force the frozen_data copy
-		 * in that case.
-		 */
-		if (jh->b_jlist == BJ_Metadata || force_copy) {
-			JBUFFER_TRACE(jh, "generate frozen data");
+			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 			if (!frozen_buffer) {
-				JBUFFER_TRACE(jh, "allocate memory for buffer");
-				jbd_unlock_bh_state(bh);
-				frozen_buffer =
-					jbd2_alloc(jh2bh(jh)->b_size,
-							 GFP_NOFS);
-				if (!frozen_buffer) {
-					printk(KERN_ERR
-					       "%s: OOM for frozen_buffer\n",
-					       __func__);
-					JBUFFER_TRACE(jh, "oom!");
-					error = -ENOMEM;
-					jbd_lock_bh_state(bh);
-					goto done;
-				}
-				goto repeat;
+				printk(KERN_ERR "%s: OOM for frozen_buffer\n",
+				       __func__);
+				JBUFFER_TRACE(jh, "oom!");
+				error = -ENOMEM;
+				goto out;
 			}
-			jh->b_frozen_data = frozen_buffer;
-			frozen_buffer = NULL;
-			need_copy = 1;
+			goto repeat;
 		}
-		jh->b_next_transaction = transaction;
+		jh->b_frozen_data = frozen_buffer;
+		frozen_buffer = NULL;
+		jbd2_freeze_jh_data(jh);
 	}
-
-
+attach_next:
 	/*
-	 * Finally, if the buffer is not journaled right now, we need to make
-	 * sure it doesn't get written to disk before the caller actually
-	 * commits the new data
+	 * Make sure all stores to jh (b_modified, b_frozen_data) are visible
+	 * before attaching it to the running transaction. Paired with barrier
+	 * in jbd2_write_access_granted()
 	 */
-	if (!jh->b_transaction) {
-		JBUFFER_TRACE(jh, "no transaction");
-		J_ASSERT_JH(jh, !jh->b_next_transaction);
-		JBUFFER_TRACE(jh, "file as BJ_Reserved");
-		spin_lock(&journal->j_list_lock);
-		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
-		spin_unlock(&journal->j_list_lock);
-	}
+	smp_wmb();
+	jh->b_next_transaction = transaction;
 
 done:
-	if (need_copy) {
-		struct page *page;
-		int offset;
-		char *source;
-
-		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
-			    "Possible IO failure.\n");
-		page = jh2bh(jh)->b_page;
-		offset = offset_in_page(jh2bh(jh)->b_data);
-		source = kmap_atomic(page);
-		/* Fire data frozen trigger just before we copy the data */
-		jbd2_buffer_frozen_trigger(jh, source + offset,
-					   jh->b_triggers);
-		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-		kunmap_atomic(source);
-
-		/*
-		 * Now that the frozen data is saved off, we need to store
-		 * any matching triggers.
-		 */
-		jh->b_frozen_triggers = jh->b_triggers;
-	}
 	jbd_unlock_bh_state(bh);
 
 	/*
@@ -996,6 +990,55 @@ out:
 	return error;
 }
 
+/* Fast check whether buffer is already attached to the required transaction */
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
+{
+	struct journal_head *jh;
+	bool ret = false;
+
+	/* Dirty buffers require special handling... */
+	if (buffer_dirty(bh))
+		return false;
+
+	/*
+	 * RCU protects us from dereferencing freed pages. So the checks we do
+	 * are guaranteed not to oops. However the jh slab object can get freed
+	 * & reallocated while we work with it. So we have to be careful. When
+	 * we see jh attached to the running transaction, we know it must stay
+	 * so until the transaction is committed. Thus jh won't be freed and
+	 * will be attached to the same bh while we run.  However it can
+	 * happen jh gets freed, reallocated, and attached to the transaction
+	 * just after we get pointer to it from bh. So we have to be careful
+	 * and recheck jh still belongs to our bh before we return success.
+	 */
+	rcu_read_lock();
+	if (!buffer_jbd(bh))
+		goto out;
+	/* This should be bh2jh() but that doesn't work with inline functions */
+	jh = READ_ONCE(bh->b_private);
+	if (!jh)
+		goto out;
+	if (jh->b_transaction != handle->h_transaction &&
+	    jh->b_next_transaction != handle->h_transaction)
+		goto out;
+	/*
+	 * There are two reasons for the barrier here:
+	 * 1) Make sure to fetch b_bh after we did previous checks so that we
+	 * detect when jh went through free, realloc, attach to transaction
+	 * while we were checking. Paired with implicit barrier in that path.
+	 * 2) So that access to bh done after jbd2_write_access_granted()
+	 * doesn't get reordered and see inconsistent state of concurrent
+	 * do_get_write_access().
+	 */
+	smp_mb();
+	if (unlikely(jh->b_bh != bh))
+		goto out;
+	ret = true;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
 /**
  * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
  * @handle: transaction to add buffer modifications to
@@ -1009,9 +1052,13 @@ out:
 
 int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
 {
-	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	struct journal_head *jh;
 	int rc;
 
+	if (jbd2_write_access_granted(handle, bh))
+		return 0;
+
+	jh = jbd2_journal_add_journal_head(bh);
 	/* We do not want to get caught playing with fields which the
 	 * log thread also manipulates.  Make sure that the buffer
 	 * completes any outstanding IO before proceeding. */
@@ -1141,11 +1188,14 @@ out:
 int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 {
 	int err;
-	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	struct journal_head *jh;
 	char *committed_data = NULL;
 
 	JBUFFER_TRACE(jh, "entry");
+	if (jbd2_write_access_granted(handle, bh))
+		return 0;
 
+	jh = jbd2_journal_add_journal_head(bh);
 	/*
 	 * Do this first --- it can drop the journal lock, so we want to
 	 * make sure that obtaining the committed_data is done
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1ba5c9794..811800229 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -354,6 +354,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		ret = -ENOMEM;
 		goto fail;
 	}
+	inode->i_link = f->target;
 
 	jffs2_dbg(1, "%s(): symlink's target '%s' cached\n",
 		  __func__, (char *)f->target);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index fe5ea080b..2caf16820 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -272,12 +272,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
 	mutex_lock(&f->sem);
 
 	ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node);
+	if (ret)
+		goto error;
 
-	if (ret) {
-		mutex_unlock(&f->sem);
-		iget_failed(inode);
-		return ERR_PTR(ret);
-	}
 	inode->i_mode = jemode_to_cpu(latest_node.mode);
 	i_uid_write(inode, je16_to_cpu(latest_node.uid));
 	i_gid_write(inode, je16_to_cpu(latest_node.gid));
@@ -294,6 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
 
 	case S_IFLNK:
 		inode->i_op = &jffs2_symlink_inode_operations;
+		inode->i_link = f->target;
 		break;
 
 	case S_IFDIR:
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d200a9b8f..824e61ede 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -19,7 +19,7 @@
 struct kstatfs;
 struct kvec;
 
-#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode))
+#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode))
 #define OFNI_EDONI_2SFFJ(f)  (&(f)->vfs_inode)
 #define JFFS2_SB_INFO(sb) (sb->s_fs_info)
 #define OFNI_BS_2SFFJ(c)  ((struct super_block *)c->os_priv)
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index dddbde4f5..28e0aab42 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1203,17 +1203,13 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 		JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n",
 			ret, retlen, sizeof(*latest_node));
 		/* FIXME: If this fails, there seems to be a memory leak. Find it. */
-		mutex_unlock(&f->sem);
-		jffs2_do_clear_inode(c, f);
-		return ret?ret:-EIO;
+		return ret ? ret : -EIO;
 	}
 
 	crc = crc32(0, latest_node, sizeof(*latest_node)-8);
 	if (crc != je32_to_cpu(latest_node->node_crc)) {
 		JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n",
 			f->inocache->ino, ref_offset(rii.latest_ref));
-		mutex_unlock(&f->sem);
-		jffs2_do_clear_inode(c, f);
 		return -EIO;
 	}
 
@@ -1250,16 +1246,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 			 * keep in RAM to facilitate quick follow symlink
 			 * operation. */
 			uint32_t csize = je32_to_cpu(latest_node->csize);
-			if (csize > JFFS2_MAX_NAME_LEN) {
-				mutex_unlock(&f->sem);
-				jffs2_do_clear_inode(c, f);
+			if (csize > JFFS2_MAX_NAME_LEN)
 				return -ENAMETOOLONG;
-			}
 			f->target = kmalloc(csize + 1, GFP_KERNEL);
 			if (!f->target) {
 				JFFS2_ERROR("can't allocate %u bytes of memory for the symlink target path cache\n", csize);
-				mutex_unlock(&f->sem);
-				jffs2_do_clear_inode(c, f);
 				return -ENOMEM;
 			}
 
@@ -1271,8 +1262,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 					ret = -EIO;
 				kfree(f->target);
 				f->target = NULL;
-				mutex_unlock(&f->sem);
-				jffs2_do_clear_inode(c, f);
 				return ret;
 			}
 
@@ -1289,15 +1278,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 		if (f->metadata) {
 			JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n",
 			       f->inocache->ino, jemode_to_cpu(latest_node->mode));
-			mutex_unlock(&f->sem);
-			jffs2_do_clear_inode(c, f);
 			return -EIO;
 		}
 		if (!frag_first(&f->fragtree)) {
 			JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n",
 			       f->inocache->ino, jemode_to_cpu(latest_node->mode));
-			mutex_unlock(&f->sem);
-			jffs2_do_clear_inode(c, f);
 			return -EIO;
 		}
 		/* ASSERT: f->fraglist != NULL */
@@ -1305,8 +1290,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 			JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n",
 			       f->inocache->ino, jemode_to_cpu(latest_node->mode));
 			/* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */
-			mutex_unlock(&f->sem);
-			jffs2_do_clear_inode(c, f);
 			return -EIO;
 		}
 		/* OK. We're happy */
@@ -1400,10 +1383,8 @@ int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 	f->inocache = ic;
 
 	ret = jffs2_do_read_inode_internal(c, f, &n);
-	if (!ret) {
-		mutex_unlock(&f->sem);
-		jffs2_do_clear_inode(c, f);
-	}
+	mutex_unlock(&f->sem);
+	jffs2_do_clear_inode(c, f);
 	jffs2_xattr_do_crccheck_inode(c, ic);
 	kfree (f);
 	return ret;
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 1fefa25d0..8ce2f2401 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -9,58 +9,15 @@
  *
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/namei.h>
 #include "nodelist.h"
 
-static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd);
-
 const struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
-	.follow_link =	jffs2_follow_link,
+	.follow_link =	simple_follow_link,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
 	.listxattr =	jffs2_listxattr,
 	.removexattr =	jffs2_removexattr
 };
-
-static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry));
-	char *p = (char *)f->target;
-
-	/*
-	 * We don't acquire the f->sem mutex here since the only data we
-	 * use is f->target.
-	 *
-	 * 1. If we are here the inode has already built and f->target has
-	 * to point to the target path.
-	 * 2. Nobody uses f->target (if the inode is symlink's inode). The
-	 * exception is inode freeing function which frees f->target. But
-	 * it can't be called while we are here and before VFS has
-	 * stopped using our f->target string which we provide by means of
-	 * nd_set_link() call.
-	 */
-
-	if (!p) {
-		pr_err("%s(): can't find symlink target\n", __func__);
-		p = ERR_PTR(-EIO);
-	}
-	jffs2_dbg(1, "%s(): target path is '%s'\n",
-		  __func__, (char *)f->target);
-
-	nd_set_link(nd, p);
-
-	/*
-	 * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe
-	 * since the only way that may cause f->target to be changed is iput() operation.
-	 * But VFS will not use f->target after iput() has been called.
-	 */
-	return NULL;
-}
-
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index e98d39d75..b9dc23cd0 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -76,7 +76,7 @@ static int jfs_open(struct inode *inode, struct file *file)
 		if (ji->active_ag == -1) {
 			struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb);
 			ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb);
-			atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]);
+			atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]);
 		}
 		spin_unlock_irq(&ji->ag_lock);
 	}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 070dc4b33..41aa3ca6a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -63,11 +63,12 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
 			inode->i_mapping->a_ops = &jfs_aops;
 		} else {
 			inode->i_op = &jfs_fast_symlink_inode_operations;
+			inode->i_link = JFS_IP(inode)->i_inline;
 			/*
 			 * The inline data should be null-terminated, but
 			 * don't let on-disk corruption crash the kernel
 			 */
-			JFS_IP(inode)->i_inline[inode->i_size] = '\0';
+			inode->i_link[inode->i_size] = '\0';
 		}
 	} else {
 		inode->i_op = &jfs_file_inode_operations;
@@ -133,11 +134,11 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	 * It has been committed since the last change, but was still
 	 * on the dirty inode list.
 	 */
-	 if (!test_cflag(COMMIT_Dirty, inode)) {
+	if (!test_cflag(COMMIT_Dirty, inode)) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait);
 		return 0;
-	 }
+	}
 
 	if (jfs_commit_inode(inode, wait)) {
 		jfs_err("jfs_write_inode: jfs_commit_inode failed!");
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 93a123289..8db8b7d61 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -180,9 +180,6 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case JFS_IOC_SETFLAGS32:
 		cmd = JFS_IOC_SETFLAGS;
 		break;
-	case FITRIM:
-		cmd = FITRIM;
-		break;
 	}
 	return jfs_ioctl(filp, cmd, arg);
 }
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index fa7e795bd..1f26d1910 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -206,7 +206,7 @@ struct jfs_sb_info {
 
 static inline struct jfs_inode_info *JFS_IP(struct inode *inode)
 {
-	return list_entry(inode, struct jfs_inode_info, vfs_inode);
+	return container_of(inode, struct jfs_inode_info, vfs_inode);
 }
 
 static inline int jfs_dirtable_inline(struct inode *inode)
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 66db7bc0e..a5ac97b9a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -880,7 +880,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	int ssize;		/* source pathname size */
 	struct btstack btstack;
 	struct inode *ip = d_inode(dentry);
-	unchar *i_fastsymlink;
 	s64 xlen = 0;
 	int bmask = 0, xsize;
 	s64 xaddr;
@@ -946,8 +945,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	if (ssize <= IDATASIZE) {
 		ip->i_op = &jfs_fast_symlink_inode_operations;
 
-		i_fastsymlink = JFS_IP(ip)->i_inline;
-		memcpy(i_fastsymlink, name, ssize);
+		ip->i_link = JFS_IP(ip)->i_inline;
+		memcpy(ip->i_link, name, ssize);
 		ip->i_size = ssize - 1;
 
 		/*
@@ -1161,7 +1160,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		rc = dtModify(tid, new_dir, &new_dname, &ino,
 			      old_ip->i_ino, JFS_RENAME);
 		if (rc)
-			goto out4;
+			goto out_tx;
 		drop_nlink(new_ip);
 		if (S_ISDIR(new_ip->i_mode)) {
 			drop_nlink(new_ip);
@@ -1186,7 +1185,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if ((new_size = commitZeroLink(tid, new_ip)) < 0) {
 				txAbort(tid, 1);	/* Marks FS Dirty */
 				rc = new_size;
-				goto out4;
+				goto out_tx;
 			}
 			tblk = tid_to_tblock(tid);
 			tblk->xflag |= COMMIT_DELETE;
@@ -1204,7 +1203,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (rc) {
 			jfs_err("jfs_rename didn't expect dtSearch to fail "
 				"w/rc = %d", rc);
-			goto out4;
+			goto out_tx;
 		}
 
 		ino = old_ip->i_ino;
@@ -1212,7 +1211,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (rc) {
 			if (rc == -EIO)
 				jfs_err("jfs_rename: dtInsert returned -EIO");
-			goto out4;
+			goto out_tx;
 		}
 		if (S_ISDIR(old_ip->i_mode))
 			inc_nlink(new_dir);
@@ -1227,7 +1226,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		jfs_err("jfs_rename did not expect dtDelete to return rc = %d",
 			rc);
 		txAbort(tid, 1);	/* Marks Filesystem dirty */
-		goto out4;
+		goto out_tx;
 	}
 	if (S_ISDIR(old_ip->i_mode)) {
 		drop_nlink(old_dir);
@@ -1286,7 +1285,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	rc = txCommit(tid, ipcount, iplist, commit_flag);
 
-      out4:
+      out_tx:
 	txEnd(tid);
 	if (new_ip)
 		mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
@@ -1309,13 +1308,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 	if (new_ip && (new_ip->i_nlink == 0))
 		set_cflag(COMMIT_Nolink, new_ip);
-      out3:
-	free_UCSname(&new_dname);
-      out2:
-	free_UCSname(&old_dname);
-      out1:
-	if (new_ip && !S_ISDIR(new_ip->i_mode))
-		IWRITE_UNLOCK(new_ip);
 	/*
 	 * Truncating the directory index table is not guaranteed.  It
 	 * may need to be done iteratively
@@ -1326,7 +1318,13 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 		clear_cflag(COMMIT_Stale, old_dir);
 	}
-
+	if (new_ip && !S_ISDIR(new_ip->i_mode))
+		IWRITE_UNLOCK(new_ip);
+      out3:
+	free_UCSname(&new_dname);
+      out2:
+	free_UCSname(&old_dname);
+      out1:
 	jfs_info("jfs_rename: returning %d", rc);
 	return rc;
 }
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 80f42bcc4..5929e2363 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -17,21 +17,13 @@
  */
 
 #include <linux/fs.h>
-#include <linux/namei.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_xattr.h"
 
-static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char *s = JFS_IP(d_inode(dentry))->i_inline;
-	nd_set_link(nd, s);
-	return NULL;
-}
-
 const struct inode_operations jfs_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= jfs_follow_link,
+	.follow_link	= simple_follow_link,
 	.setattr	= jfs_setattr,
 	.setxattr	= jfs_setxattr,
 	.getxattr	= jfs_getxattr,
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 2bacb9988..7247252ee 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -785,7 +785,6 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
 	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
 	struct kernfs_open_node *on = kn->attr.open;
 
-	/* need parent for the kobj, grab both */
 	if (!kernfs_get_active(kn))
 		goto trigger;
 
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index af9fa7499..6762bfbd8 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache;
 /*
  * inode.c
  */
-struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
 void kernfs_evict_inode(struct inode *inode);
 int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 8a198898e..db272528a 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,25 +112,18 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
 	return error;
 }
 
-static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page) {
-		error = kernfs_getlink(dentry, (char *) page);
-		if (error < 0)
-			free_page((unsigned long)page);
-	}
-	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
-	return NULL;
-}
-
-static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
-{
-	char *page = nd_get_link(nd);
-	if (!IS_ERR(page))
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+	error = kernfs_getlink(dentry, (char *)page);
+	if (unlikely(error < 0)) {
 		free_page((unsigned long)page);
+		return ERR_PTR(error);
+	}
+	return *cookie = (char *)page;
 }
 
 const struct inode_operations kernfs_symlink_iops = {
@@ -140,7 +133,7 @@ const struct inode_operations kernfs_symlink_iops = {
 	.listxattr	= kernfs_iop_listxattr,
 	.readlink	= generic_readlink,
 	.follow_link	= kernfs_iop_follow_link,
-	.put_link	= kernfs_iop_put_link,
+	.put_link	= free_page_put_link,
 	.setattr	= kernfs_iop_setattr,
 	.getattr	= kernfs_iop_getattr,
 	.permission	= kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c
index 02813592e..102edfd39 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,11 +20,6 @@
 
 #include "internal.h"
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		   struct kstat *stat)
 {
@@ -1024,15 +1019,18 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
-void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
+void kfree_put_link(struct inode *unused, void *cookie)
 {
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
+	kfree(cookie);
 }
 EXPORT_SYMBOL(kfree_put_link);
 
+void free_page_put_link(struct inode *unused, void *cookie)
+{
+	free_page((unsigned long) cookie);
+}
+EXPORT_SYMBOL(free_page_put_link);
+
 /*
  * nop .set_page_dirty method so that people can use .page_mkwrite on
  * anon inodes.
@@ -1094,6 +1092,17 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 }
 EXPORT_SYMBOL(simple_nosetlease);
 
+const char *simple_follow_link(struct dentry *dentry, void **cookie)
+{
+	return d_inode(dentry)->i_link;
+}
+EXPORT_SYMBOL(simple_follow_link);
+
+const struct inode_operations simple_symlink_inode_operations = {
+	.follow_link = simple_follow_link,
+	.readlink = generic_readlink
+};
+EXPORT_SYMBOL(simple_symlink_inode_operations);
 
 /*
  * Operations for a permanently empty directory.
diff --git a/fs/locks.c b/fs/locks.c
index 653faabb0..d3d558ba4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -862,12 +862,11 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
  * whether or not a lock was successfully freed by testing the return
  * value for -ENOENT.
  */
-static int flock_lock_file(struct file *filp, struct file_lock *request)
+static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 {
 	struct file_lock *new_fl = NULL;
 	struct file_lock *fl;
 	struct file_lock_context *ctx;
-	struct inode *inode = file_inode(filp);
 	int error = 0;
 	bool found = false;
 	LIST_HEAD(dispose);
@@ -890,7 +889,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		goto find_conflict;
 
 	list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
-		if (filp != fl->fl_file)
+		if (request->fl_file != fl->fl_file)
 			continue;
 		if (request->fl_type == fl->fl_type)
 			goto out;
@@ -1164,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl,
 EXPORT_SYMBOL(posix_lock_file);
 
 /**
- * posix_lock_file_wait - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
+ * posix_lock_inode_wait - Apply a POSIX-style lock to a file
+ * @inode: inode of file to which lock request should be applied
  * @fl: The lock to be applied
  *
- * Add a POSIX style lock to a file.
- * We merge adjacent & overlapping locks whenever possible.
- * POSIX locks are sorted by owner task, then by starting address
+ * Variant of posix_lock_file_wait that does not take a filp, and so can be
+ * used after the filp has already been torn down.
  */
-int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = posix_lock_file(filp, fl, NULL);
+		error = __posix_lock_file(inode, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1189,7 +1187,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	}
 	return error;
 }
-EXPORT_SYMBOL(posix_lock_file_wait);
+EXPORT_SYMBOL(posix_lock_inode_wait);
 
 /**
  * locks_mandatory_locked - Check for an active lock
@@ -1851,18 +1849,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 }
 
 /**
- * flock_lock_file_wait - Apply a FLOCK-style lock to a file
- * @filp: The file to apply the lock to
+ * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
+ * @inode: inode of the file to apply to
  * @fl: The lock to be applied
  *
- * Add a FLOCK style lock to a file.
+ * Apply a FLOCK style lock request to an inode.
  */
-int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep();
 	for (;;) {
-		error = flock_lock_file(filp, fl);
+		error = flock_lock_inode(inode, fl);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1874,8 +1872,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
 	}
 	return error;
 }
-
-EXPORT_SYMBOL(flock_lock_file_wait);
+EXPORT_SYMBOL(flock_lock_inode_wait);
 
 /**
  *	sys_flock: - flock() system call.
@@ -2401,7 +2398,8 @@ locks_remove_flock(struct file *filp)
 		.fl_type = F_UNLCK,
 		.fl_end = OFFSET_MAX,
 	};
-	struct file_lock_context *flctx = file_inode(filp)->i_flctx;
+	struct inode *inode = file_inode(filp);
+	struct file_lock_context *flctx = inode->i_flctx;
 
 	if (list_empty(&flctx->flc_flock))
 		return;
@@ -2409,7 +2407,7 @@ locks_remove_flock(struct file *filp)
 	if (filp->f_op->flock)
 		filp->f_op->flock(filp, F_SETLKW, &fl);
 	else
-		flock_lock_file(filp, &fl);
+		flock_lock_inode(inode, &fl);
 
 	if (fl.fl_ops && fl.fl_ops->fl_release_private)
 		fl.fl_ops->fl_release_private(&fl);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 4cf38f118..f9b45d46d 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -779,6 +779,7 @@ fail:
 const struct inode_operations logfs_symlink_iops = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
 };
 
 const struct inode_operations logfs_dir_iops = {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 118e4e7bc..d19ac2581 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
 	struct address_space *mapping = page->mapping;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 1182d1e26..086cd0a61 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep;
 static struct inode *minix_alloc_inode(struct super_block *sb)
 {
 	struct minix_inode_info *ei;
-	ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+	ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 1ebd11854..01ad81dca 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb)
 
 static inline struct minix_inode_info *minix_i(struct inode *inode)
 {
-	return list_entry(inode, struct minix_inode_info, vfs_inode);
+	return container_of(inode, struct minix_inode_info, vfs_inode);
 }
 
 static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
diff --git a/fs/mount.h b/fs/mount.h
index 6a61c2b3e..14db05d42 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -88,6 +88,7 @@ static inline int is_mounted(struct vfsmount *mnt)
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
+extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);
 
 extern void __detach_mounts(struct dentry *dentry);
@@ -117,7 +118,6 @@ static inline void unlock_mount_hash(void)
 }
 
 struct proc_mounts {
-	struct seq_file m;
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
@@ -126,8 +126,6 @@ struct proc_mounts {
 	loff_t cached_index;
 };
 
-#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
-
 extern const struct seq_operations mounts_op;
 
 extern bool __is_local_mountpoint(struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220ba..ca0244b69 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,8 @@ alloc_new:
 				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
 		if (bio == NULL)
 			goto confused;
+
+		wbc_init_bio(wbc, bio);
 	}
 
 	/*
@@ -612,6 +614,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
+	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/namei.c b/fs/namei.c
index fe30d3be4..1c2105ed2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -492,6 +492,7 @@ void path_put(const struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
+#define EMBEDDED_LEVELS 2
 struct nameidata {
 	struct path	path;
 	struct qstr	last;
@@ -501,10 +502,139 @@ struct nameidata {
 	unsigned	seq, m_seq;
 	int		last_type;
 	unsigned	depth;
-	struct file	*base;
-	char *saved_names[MAX_NESTED_LINKS + 1];
+	int		total_link_count;
+	struct saved {
+		struct path link;
+		void *cookie;
+		const char *name;
+		struct inode *inode;
+		unsigned seq;
+	} *stack, internal[EMBEDDED_LEVELS];
+	struct filename	*name;
+	struct nameidata *saved;
+	unsigned	root_seq;
+	int		dfd;
 };
 
+static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
+{
+	struct nameidata *old = current->nameidata;
+	p->stack = p->internal;
+	p->dfd = dfd;
+	p->name = name;
+	p->total_link_count = old ? old->total_link_count : 0;
+	p->saved = old;
+	current->nameidata = p;
+}
+
+static void restore_nameidata(void)
+{
+	struct nameidata *now = current->nameidata, *old = now->saved;
+
+	current->nameidata = old;
+	if (old)
+		old->total_link_count = now->total_link_count;
+	if (now->stack != now->internal) {
+		kfree(now->stack);
+		now->stack = now->internal;
+	}
+}
+
+static int __nd_alloc_stack(struct nameidata *nd)
+{
+	struct saved *p;
+
+	if (nd->flags & LOOKUP_RCU) {
+		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
+				  GFP_ATOMIC);
+		if (unlikely(!p))
+			return -ECHILD;
+	} else {
+		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
+				  GFP_KERNEL);
+		if (unlikely(!p))
+			return -ENOMEM;
+	}
+	memcpy(p, nd->internal, sizeof(nd->internal));
+	nd->stack = p;
+	return 0;
+}
+
+static inline int nd_alloc_stack(struct nameidata *nd)
+{
+	if (likely(nd->depth != EMBEDDED_LEVELS))
+		return 0;
+	if (likely(nd->stack != nd->internal))
+		return 0;
+	return __nd_alloc_stack(nd);
+}
+
+static void drop_links(struct nameidata *nd)
+{
+	int i = nd->depth;
+	while (i--) {
+		struct saved *last = nd->stack + i;
+		struct inode *inode = last->inode;
+		if (last->cookie && inode->i_op->put_link) {
+			inode->i_op->put_link(inode, last->cookie);
+			last->cookie = NULL;
+		}
+	}
+}
+
+static void terminate_walk(struct nameidata *nd)
+{
+	drop_links(nd);
+	if (!(nd->flags & LOOKUP_RCU)) {
+		int i;
+		path_put(&nd->path);
+		for (i = 0; i < nd->depth; i++)
+			path_put(&nd->stack[i].link);
+		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+			path_put(&nd->root);
+			nd->root.mnt = NULL;
+		}
+	} else {
+		nd->flags &= ~LOOKUP_RCU;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
+		rcu_read_unlock();
+	}
+	nd->depth = 0;
+}
+
+/* path_put is needed afterwards regardless of success or failure */
+static bool legitimize_path(struct nameidata *nd,
+			    struct path *path, unsigned seq)
+{
+	int res = __legitimize_mnt(path->mnt, nd->m_seq);
+	if (unlikely(res)) {
+		if (res > 0)
+			path->mnt = NULL;
+		path->dentry = NULL;
+		return false;
+	}
+	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
+		path->dentry = NULL;
+		return false;
+	}
+	return !read_seqcount_retry(&path->dentry->d_seq, seq);
+}
+
+static bool legitimize_links(struct nameidata *nd)
+{
+	int i;
+	for (i = 0; i < nd->depth; i++) {
+		struct saved *last = nd->stack + i;
+		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
+			drop_links(nd);
+			nd->depth = i + 1;
+			return false;
+		}
+	}
+	return true;
+}
+
 /*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
@@ -520,35 +650,28 @@ struct nameidata {
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
  * @dentry: child of nd->path.dentry or NULL
+ * @seq: seq number to check dentry against
  * Returns: 0 on success, -ECHILD on failure
  *
  * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
  * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
  * @nd or NULL.  Must be called from rcu-walk context.
+ * Nothing should touch nameidata between unlazy_walk() failure and
+ * terminate_walk().
  */
-static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
 {
-	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
-	/*
-	 * After legitimizing the bastards, terminate_walk()
-	 * will do the right thing for non-RCU mode, and all our
-	 * subsequent exit cases should rcu_read_unlock()
-	 * before returning.  Do vfsmount first; if dentry
-	 * can't be legitimized, just set nd->path.dentry to NULL
-	 * and rely on dput(NULL) being a no-op.
-	 */
-	if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
-		return -ECHILD;
 	nd->flags &= ~LOOKUP_RCU;
-
-	if (!lockref_get_not_dead(&parent->d_lockref)) {
-		nd->path.dentry = NULL;	
-		goto out;
-	}
+	if (unlikely(!legitimize_links(nd)))
+		goto out2;
+	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
+		goto out2;
+	if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
+		goto out1;
 
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
@@ -568,7 +691,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	} else {
 		if (!lockref_get_not_dead(&dentry->d_lockref))
 			goto out;
-		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+		if (read_seqcount_retry(&dentry->d_seq, seq))
 			goto drop_dentry;
 	}
 
@@ -577,22 +700,24 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	 * still valid and get it if required.
 	 */
 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		spin_lock(&fs->lock);
-		if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
-			goto unlock_and_drop_dentry;
-		path_get(&nd->root);
-		spin_unlock(&fs->lock);
+		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
+			rcu_read_unlock();
+			dput(dentry);
+			return -ECHILD;
+		}
 	}
 
 	rcu_read_unlock();
 	return 0;
 
-unlock_and_drop_dentry:
-	spin_unlock(&fs->lock);
 drop_dentry:
 	rcu_read_unlock();
 	dput(dentry);
 	goto drop_root_mnt;
+out2:
+	nd->path.mnt = NULL;
+out1:
+	nd->path.dentry = NULL;
 out:
 	rcu_read_unlock();
 drop_root_mnt:
@@ -601,6 +726,24 @@ drop_root_mnt:
 	return -ECHILD;
 }
 
+static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
+{
+	if (unlikely(!legitimize_path(nd, link, seq))) {
+		drop_links(nd);
+		nd->depth = 0;
+		nd->flags &= ~LOOKUP_RCU;
+		nd->path.mnt = NULL;
+		nd->path.dentry = NULL;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
+		rcu_read_unlock();
+	} else if (likely(unlazy_walk(nd, NULL, 0)) == 0) {
+		return 0;
+	}
+	path_put(link);
+	return -ECHILD;
+}
+
 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	return dentry->d_op->d_revalidate(dentry, flags);
@@ -622,26 +765,10 @@ static int complete_walk(struct nameidata *nd)
 	int status;
 
 	if (nd->flags & LOOKUP_RCU) {
-		nd->flags &= ~LOOKUP_RCU;
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
-
-		if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
-			rcu_read_unlock();
-			return -ECHILD;
-		}
-		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
-			rcu_read_unlock();
-			mntput(nd->path.mnt);
+		if (unlikely(unlazy_walk(nd, NULL, 0)))
 			return -ECHILD;
-		}
-		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
-			rcu_read_unlock();
-			dput(dentry);
-			mntput(nd->path.mnt);
-			return -ECHILD;
-		}
-		rcu_read_unlock();
 	}
 
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -657,28 +784,24 @@ static int complete_walk(struct nameidata *nd)
 	if (!status)
 		status = -ESTALE;
 
-	path_put(&nd->path);
 	return status;
 }
 
-static __always_inline void set_root(struct nameidata *nd)
+static void set_root(struct nameidata *nd)
 {
 	get_fs_root(current->fs, &nd->root);
 }
 
-static int link_path_walk(const char *, struct nameidata *);
-
-static __always_inline unsigned set_root_rcu(struct nameidata *nd)
+static void set_root_rcu(struct nameidata *nd)
 {
 	struct fs_struct *fs = current->fs;
-	unsigned seq, res;
+	unsigned seq;
 
 	do {
 		seq = read_seqcount_begin(&fs->seq);
 		nd->root = fs->root;
-		res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+		nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
 	} while (read_seqcount_retry(&fs->seq, seq));
-	return res;
 }
 
 static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -704,8 +827,9 @@ static inline void path_to_nameidata(const struct path *path,
  * Helper to directly jump to a known parsed path from ->follow_link,
  * caller must have taken a reference to path beforehand.
  */
-void nd_jump_link(struct nameidata *nd, struct path *path)
+void nd_jump_link(struct path *path)
 {
+	struct nameidata *nd = current->nameidata;
 	path_put(&nd->path);
 
 	nd->path = *path;
@@ -713,24 +837,14 @@ void nd_jump_link(struct nameidata *nd, struct path *path)
 	nd->flags |= LOOKUP_JUMPED;
 }
 
-void nd_set_link(struct nameidata *nd, char *path)
-{
-	nd->saved_names[nd->depth] = path;
-}
-EXPORT_SYMBOL(nd_set_link);
-
-char *nd_get_link(struct nameidata *nd)
-{
-	return nd->saved_names[nd->depth];
-}
-EXPORT_SYMBOL(nd_get_link);
-
-static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+static inline void put_link(struct nameidata *nd)
 {
-	struct inode *inode = link->dentry->d_inode;
-	if (inode->i_op->put_link)
-		inode->i_op->put_link(link->dentry, nd, cookie);
-	path_put(link);
+	struct saved *last = nd->stack + --nd->depth;
+	struct inode *inode = last->inode;
+	if (last->cookie && inode->i_op->put_link)
+		inode->i_op->put_link(inode, last->cookie);
+	if (!(nd->flags & LOOKUP_RCU))
+		path_put(&last->link);
 }
 
 int sysctl_protected_symlinks __read_mostly = 0;
@@ -738,7 +852,6 @@ int sysctl_protected_hardlinks __read_mostly = 0;
 
 /**
  * may_follow_link - Check symlink following for unsafe situations
- * @link: The path of the symlink
  * @nd: nameidata pathwalk data
  *
  * In the case of the sysctl_protected_symlinks sysctl being enabled,
@@ -752,7 +865,7 @@ int sysctl_protected_hardlinks __read_mostly = 0;
  *
  * Returns 0 if following the symlink is allowed, -ve on error.
  */
-static inline int may_follow_link(struct path *link, struct nameidata *nd)
+static inline int may_follow_link(struct nameidata *nd)
 {
 	const struct inode *inode;
 	const struct inode *parent;
@@ -761,12 +874,12 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
 		return 0;
 
 	/* Allowed if owner and follower match. */
-	inode = link->dentry->d_inode;
+	inode = nd->stack[0].inode;
 	if (uid_eq(current_cred()->fsuid, inode->i_uid))
 		return 0;
 
 	/* Allowed if parent directory not sticky and world-writable. */
-	parent = nd->path.dentry->d_inode;
+	parent = nd->inode;
 	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
 		return 0;
 
@@ -774,9 +887,10 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
 	if (uid_eq(parent->i_uid, inode->i_uid))
 		return 0;
 
-	audit_log_link_denied("follow_link", link);
-	path_put_conditional(link, nd);
-	path_put(&nd->path);
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	audit_log_link_denied("follow_link", &nd->stack[0].link);
 	return -EACCES;
 }
 
@@ -849,82 +963,68 @@ static int may_linkat(struct path *link)
 	return -EPERM;
 }
 
-static __always_inline int
-follow_link(struct path *link, struct nameidata *nd, void **p)
+static __always_inline
+const char *get_link(struct nameidata *nd)
 {
-	struct dentry *dentry = link->dentry;
+	struct saved *last = nd->stack + nd->depth - 1;
+	struct dentry *dentry = last->link.dentry;
+	struct inode *inode = last->inode;
 	int error;
-	char *s;
+	const char *res;
 
-	BUG_ON(nd->flags & LOOKUP_RCU);
-
-	if (link->mnt == nd->path.mnt)
-		mntget(link->mnt);
-
-	error = -ELOOP;
-	if (unlikely(current->total_link_count >= 40))
-		goto out_put_nd_path;
-
-	cond_resched();
-	current->total_link_count++;
-
-	touch_atime(link);
-	nd_set_link(nd, NULL);
+	if (!(nd->flags & LOOKUP_RCU)) {
+		touch_atime(&last->link);
+		cond_resched();
+	} else if (atime_needs_update(&last->link, inode)) {
+		if (unlikely(unlazy_walk(nd, NULL, 0)))
+			return ERR_PTR(-ECHILD);
+		touch_atime(&last->link);
+	}
 
-	error = security_inode_follow_link(link->dentry, nd);
-	if (error)
-		goto out_put_nd_path;
+	error = security_inode_follow_link(dentry, inode,
+					   nd->flags & LOOKUP_RCU);
+	if (unlikely(error))
+		return ERR_PTR(error);
 
 	nd->last_type = LAST_BIND;
-	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
-	error = PTR_ERR(*p);
-	if (IS_ERR(*p))
-		goto out_put_nd_path;
-
-	error = 0;
-	s = nd_get_link(nd);
-	if (s) {
-		if (unlikely(IS_ERR(s))) {
-			path_put(&nd->path);
-			put_link(nd, link, *p);
-			return PTR_ERR(s);
+	res = inode->i_link;
+	if (!res) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (unlikely(unlazy_walk(nd, NULL, 0)))
+				return ERR_PTR(-ECHILD);
 		}
-		if (*s == '/') {
+		res = inode->i_op->follow_link(dentry, &last->cookie);
+		if (IS_ERR_OR_NULL(res)) {
+			last->cookie = NULL;
+			return res;
+		}
+	}
+	if (*res == '/') {
+		if (nd->flags & LOOKUP_RCU) {
+			struct dentry *d;
+			if (!nd->root.mnt)
+				set_root_rcu(nd);
+			nd->path = nd->root;
+			d = nd->path.dentry;
+			nd->inode = d->d_inode;
+			nd->seq = nd->root_seq;
+			if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+				return ERR_PTR(-ECHILD);
+		} else {
 			if (!nd->root.mnt)
 				set_root(nd);
 			path_put(&nd->path);
 			nd->path = nd->root;
 			path_get(&nd->root);
-			nd->flags |= LOOKUP_JUMPED;
+			nd->inode = nd->path.dentry->d_inode;
 		}
-		nd->inode = nd->path.dentry->d_inode;
-		error = link_path_walk(s, nd);
-		if (unlikely(error))
-			put_link(nd, link, *p);
+		nd->flags |= LOOKUP_JUMPED;
+		while (unlikely(*++res == '/'))
+			;
 	}
-
-	return error;
-
-out_put_nd_path:
-	*p = NULL;
-	path_put(&nd->path);
-	path_put(link);
-	return error;
-}
-
-static int follow_up_rcu(struct path *path)
-{
-	struct mount *mnt = real_mount(path->mnt);
-	struct mount *parent;
-	struct dentry *mountpoint;
-
-	parent = mnt->mnt_parent;
-	if (&parent->mnt == path->mnt)
-		return 0;
-	mountpoint = mnt->mnt_mountpoint;
-	path->dentry = mountpoint;
-	path->mnt = &parent->mnt;
-	return 1;
+	if (!*res)
+		res = NULL;
+	return res;
 }
 
 /*
@@ -965,7 +1065,7 @@ EXPORT_SYMBOL(follow_up);
  * - return -EISDIR to tell follow_managed() to stop and return the path we
  *   were called with.
  */
-static int follow_automount(struct path *path, unsigned flags,
+static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
@@ -985,13 +1085,13 @@ static int follow_automount(struct path *path, unsigned flags,
 	 * as being automount points.  These will need the attentions
 	 * of the daemon to instantiate them before they can be used.
 	 */
-	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
 	    path->dentry->d_inode)
 		return -EISDIR;
 
-	current->total_link_count++;
-	if (current->total_link_count >= 40)
+	nd->total_link_count++;
+	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
 	mnt = path->dentry->d_op->d_automount(path);
@@ -1005,7 +1105,7 @@ static int follow_automount(struct path *path, unsigned flags,
 		 * the path being looked up; if it wasn't then the remainder of
 		 * the path is inaccessible and we should say so.
 		 */
-		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
+		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
 			return -EREMOTE;
 		return PTR_ERR(mnt);
 	}
@@ -1045,7 +1145,7 @@ static int follow_automount(struct path *path, unsigned flags,
  *
  * Serialization is taken care of in namespace.c
  */
-static int follow_managed(struct path *path, unsigned flags)
+static int follow_managed(struct path *path, struct nameidata *nd)
 {
 	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
 	unsigned managed;
@@ -1089,7 +1189,7 @@ static int follow_managed(struct path *path, unsigned flags)
 
 		/* Handle an automount point */
 		if (managed & DCACHE_NEED_AUTOMOUNT) {
-			ret = follow_automount(path, flags, &need_mntput);
+			ret = follow_automount(path, nd, &need_mntput);
 			if (ret < 0)
 				break;
 			continue;
@@ -1103,7 +1203,11 @@ static int follow_managed(struct path *path, unsigned flags)
 		mntput(path->mnt);
 	if (ret == -EISDIR)
 		ret = 0;
-	return ret < 0 ? ret : need_mntput;
+	if (need_mntput)
+		nd->flags |= LOOKUP_JUMPED;
+	if (unlikely(ret < 0))
+		path_put_conditional(path, nd);
+	return ret;
 }
 
 int follow_down_one(struct path *path)
@@ -1133,7 +1237,7 @@ static inline int managed_dentry_rcu(struct dentry *dentry)
  * we meet a managed dentry that would need blocking.
  */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-			       struct inode **inode)
+			       struct inode **inode, unsigned *seqp)
 {
 	for (;;) {
 		struct mount *mounted;
@@ -1160,7 +1264,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		path->mnt = &mounted->mnt;
 		path->dentry = mounted->mnt.mnt_root;
 		nd->flags |= LOOKUP_JUMPED;
-		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+		*seqp = read_seqcount_begin(&path->dentry->d_seq);
 		/*
 		 * Update the inode too. We don't need to re-check the
 		 * dentry sequence number here after this d_inode read,
@@ -1179,10 +1283,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		set_root_rcu(nd);
 
 	while (1) {
-		if (nd->path.dentry == nd->root.dentry &&
-		    nd->path.mnt == nd->root.mnt) {
+		if (path_equal(&nd->path, &nd->root))
 			break;
-		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
 			struct dentry *parent = old->d_parent;
@@ -1190,38 +1292,42 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 			inode = parent->d_inode;
 			seq = read_seqcount_begin(&parent->d_seq);
-			if (read_seqcount_retry(&old->d_seq, nd->seq))
-				goto failed;
+			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+				return -ECHILD;
 			nd->path.dentry = parent;
 			nd->seq = seq;
 			break;
+		} else {
+			struct mount *mnt = real_mount(nd->path.mnt);
+			struct mount *mparent = mnt->mnt_parent;
+			struct dentry *mountpoint = mnt->mnt_mountpoint;
+			struct inode *inode2 = mountpoint->d_inode;
+			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
+			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+				return -ECHILD;
+			if (&mparent->mnt == nd->path.mnt)
+				break;
+			/* we know that mountpoint was pinned */
+			nd->path.dentry = mountpoint;
+			nd->path.mnt = &mparent->mnt;
+			inode = inode2;
+			nd->seq = seq;
 		}
-		if (!follow_up_rcu(&nd->path))
-			break;
-		inode = nd->path.dentry->d_inode;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 	}
-	while (d_mountpoint(nd->path.dentry)) {
+	while (unlikely(d_mountpoint(nd->path.dentry))) {
 		struct mount *mounted;
 		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+			return -ECHILD;
 		if (!mounted)
 			break;
 		nd->path.mnt = &mounted->mnt;
 		nd->path.dentry = mounted->mnt.mnt_root;
 		inode = nd->path.dentry->d_inode;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		if (read_seqretry(&mount_lock, nd->m_seq))
-			goto failed;
 	}
 	nd->inode = inode;
 	return 0;
-
-failed:
-	nd->flags &= ~LOOKUP_RCU;
-	if (!(nd->flags & LOOKUP_ROOT))
-		nd->root.mnt = NULL;
-	rcu_read_unlock();
-	return -ECHILD;
 }
 
 /*
@@ -1400,7 +1506,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
  *  It _is_ time-critical.
  */
 static int lookup_fast(struct nameidata *nd,
-		       struct path *path, struct inode **inode)
+		       struct path *path, struct inode **inode,
+		       unsigned *seqp)
 {
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
@@ -1424,7 +1531,7 @@ static int lookup_fast(struct nameidata *nd,
 		 * This sequence count validates that the inode matches
 		 * the dentry name information from lookup.
 		 */
-		*inode = dentry->d_inode;
+		*inode = d_backing_inode(dentry);
 		negative = d_is_negative(dentry);
 		if (read_seqcount_retry(&dentry->d_seq, seq))
 			return -ECHILD;
@@ -1440,8 +1547,8 @@ static int lookup_fast(struct nameidata *nd,
 		 */
 		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
 			return -ECHILD;
-		nd->seq = seq;
 
+		*seqp = seq;
 		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
 			status = d_revalidate(dentry, nd->flags);
 			if (unlikely(status <= 0)) {
@@ -1452,10 +1559,10 @@ static int lookup_fast(struct nameidata *nd,
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
-		if (likely(__follow_mount_rcu(nd, path, inode)))
+		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
 			return 0;
 unlazy:
-		if (unlazy_walk(nd, dentry))
+		if (unlazy_walk(nd, dentry, seq))
 			return -ECHILD;
 	} else {
 		dentry = __d_lookup(parent, &nd->last);
@@ -1482,15 +1589,10 @@ unlazy:
 	}
 	path->mnt = mnt;
 	path->dentry = dentry;
-	err = follow_managed(path, nd->flags);
-	if (unlikely(err < 0)) {
-		path_put_conditional(path, nd);
-		return err;
-	}
-	if (err)
-		nd->flags |= LOOKUP_JUMPED;
-	*inode = path->dentry->d_inode;
-	return 0;
+	err = follow_managed(path, nd);
+	if (likely(!err))
+		*inode = d_backing_inode(path->dentry);
+	return err;
 
 need_lookup:
 	return 1;
@@ -1500,7 +1602,6 @@ need_lookup:
 static int lookup_slow(struct nameidata *nd, struct path *path)
 {
 	struct dentry *dentry, *parent;
-	int err;
 
 	parent = nd->path.dentry;
 	BUG_ON(nd->inode != parent->d_inode);
@@ -1512,14 +1613,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
 		return PTR_ERR(dentry);
 	path->mnt = nd->path.mnt;
 	path->dentry = dentry;
-	err = follow_managed(path, nd->flags);
-	if (unlikely(err < 0)) {
-		path_put_conditional(path, nd);
-		return err;
-	}
-	if (err)
-		nd->flags |= LOOKUP_JUMPED;
-	return 0;
+	return follow_managed(path, nd);
 }
 
 static inline int may_lookup(struct nameidata *nd)
@@ -1528,7 +1622,7 @@ static inline int may_lookup(struct nameidata *nd)
 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
-		if (unlazy_walk(nd, NULL))
+		if (unlazy_walk(nd, NULL, 0))
 			return -ECHILD;
 	}
 	return inode_permission(nd->inode, MAY_EXEC);
@@ -1538,24 +1632,45 @@ static inline int handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
 		if (nd->flags & LOOKUP_RCU) {
-			if (follow_dotdot_rcu(nd))
-				return -ECHILD;
+			return follow_dotdot_rcu(nd);
 		} else
 			follow_dotdot(nd);
 	}
 	return 0;
 }
 
-static void terminate_walk(struct nameidata *nd)
+static int pick_link(struct nameidata *nd, struct path *link,
+		     struct inode *inode, unsigned seq)
 {
+	int error;
+	struct saved *last;
+	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
+		path_to_nameidata(link, nd);
+		return -ELOOP;
+	}
 	if (!(nd->flags & LOOKUP_RCU)) {
-		path_put(&nd->path);
-	} else {
-		nd->flags &= ~LOOKUP_RCU;
-		if (!(nd->flags & LOOKUP_ROOT))
-			nd->root.mnt = NULL;
-		rcu_read_unlock();
+		if (link->mnt == nd->path.mnt)
+			mntget(link->mnt);
+	}
+	error = nd_alloc_stack(nd);
+	if (unlikely(error)) {
+		if (error == -ECHILD) {
+			if (unlikely(unlazy_link(nd, link, seq)))
+				return -ECHILD;
+			error = nd_alloc_stack(nd);
+		}
+		if (error) {
+			path_put(link);
+			return error;
+		}
 	}
+
+	last = nd->stack + nd->depth++;
+	last->link = *link;
+	last->cookie = NULL;
+	last->inode = inode;
+	last->seq = seq;
+	return 1;
 }
 
 /*
@@ -1564,98 +1679,68 @@ static void terminate_walk(struct nameidata *nd)
  * so we keep a cache of "no, this doesn't need follow_link"
  * for the common case.
  */
-static inline int should_follow_link(struct dentry *dentry, int follow)
+static inline int should_follow_link(struct nameidata *nd, struct path *link,
+				     int follow,
+				     struct inode *inode, unsigned seq)
 {
-	return unlikely(d_is_symlink(dentry)) ? follow : 0;
+	if (likely(!d_is_symlink(link->dentry)))
+		return 0;
+	if (!follow)
+		return 0;
+	return pick_link(nd, link, inode, seq);
 }
 
-static inline int walk_component(struct nameidata *nd, struct path *path,
-		int follow)
+enum {WALK_GET = 1, WALK_PUT = 2};
+
+static int walk_component(struct nameidata *nd, int flags)
 {
+	struct path path;
 	struct inode *inode;
+	unsigned seq;
 	int err;
 	/*
 	 * "." and ".." are special - ".." especially so because it has
 	 * to be able to know about the current root directory and
 	 * parent relationships.
 	 */
-	if (unlikely(nd->last_type != LAST_NORM))
-		return handle_dots(nd, nd->last_type);
-	err = lookup_fast(nd, path, &inode);
+	if (unlikely(nd->last_type != LAST_NORM)) {
+		err = handle_dots(nd, nd->last_type);
+		if (flags & WALK_PUT)
+			put_link(nd);
+		return err;
+	}
+	err = lookup_fast(nd, &path, &inode, &seq);
 	if (unlikely(err)) {
 		if (err < 0)
-			goto out_err;
+			return err;
 
-		err = lookup_slow(nd, path);
+		err = lookup_slow(nd, &path);
 		if (err < 0)
-			goto out_err;
+			return err;
 
-		inode = path->dentry->d_inode;
+		inode = d_backing_inode(path.dentry);
+		seq = 0;	/* we are already out of RCU mode */
 		err = -ENOENT;
-		if (d_is_negative(path->dentry))
+		if (d_is_negative(path.dentry))
 			goto out_path_put;
 	}
 
-	if (should_follow_link(path->dentry, follow)) {
-		if (nd->flags & LOOKUP_RCU) {
-			if (unlikely(nd->path.mnt != path->mnt ||
-				     unlazy_walk(nd, path->dentry))) {
-				err = -ECHILD;
-				goto out_err;
-			}
-		}
-		BUG_ON(inode != path->dentry->d_inode);
-		return 1;
-	}
-	path_to_nameidata(path, nd);
+	if (flags & WALK_PUT)
+		put_link(nd);
+	err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
+	if (unlikely(err))
+		return err;
+	path_to_nameidata(&path, nd);
 	nd->inode = inode;
+	nd->seq = seq;
 	return 0;
 
 out_path_put:
-	path_to_nameidata(path, nd);
-out_err:
-	terminate_walk(nd);
+	path_to_nameidata(&path, nd);
 	return err;
 }
 
 /*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups.
- */
-static inline int nested_symlink(struct path *path, struct nameidata *nd)
-{
-	int res;
-
-	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
-		path_put_conditional(path, nd);
-		path_put(&nd->path);
-		return -ELOOP;
-	}
-	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-
-	nd->depth++;
-	current->link_count++;
-
-	do {
-		struct path link = *path;
-		void *cookie;
-
-		res = follow_link(&link, nd, &cookie);
-		if (res)
-			break;
-		res = walk_component(nd, path, LOOKUP_FOLLOW);
-		put_link(nd, &link, cookie);
-	} while (res > 0);
-
-	current->link_count--;
-	nd->depth--;
-	return res;
-}
-
-/*
  * We can do the critical dentry name comparison and hashing
  * operations one word at a time, but we are limited to:
  *
@@ -1781,9 +1866,8 @@ static inline u64 hash_name(const char *name)
  */
 static int link_path_walk(const char *name, struct nameidata *nd)
 {
-	struct path next;
 	int err;
-	
+
 	while (*name=='/')
 		name++;
 	if (!*name)
@@ -1796,7 +1880,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 		err = may_lookup(nd);
  		if (err)
-			break;
+			return err;
 
 		hash_len = hash_name(name);
 
@@ -1818,7 +1902,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 				struct qstr this = { { .hash_len = hash_len }, .name = name };
 				err = parent->d_op->d_hash(parent, &this);
 				if (err < 0)
-					break;
+					return err;
 				hash_len = this.hash_len;
 				name = this.name;
 			}
@@ -1830,7 +1914,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 		name += hashlen_len(hash_len);
 		if (!*name)
-			return 0;
+			goto OK;
 		/*
 		 * If it wasn't NUL, we know it was '/'. Skip that
 		 * slash, and continue until no more slashes.
@@ -1838,57 +1922,78 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		do {
 			name++;
 		} while (unlikely(*name == '/'));
-		if (!*name)
-			return 0;
-
-		err = walk_component(nd, &next, LOOKUP_FOLLOW);
+		if (unlikely(!*name)) {
+OK:
+			/* pathname body, done */
+			if (!nd->depth)
+				return 0;
+			name = nd->stack[nd->depth - 1].name;
+			/* trailing symlink, done */
+			if (!name)
+				return 0;
+			/* last component of nested symlink */
+			err = walk_component(nd, WALK_GET | WALK_PUT);
+		} else {
+			err = walk_component(nd, WALK_GET);
+		}
 		if (err < 0)
 			return err;
 
 		if (err) {
-			err = nested_symlink(&next, nd);
-			if (err)
-				return err;
+			const char *s = get_link(nd);
+
+			if (unlikely(IS_ERR(s)))
+				return PTR_ERR(s);
+			err = 0;
+			if (unlikely(!s)) {
+				/* jumped */
+				put_link(nd);
+			} else {
+				nd->stack[nd->depth - 1].name = name;
+				name = s;
+				continue;
+			}
 		}
-		if (!d_can_lookup(nd->path.dentry)) {
-			err = -ENOTDIR; 
-			break;
+		if (unlikely(!d_can_lookup(nd->path.dentry))) {
+			if (nd->flags & LOOKUP_RCU) {
+				if (unlazy_walk(nd, NULL, 0))
+					return -ECHILD;
+			}
+			return -ENOTDIR;
 		}
 	}
-	terminate_walk(nd);
-	return err;
 }
 
-static int path_init(int dfd, const struct filename *name, unsigned int flags,
-		     struct nameidata *nd)
+static const char *path_init(struct nameidata *nd, unsigned flags)
 {
 	int retval = 0;
-	const char *s = name->name;
+	const char *s = nd->name->name;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
 	nd->depth = 0;
-	nd->base = NULL;
+	nd->total_link_count = 0;
 	if (flags & LOOKUP_ROOT) {
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
 		if (*s) {
 			if (!d_can_lookup(root))
-				return -ENOTDIR;
+				return ERR_PTR(-ENOTDIR);
 			retval = inode_permission(inode, MAY_EXEC);
 			if (retval)
-				return retval;
+				return ERR_PTR(retval);
 		}
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
 			rcu_read_lock();
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			nd->root_seq = nd->seq;
 			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
-		goto done;
+		return s;
 	}
 
 	nd->root.mnt = NULL;
@@ -1897,13 +2002,14 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags,
 	if (*s == '/') {
 		if (flags & LOOKUP_RCU) {
 			rcu_read_lock();
-			nd->seq = set_root_rcu(nd);
+			set_root_rcu(nd);
+			nd->seq = nd->root_seq;
 		} else {
 			set_root(nd);
 			path_get(&nd->root);
 		}
 		nd->path = nd->root;
-	} else if (dfd == AT_FDCWD) {
+	} else if (nd->dfd == AT_FDCWD) {
 		if (flags & LOOKUP_RCU) {
 			struct fs_struct *fs = current->fs;
 			unsigned seq;
@@ -1920,180 +2026,205 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags,
 		}
 	} else {
 		/* Caller must check execute permissions on the starting path component */
-		struct fd f = fdget_raw(dfd);
+		struct fd f = fdget_raw(nd->dfd);
 		struct dentry *dentry;
 
 		if (!f.file)
-			return -EBADF;
+			return ERR_PTR(-EBADF);
 
 		dentry = f.file->f_path.dentry;
 
 		if (*s) {
 			if (!d_can_lookup(dentry)) {
 				fdput(f);
-				return -ENOTDIR;
+				return ERR_PTR(-ENOTDIR);
 			}
 		}
 
 		nd->path = f.file->f_path;
 		if (flags & LOOKUP_RCU) {
-			if (f.flags & FDPUT_FPUT)
-				nd->base = f.file;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			rcu_read_lock();
+			nd->inode = nd->path.dentry->d_inode;
+			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 		} else {
 			path_get(&nd->path);
-			fdput(f);
+			nd->inode = nd->path.dentry->d_inode;
 		}
+		fdput(f);
+		return s;
 	}
 
 	nd->inode = nd->path.dentry->d_inode;
 	if (!(flags & LOOKUP_RCU))
-		goto done;
+		return s;
 	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
-		goto done;
+		return s;
 	if (!(nd->flags & LOOKUP_ROOT))
 		nd->root.mnt = NULL;
 	rcu_read_unlock();
-	return -ECHILD;
-done:
-	current->total_link_count = 0;
-	return link_path_walk(s, nd);
+	return ERR_PTR(-ECHILD);
 }
 
-static void path_cleanup(struct nameidata *nd)
+static const char *trailing_symlink(struct nameidata *nd)
 {
-	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		path_put(&nd->root);
-		nd->root.mnt = NULL;
-	}
-	if (unlikely(nd->base))
-		fput(nd->base);
+	const char *s;
+	int error = may_follow_link(nd);
+	if (unlikely(error))
+		return ERR_PTR(error);
+	nd->flags |= LOOKUP_PARENT;
+	nd->stack[0].name = NULL;
+	s = get_link(nd);
+	return s ? s : "";
 }
 
-static inline int lookup_last(struct nameidata *nd, struct path *path)
+static inline int lookup_last(struct nameidata *nd)
 {
 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
 	nd->flags &= ~LOOKUP_PARENT;
-	return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
+	return walk_component(nd,
+			nd->flags & LOOKUP_FOLLOW
+				? nd->depth
+					? WALK_PUT | WALK_GET
+					: WALK_GET
+				: 0);
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int path_lookupat(int dfd, const struct filename *name,
-				unsigned int flags, struct nameidata *nd)
+static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
 {
-	struct path path;
+	const char *s = path_init(nd, flags);
 	int err;
 
-	/*
-	 * Path walking is largely split up into 2 different synchronisation
-	 * schemes, rcu-walk and ref-walk (explained in
-	 * Documentation/filesystems/path-lookup.txt). These share much of the
-	 * path walk code, but some things particularly setup, cleanup, and
-	 * following mounts are sufficiently divergent that functions are
-	 * duplicated. Typically there is a function foo(), and its RCU
-	 * analogue, foo_rcu().
-	 *
-	 * -ECHILD is the error number of choice (just to avoid clashes) that
-	 * is returned if some aspect of an rcu-walk fails. Such an error must
-	 * be handled by restarting a traditional ref-walk (which will always
-	 * be able to complete).
-	 */
-	err = path_init(dfd, name, flags, nd);
-	if (!err && !(flags & LOOKUP_PARENT)) {
-		err = lookup_last(nd, &path);
-		while (err > 0) {
-			void *cookie;
-			struct path link = path;
-			err = may_follow_link(&link, nd);
-			if (unlikely(err))
-				break;
-			nd->flags |= LOOKUP_PARENT;
-			err = follow_link(&link, nd, &cookie);
-			if (err)
-				break;
-			err = lookup_last(nd, &path);
-			put_link(nd, &link, cookie);
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+	while (!(err = link_path_walk(s, nd))
+		&& ((err = lookup_last(nd)) > 0)) {
+		s = trailing_symlink(nd);
+		if (IS_ERR(s)) {
+			err = PTR_ERR(s);
+			break;
 		}
 	}
-
 	if (!err)
 		err = complete_walk(nd);
 
-	if (!err && nd->flags & LOOKUP_DIRECTORY) {
-		if (!d_can_lookup(nd->path.dentry)) {
-			path_put(&nd->path);
+	if (!err && nd->flags & LOOKUP_DIRECTORY)
+		if (!d_can_lookup(nd->path.dentry))
 			err = -ENOTDIR;
-		}
+	if (!err) {
+		*path = nd->path;
+		nd->path.mnt = NULL;
+		nd->path.dentry = NULL;
 	}
-
-	path_cleanup(nd);
+	terminate_walk(nd);
 	return err;
 }
 
-static int filename_lookup(int dfd, struct filename *name,
-				unsigned int flags, struct nameidata *nd)
+static int filename_lookup(int dfd, struct filename *name, unsigned flags,
+			   struct path *path, struct path *root)
 {
-	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+	int retval;
+	struct nameidata nd;
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+	if (unlikely(root)) {
+		nd.root = *root;
+		flags |= LOOKUP_ROOT;
+	}
+	set_nameidata(&nd, dfd, name);
+	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
 	if (unlikely(retval == -ECHILD))
-		retval = path_lookupat(dfd, name, flags, nd);
+		retval = path_lookupat(&nd, flags, path);
 	if (unlikely(retval == -ESTALE))
-		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
+		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
 
 	if (likely(!retval))
-		audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
+		audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
+	restore_nameidata();
+	putname(name);
 	return retval;
 }
 
+/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+static int path_parentat(struct nameidata *nd, unsigned flags,
+				struct path *parent)
+{
+	const char *s = path_init(nd, flags);
+	int err;
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+	err = link_path_walk(s, nd);
+	if (!err)
+		err = complete_walk(nd);
+	if (!err) {
+		*parent = nd->path;
+		nd->path.mnt = NULL;
+		nd->path.dentry = NULL;
+	}
+	terminate_walk(nd);
+	return err;
+}
+
+static struct filename *filename_parentat(int dfd, struct filename *name,
+				unsigned int flags, struct path *parent,
+				struct qstr *last, int *type)
+{
+	int retval;
+	struct nameidata nd;
+
+	if (IS_ERR(name))
+		return name;
+	set_nameidata(&nd, dfd, name);
+	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
+	if (unlikely(retval == -ECHILD))
+		retval = path_parentat(&nd, flags, parent);
+	if (unlikely(retval == -ESTALE))
+		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
+	if (likely(!retval)) {
+		*last = nd.last;
+		*type = nd.last_type;
+		audit_inode(name, parent->dentry, LOOKUP_PARENT);
+	} else {
+		putname(name);
+		name = ERR_PTR(retval);
+	}
+	restore_nameidata();
+	return name;
+}
+
 /* does lookup, returns the object with parent locked */
 struct dentry *kern_path_locked(const char *name, struct path *path)
 {
-	struct filename *filename = getname_kernel(name);
-	struct nameidata nd;
+	struct filename *filename;
 	struct dentry *d;
-	int err;
+	struct qstr last;
+	int type;
 
+	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+				    &last, &type);
 	if (IS_ERR(filename))
 		return ERR_CAST(filename);
-
-	err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
-	if (err) {
-		d = ERR_PTR(err);
-		goto out;
-	}
-	if (nd.last_type != LAST_NORM) {
-		path_put(&nd.path);
-		d = ERR_PTR(-EINVAL);
-		goto out;
+	if (unlikely(type != LAST_NORM)) {
+		path_put(path);
+		putname(filename);
+		return ERR_PTR(-EINVAL);
 	}
-	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	d = __lookup_hash(&nd.last, nd.path.dentry, 0);
+	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	d = __lookup_hash(&last, path->dentry, 0);
 	if (IS_ERR(d)) {
-		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-		path_put(&nd.path);
-		goto out;
+		mutex_unlock(&path->dentry->d_inode->i_mutex);
+		path_put(path);
 	}
-	*path = nd.path;
-out:
 	putname(filename);
 	return d;
 }
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
-	struct nameidata nd;
-	struct filename *filename = getname_kernel(name);
-	int res = PTR_ERR(filename);
-
-	if (!IS_ERR(filename)) {
-		res = filename_lookup(AT_FDCWD, filename, flags, &nd);
-		putname(filename);
-		if (!res)
-			*path = nd.path;
-	}
-	return res;
+	return filename_lookup(AT_FDCWD, getname_kernel(name),
+			       flags, path, NULL);
 }
 EXPORT_SYMBOL(kern_path);
 
@@ -2109,36 +2240,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct path *path)
 {
-	struct filename *filename = getname_kernel(name);
-	int err = PTR_ERR(filename);
-
-	BUG_ON(flags & LOOKUP_PARENT);
-
-	/* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
-	if (!IS_ERR(filename)) {
-		struct nameidata nd;
-		nd.root.dentry = dentry;
-		nd.root.mnt = mnt;
-		err = filename_lookup(AT_FDCWD, filename,
-				      flags | LOOKUP_ROOT, &nd);
-		if (!err)
-			*path = nd.path;
-		putname(filename);
-	}
-	return err;
+	struct path root = {.mnt = mnt, .dentry = dentry};
+	/* the first argument of filename_lookup() is ignored with root */
+	return filename_lookup(AT_FDCWD, getname_kernel(name),
+			       flags , path, &root);
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-/*
- * Restricted form of lookup. Doesn't follow links, single-component only,
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
- */
-static struct dentry *lookup_hash(struct nameidata *nd)
-{
-	return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
-}
-
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:	pathname component to lookup
@@ -2193,27 +2301,10 @@ EXPORT_SYMBOL(lookup_one_len);
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
-	struct nameidata nd;
-	struct filename *tmp = getname_flags(name, flags, empty);
-	int err = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-
-		BUG_ON(flags & LOOKUP_PARENT);
-
-		err = filename_lookup(dfd, tmp, flags, &nd);
-		putname(tmp);
-		if (!err)
-			*path = nd.path;
-	}
-	return err;
-}
-
-int user_path_at(int dfd, const char __user *name, unsigned flags,
-		 struct path *path)
-{
-	return user_path_at_empty(dfd, name, flags, path, NULL);
+	return filename_lookup(dfd, getname_flags(name, flags, empty),
+			       flags, path, NULL);
 }
-EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(user_path_at_empty);
 
 /*
  * NB: most callers don't do anything directly with the reference to the
@@ -2221,26 +2312,16 @@ EXPORT_SYMBOL(user_path_at);
  *     allocated by getname. So we must hold the reference to it until all
  *     path-walking is complete.
  */
-static struct filename *
-user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+static inline struct filename *
+user_path_parent(int dfd, const char __user *path,
+		 struct path *parent,
+		 struct qstr *last,
+		 int *type,
 		 unsigned int flags)
 {
-	struct filename *s = getname(path);
-	int error;
-
 	/* only LOOKUP_REVAL is allowed in extra flags */
-	flags &= LOOKUP_REVAL;
-
-	if (IS_ERR(s))
-		return s;
-
-	error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
-	if (error) {
-		putname(s);
-		return ERR_PTR(error);
-	}
-
-	return s;
+	return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
+				 parent, last, type);
 }
 
 /**
@@ -2279,10 +2360,8 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 
 	/* If we're in rcuwalk, drop out of it to handle last component */
 	if (nd->flags & LOOKUP_RCU) {
-		if (unlazy_walk(nd, NULL)) {
-			error = -ECHILD;
-			goto out;
-		}
+		if (unlazy_walk(nd, NULL, 0))
+			return -ECHILD;
 	}
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2290,7 +2369,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 	if (unlikely(nd->last_type != LAST_NORM)) {
 		error = handle_dots(nd, nd->last_type);
 		if (error)
-			goto out;
+			return error;
 		dentry = dget(nd->path.dentry);
 		goto done;
 	}
@@ -2305,74 +2384,60 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 		 */
 		dentry = d_alloc(dir, &nd->last);
 		if (!dentry) {
-			error = -ENOMEM;
 			mutex_unlock(&dir->d_inode->i_mutex);
-			goto out;
+			return -ENOMEM;
 		}
 		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
-		error = PTR_ERR(dentry);
 		if (IS_ERR(dentry)) {
 			mutex_unlock(&dir->d_inode->i_mutex);
-			goto out;
+			return PTR_ERR(dentry);
 		}
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 done:
 	if (d_is_negative(dentry)) {
-		error = -ENOENT;
 		dput(dentry);
-		goto out;
+		return -ENOENT;
 	}
+	if (nd->depth)
+		put_link(nd);
 	path->dentry = dentry;
 	path->mnt = nd->path.mnt;
-	if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
-		return 1;
+	error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
+				   d_backing_inode(dentry), 0);
+	if (unlikely(error))
+		return error;
 	mntget(path->mnt);
 	follow_mount(path);
-	error = 0;
-out:
-	terminate_walk(nd);
-	return error;
+	return 0;
 }
 
 /**
  * path_mountpoint - look up a path to be umounted
- * @dfd:	directory file descriptor to start walk from
- * @name:	full pathname to walk
- * @path:	pointer to container for result
+ * @nameidata:	lookup context
  * @flags:	lookup flags
+ * @path:	pointer to container for result
  *
  * Look up the given name, but don't attempt to revalidate the last component.
  * Returns 0 and "path" will be valid on success; Returns error otherwise.
  */
 static int
-path_mountpoint(int dfd, const struct filename *name, struct path *path,
-		unsigned int flags)
+path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
 {
-	struct nameidata nd;
+	const char *s = path_init(nd, flags);
 	int err;
-
-	err = path_init(dfd, name, flags, &nd);
-	if (unlikely(err))
-		goto out;
-
-	err = mountpoint_last(&nd, path);
-	while (err > 0) {
-		void *cookie;
-		struct path link = *path;
-		err = may_follow_link(&link, &nd);
-		if (unlikely(err))
-			break;
-		nd.flags |= LOOKUP_PARENT;
-		err = follow_link(&link, &nd, &cookie);
-		if (err)
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+	while (!(err = link_path_walk(s, nd)) &&
+		(err = mountpoint_last(nd, path)) > 0) {
+		s = trailing_symlink(nd);
+		if (IS_ERR(s)) {
+			err = PTR_ERR(s);
 			break;
-		err = mountpoint_last(&nd, path);
-		put_link(&nd, &link, cookie);
+		}
 	}
-out:
-	path_cleanup(&nd);
+	terminate_walk(nd);
 	return err;
 }
 
@@ -2380,16 +2445,19 @@ static int
 filename_mountpoint(int dfd, struct filename *name, struct path *path,
 			unsigned int flags)
 {
+	struct nameidata nd;
 	int error;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
-	error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
+	set_nameidata(&nd, dfd, name);
+	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
 	if (unlikely(error == -ECHILD))
-		error = path_mountpoint(dfd, name, path, flags);
+		error = path_mountpoint(&nd, flags, path);
 	if (unlikely(error == -ESTALE))
-		error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
+		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
 	if (likely(!error))
 		audit_inode(name, path->dentry, 0);
+	restore_nameidata();
 	putname(name);
 	return error;
 }
@@ -2456,7 +2524,7 @@ EXPORT_SYMBOL(__check_sticky);
  */
 static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 {
-	struct inode *inode = victim->d_inode;
+	struct inode *inode = d_backing_inode(victim);
 	int error;
 
 	if (d_is_negative(victim))
@@ -2922,18 +2990,19 @@ out_dput:
 /*
  * Handle the last step of open()
  */
-static int do_last(struct nameidata *nd, struct path *path,
+static int do_last(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op,
-		   int *opened, struct filename *name)
+		   int *opened)
 {
 	struct dentry *dir = nd->path.dentry;
 	int open_flag = op->open_flag;
 	bool will_truncate = (open_flag & O_TRUNC) != 0;
 	bool got_write = false;
 	int acc_mode = op->acc_mode;
+	unsigned seq;
 	struct inode *inode;
-	bool symlink_ok = false;
 	struct path save_parent = { .dentry = NULL, .mnt = NULL };
+	struct path path;
 	bool retried = false;
 	int error;
 
@@ -2942,7 +3011,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 
 	if (nd->last_type != LAST_NORM) {
 		error = handle_dots(nd, nd->last_type);
-		if (error)
+		if (unlikely(error))
 			return error;
 		goto finish_open;
 	}
@@ -2950,15 +3019,13 @@ static int do_last(struct nameidata *nd, struct path *path,
 	if (!(open_flag & O_CREAT)) {
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
-			symlink_ok = true;
 		/* we _can_ be in RCU mode here */
-		error = lookup_fast(nd, path, &inode);
+		error = lookup_fast(nd, &path, &inode, &seq);
 		if (likely(!error))
 			goto finish_lookup;
 
 		if (error < 0)
-			goto out;
+			return error;
 
 		BUG_ON(nd->inode != dir->d_inode);
 	} else {
@@ -2972,11 +3039,10 @@ static int do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			return error;
 
-		audit_inode(name, dir, LOOKUP_PARENT);
-		error = -EISDIR;
+		audit_inode(nd->name, dir, LOOKUP_PARENT);
 		/* trailing slashes? */
-		if (nd->last.name[nd->last.len])
-			goto out;
+		if (unlikely(nd->last.name[nd->last.len]))
+			return -EISDIR;
 	}
 
 retry_lookup:
@@ -2991,7 +3057,7 @@ retry_lookup:
 		 */
 	}
 	mutex_lock(&dir->d_inode->i_mutex);
-	error = lookup_open(nd, path, file, op, got_write, opened);
+	error = lookup_open(nd, &path, file, op, got_write, opened);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 	if (error <= 0) {
@@ -3002,7 +3068,7 @@ retry_lookup:
 		    !S_ISREG(file_inode(file)->i_mode))
 			will_truncate = false;
 
-		audit_inode(name, file->f_path.dentry, 0);
+		audit_inode(nd->name, file->f_path.dentry, 0);
 		goto opened;
 	}
 
@@ -3011,15 +3077,15 @@ retry_lookup:
 		open_flag &= ~O_TRUNC;
 		will_truncate = false;
 		acc_mode = MAY_OPEN;
-		path_to_nameidata(path, nd);
+		path_to_nameidata(&path, nd);
 		goto finish_open_created;
 	}
 
 	/*
 	 * create/update audit record if it already exists.
 	 */
-	if (d_is_positive(path->dentry))
-		audit_inode(name, path->dentry, 0);
+	if (d_is_positive(path.dentry))
+		audit_inode(nd->name, path.dentry, 0);
 
 	/*
 	 * If atomic_open() acquired write access it is dropped now due to
@@ -3031,47 +3097,45 @@ retry_lookup:
 		got_write = false;
 	}
 
-	error = -EEXIST;
-	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
-		goto exit_dput;
-
-	error = follow_managed(path, nd->flags);
-	if (error < 0)
-		goto exit_dput;
+	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
+		path_to_nameidata(&path, nd);
+		return -EEXIST;
+	}
 
-	if (error)
-		nd->flags |= LOOKUP_JUMPED;
+	error = follow_managed(&path, nd);
+	if (unlikely(error < 0))
+		return error;
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
-	inode = path->dentry->d_inode;
-	error = -ENOENT;
-	if (d_is_negative(path->dentry)) {
-		path_to_nameidata(path, nd);
-		goto out;
+	inode = d_backing_inode(path.dentry);
+	seq = 0;	/* out of RCU mode, so the value doesn't matter */
+	if (unlikely(d_is_negative(path.dentry))) {
+		path_to_nameidata(&path, nd);
+		return -ENOENT;
 	}
 finish_lookup:
-	/* we _can_ be in RCU mode here */
-	if (should_follow_link(path->dentry, !symlink_ok)) {
-		if (nd->flags & LOOKUP_RCU) {
-			if (unlikely(nd->path.mnt != path->mnt ||
-				     unlazy_walk(nd, path->dentry))) {
-				error = -ECHILD;
-				goto out;
-			}
-		}
-		BUG_ON(inode != path->dentry->d_inode);
-		return 1;
+	if (nd->depth)
+		put_link(nd);
+	error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
+				   inode, seq);
+	if (unlikely(error))
+		return error;
+
+	if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
+		path_to_nameidata(&path, nd);
+		return -ELOOP;
 	}
 
-	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
-		path_to_nameidata(path, nd);
+	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
+		path_to_nameidata(&path, nd);
 	} else {
 		save_parent.dentry = nd->path.dentry;
-		save_parent.mnt = mntget(path->mnt);
-		nd->path.dentry = path->dentry;
+		save_parent.mnt = mntget(path.mnt);
+		nd->path.dentry = path.dentry;
 
 	}
 	nd->inode = inode;
+	nd->seq = seq;
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 finish_open:
 	error = complete_walk(nd);
@@ -3079,7 +3143,7 @@ finish_open:
 		path_put(&save_parent);
 		return error;
 	}
-	audit_inode(name, nd->path.dentry, 0);
+	audit_inode(nd->name, nd->path.dentry, 0);
 	error = -EISDIR;
 	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
 		goto out;
@@ -3126,12 +3190,8 @@ out:
 	if (got_write)
 		mnt_drop_write(nd->path.mnt);
 	path_put(&save_parent);
-	terminate_walk(nd);
 	return error;
 
-exit_dput:
-	path_put_conditional(path, nd);
-	goto out;
 exit_fput:
 	fput(file);
 	goto out;
@@ -3155,50 +3215,46 @@ stale_open:
 	goto retry_lookup;
 }
 
-static int do_tmpfile(int dfd, struct filename *pathname,
-		struct nameidata *nd, int flags,
+static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		const struct open_flags *op,
 		struct file *file, int *opened)
 {
 	static const struct qstr name = QSTR_INIT("/", 1);
-	struct dentry *dentry, *child;
+	struct dentry *child;
 	struct inode *dir;
-	int error = path_lookupat(dfd, pathname,
-				  flags | LOOKUP_DIRECTORY, nd);
+	struct path path;
+	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
 	if (unlikely(error))
 		return error;
-	error = mnt_want_write(nd->path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (unlikely(error))
 		goto out;
+	dir = path.dentry->d_inode;
 	/* we want directory to be writable */
-	error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out2;
-	dentry = nd->path.dentry;
-	dir = dentry->d_inode;
 	if (!dir->i_op->tmpfile) {
 		error = -EOPNOTSUPP;
 		goto out2;
 	}
-	child = d_alloc(dentry, &name);
+	child = d_alloc(path.dentry, &name);
 	if (unlikely(!child)) {
 		error = -ENOMEM;
 		goto out2;
 	}
-	nd->flags &= ~LOOKUP_DIRECTORY;
-	nd->flags |= op->intent;
-	dput(nd->path.dentry);
-	nd->path.dentry = child;
-	error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+	dput(path.dentry);
+	path.dentry = child;
+	error = dir->i_op->tmpfile(dir, child, op->mode);
 	if (error)
 		goto out2;
-	audit_inode(pathname, nd->path.dentry, 0);
+	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
-	error = may_open(&nd->path, MAY_OPEN, op->open_flag);
+	error = may_open(&path, MAY_OPEN, op->open_flag);
 	if (error)
 		goto out2;
-	file->f_path.mnt = nd->path.mnt;
-	error = finish_open(file, nd->path.dentry, NULL, opened);
+	file->f_path.mnt = path.mnt;
+	error = finish_open(file, child, NULL, opened);
 	if (error)
 		goto out2;
 	error = open_check_o_direct(file);
@@ -3211,17 +3267,17 @@ static int do_tmpfile(int dfd, struct filename *pathname,
 		spin_unlock(&inode->i_lock);
 	}
 out2:
-	mnt_drop_write(nd->path.mnt);
+	mnt_drop_write(path.mnt);
 out:
-	path_put(&nd->path);
+	path_put(&path);
 	return error;
 }
 
-static struct file *path_openat(int dfd, struct filename *pathname,
-		struct nameidata *nd, const struct open_flags *op, int flags)
+static struct file *path_openat(struct nameidata *nd,
+			const struct open_flags *op, unsigned flags)
 {
+	const char *s;
 	struct file *file;
-	struct path path;
 	int opened = 0;
 	int error;
 
@@ -3232,37 +3288,25 @@ static struct file *path_openat(int dfd, struct filename *pathname,
 	file->f_flags = op->open_flag;
 
 	if (unlikely(file->f_flags & __O_TMPFILE)) {
-		error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
+		error = do_tmpfile(nd, flags, op, file, &opened);
 		goto out2;
 	}
 
-	error = path_init(dfd, pathname, flags, nd);
-	if (unlikely(error))
-		goto out;
-
-	error = do_last(nd, &path, file, op, &opened, pathname);
-	while (unlikely(error > 0)) { /* trailing symlink */
-		struct path link = path;
-		void *cookie;
-		if (!(nd->flags & LOOKUP_FOLLOW)) {
-			path_put_conditional(&path, nd);
-			path_put(&nd->path);
-			error = -ELOOP;
-			break;
-		}
-		error = may_follow_link(&link, nd);
-		if (unlikely(error))
-			break;
-		nd->flags |= LOOKUP_PARENT;
+	s = path_init(nd, flags);
+	if (IS_ERR(s)) {
+		put_filp(file);
+		return ERR_CAST(s);
+	}
+	while (!(error = link_path_walk(s, nd)) &&
+		(error = do_last(nd, file, op, &opened)) > 0) {
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = follow_link(&link, nd, &cookie);
-		if (unlikely(error))
+		s = trailing_symlink(nd);
+		if (IS_ERR(s)) {
+			error = PTR_ERR(s);
 			break;
-		error = do_last(nd, &path, file, op, &opened, pathname);
-		put_link(nd, &link, cookie);
+		}
 	}
-out:
-	path_cleanup(nd);
+	terminate_walk(nd);
 out2:
 	if (!(opened & FILE_OPENED)) {
 		BUG_ON(!error);
@@ -3287,11 +3331,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
 	int flags = op->lookup_flags;
 	struct file *filp;
 
-	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
+	set_nameidata(&nd, dfd, pathname);
+	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
-		filp = path_openat(dfd, pathname, &nd, op, flags);
+		filp = path_openat(&nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
-		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
+	restore_nameidata();
 	return filp;
 }
 
@@ -3313,11 +3359,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	if (unlikely(IS_ERR(filename)))
 		return ERR_CAST(filename);
 
-	file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
+	set_nameidata(&nd, -1, filename);
+	file = path_openat(&nd, op, flags | LOOKUP_RCU);
 	if (unlikely(file == ERR_PTR(-ECHILD)))
-		file = path_openat(-1, filename, &nd, op, flags);
+		file = path_openat(&nd, op, flags);
 	if (unlikely(file == ERR_PTR(-ESTALE)))
-		file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
+		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
+	restore_nameidata();
 	putname(filename);
 	return file;
 }
@@ -3326,7 +3374,8 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 				struct path *path, unsigned int lookup_flags)
 {
 	struct dentry *dentry = ERR_PTR(-EEXIST);
-	struct nameidata nd;
+	struct qstr last;
+	int type;
 	int err2;
 	int error;
 	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
@@ -3337,26 +3386,25 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 	 */
 	lookup_flags &= LOOKUP_REVAL;
 
-	error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
-	if (error)
-		return ERR_PTR(error);
+	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+	if (IS_ERR(name))
+		return ERR_CAST(name);
 
 	/*
 	 * Yucky last component or no last component at all?
 	 * (foo/., foo/.., /////)
 	 */
-	if (nd.last_type != LAST_NORM)
+	if (unlikely(type != LAST_NORM))
 		goto out;
-	nd.flags &= ~LOOKUP_PARENT;
-	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
 
 	/* don't fail immediately if it's r/o, at least try to report other errors */
-	err2 = mnt_want_write(nd.path.mnt);
+	err2 = mnt_want_write(path->mnt);
 	/*
 	 * Do the final lookup.
 	 */
-	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_hash(&nd);
+	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
 	if (IS_ERR(dentry))
 		goto unlock;
 
@@ -3370,7 +3418,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 	 * all is fine. Let's be bastards - you had / on the end, you've
 	 * been asking for (non-existent) directory. -ENOENT for you.
 	 */
-	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
+	if (unlikely(!is_dir && last.name[last.len])) {
 		error = -ENOENT;
 		goto fail;
 	}
@@ -3378,31 +3426,26 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 		error = err2;
 		goto fail;
 	}
-	*path = nd.path;
+	putname(name);
 	return dentry;
 fail:
 	dput(dentry);
 	dentry = ERR_PTR(error);
 unlock:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
 	if (!err2)
-		mnt_drop_write(nd.path.mnt);
+		mnt_drop_write(path->mnt);
 out:
-	path_put(&nd.path);
+	path_put(path);
+	putname(name);
 	return dentry;
 }
 
 struct dentry *kern_path_create(int dfd, const char *pathname,
 				struct path *path, unsigned int lookup_flags)
 {
-	struct filename *filename = getname_kernel(pathname);
-	struct dentry *res;
-
-	if (IS_ERR(filename))
-		return ERR_CAST(filename);
-	res = filename_create(dfd, filename, path, lookup_flags);
-	putname(filename);
-	return res;
+	return filename_create(dfd, getname_kernel(pathname),
+				path, lookup_flags);
 }
 EXPORT_SYMBOL(kern_path_create);
 
@@ -3415,16 +3458,10 @@ void done_path_create(struct path *path, struct dentry *dentry)
 }
 EXPORT_SYMBOL(done_path_create);
 
-struct dentry *user_path_create(int dfd, const char __user *pathname,
+inline struct dentry *user_path_create(int dfd, const char __user *pathname,
 				struct path *path, unsigned int lookup_flags)
 {
-	struct filename *tmp = getname(pathname);
-	struct dentry *res;
-	if (IS_ERR(tmp))
-		return ERR_CAST(tmp);
-	res = filename_create(dfd, tmp, path, lookup_flags);
-	putname(tmp);
-	return res;
+	return filename_create(dfd, getname(pathname), path, lookup_flags);
 }
 EXPORT_SYMBOL(user_path_create);
 
@@ -3645,14 +3682,17 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	int error = 0;
 	struct filename *name;
 	struct dentry *dentry;
-	struct nameidata nd;
+	struct path path;
+	struct qstr last;
+	int type;
 	unsigned int lookup_flags = 0;
 retry:
-	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+	name = user_path_parent(dfd, pathname,
+				&path, &last, &type, lookup_flags);
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
-	switch(nd.last_type) {
+	switch (type) {
 	case LAST_DOTDOT:
 		error = -ENOTEMPTY;
 		goto exit1;
@@ -3664,13 +3704,12 @@ retry:
 		goto exit1;
 	}
 
-	nd.flags &= ~LOOKUP_PARENT;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto exit1;
 
-	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_hash(&nd);
+	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit2;
@@ -3678,17 +3717,17 @@ retry:
 		error = -ENOENT;
 		goto exit3;
 	}
-	error = security_path_rmdir(&nd.path, dentry);
+	error = security_path_rmdir(&path, dentry);
 	if (error)
 		goto exit3;
-	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+	error = vfs_rmdir(path.dentry->d_inode, dentry);
 exit3:
 	dput(dentry);
 exit2:
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-	mnt_drop_write(nd.path.mnt);
+	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	mnt_drop_write(path.mnt);
 exit1:
-	path_put(&nd.path);
+	path_put(&path);
 	putname(name);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -3771,43 +3810,45 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 	int error;
 	struct filename *name;
 	struct dentry *dentry;
-	struct nameidata nd;
+	struct path path;
+	struct qstr last;
+	int type;
 	struct inode *inode = NULL;
 	struct inode *delegated_inode = NULL;
 	unsigned int lookup_flags = 0;
 retry:
-	name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+	name = user_path_parent(dfd, pathname,
+				&path, &last, &type, lookup_flags);
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
 	error = -EISDIR;
-	if (nd.last_type != LAST_NORM)
+	if (type != LAST_NORM)
 		goto exit1;
 
-	nd.flags &= ~LOOKUP_PARENT;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto exit1;
 retry_deleg:
-	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_hash(&nd);
+	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
 		/* Why not before? Because we want correct error value */
-		if (nd.last.name[nd.last.len])
+		if (last.name[last.len])
 			goto slashes;
 		inode = dentry->d_inode;
 		if (d_is_negative(dentry))
 			goto slashes;
 		ihold(inode);
-		error = security_path_unlink(&nd.path, dentry);
+		error = security_path_unlink(&path, dentry);
 		if (error)
 			goto exit2;
-		error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
+		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
 exit2:
 		dput(dentry);
 	}
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	mutex_unlock(&path.dentry->d_inode->i_mutex);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
 	inode = NULL;
@@ -3816,9 +3857,9 @@ exit2:
 		if (!error)
 			goto retry_deleg;
 	}
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(path.mnt);
 exit1:
-	path_put(&nd.path);
+	path_put(&path);
 	putname(name);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -4248,14 +4289,15 @@ EXPORT_SYMBOL(vfs_rename);
 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 		int, newdfd, const char __user *, newname, unsigned int, flags)
 {
-	struct dentry *old_dir, *new_dir;
 	struct dentry *old_dentry, *new_dentry;
 	struct dentry *trap;
-	struct nameidata oldnd, newnd;
+	struct path old_path, new_path;
+	struct qstr old_last, new_last;
+	int old_type, new_type;
 	struct inode *delegated_inode = NULL;
 	struct filename *from;
 	struct filename *to;
-	unsigned int lookup_flags = 0;
+	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
 	bool should_retry = false;
 	int error;
 
@@ -4269,47 +4311,45 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
 		return -EPERM;
 
+	if (flags & RENAME_EXCHANGE)
+		target_flags = 0;
+
 retry:
-	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
+	from = user_path_parent(olddfd, oldname,
+				&old_path, &old_last, &old_type, lookup_flags);
 	if (IS_ERR(from)) {
 		error = PTR_ERR(from);
 		goto exit;
 	}
 
-	to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
+	to = user_path_parent(newdfd, newname,
+				&new_path, &new_last, &new_type, lookup_flags);
 	if (IS_ERR(to)) {
 		error = PTR_ERR(to);
 		goto exit1;
 	}
 
 	error = -EXDEV;
-	if (oldnd.path.mnt != newnd.path.mnt)
+	if (old_path.mnt != new_path.mnt)
 		goto exit2;
 
-	old_dir = oldnd.path.dentry;
 	error = -EBUSY;
-	if (oldnd.last_type != LAST_NORM)
+	if (old_type != LAST_NORM)
 		goto exit2;
 
-	new_dir = newnd.path.dentry;
 	if (flags & RENAME_NOREPLACE)
 		error = -EEXIST;
-	if (newnd.last_type != LAST_NORM)
+	if (new_type != LAST_NORM)
 		goto exit2;
 
-	error = mnt_want_write(oldnd.path.mnt);
+	error = mnt_want_write(old_path.mnt);
 	if (error)
 		goto exit2;
 
-	oldnd.flags &= ~LOOKUP_PARENT;
-	newnd.flags &= ~LOOKUP_PARENT;
-	if (!(flags & RENAME_EXCHANGE))
-		newnd.flags |= LOOKUP_RENAME_TARGET;
-
 retry_deleg:
-	trap = lock_rename(new_dir, old_dir);
+	trap = lock_rename(new_path.dentry, old_path.dentry);
 
-	old_dentry = lookup_hash(&oldnd);
+	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
 	error = PTR_ERR(old_dentry);
 	if (IS_ERR(old_dentry))
 		goto exit3;
@@ -4317,7 +4357,7 @@ retry_deleg:
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
-	new_dentry = lookup_hash(&newnd);
+	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
 		goto exit4;
@@ -4331,16 +4371,16 @@ retry_deleg:
 
 		if (!d_is_dir(new_dentry)) {
 			error = -ENOTDIR;
-			if (newnd.last.name[newnd.last.len])
+			if (new_last.name[new_last.len])
 				goto exit5;
 		}
 	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
 	if (!d_is_dir(old_dentry)) {
 		error = -ENOTDIR;
-		if (oldnd.last.name[oldnd.last.len])
+		if (old_last.name[old_last.len])
 			goto exit5;
-		if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
 			goto exit5;
 	}
 	/* source should not be ancestor of target */
@@ -4353,32 +4393,32 @@ retry_deleg:
 	if (new_dentry == trap)
 		goto exit5;
 
-	error = security_path_rename(&oldnd.path, old_dentry,
-				     &newnd.path, new_dentry, flags);
+	error = security_path_rename(&old_path, old_dentry,
+				     &new_path, new_dentry, flags);
 	if (error)
 		goto exit5;
-	error = vfs_rename(old_dir->d_inode, old_dentry,
-			   new_dir->d_inode, new_dentry,
+	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+			   new_path.dentry->d_inode, new_dentry,
 			   &delegated_inode, flags);
 exit5:
 	dput(new_dentry);
 exit4:
 	dput(old_dentry);
 exit3:
-	unlock_rename(new_dir, old_dir);
+	unlock_rename(new_path.dentry, old_path.dentry);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
 	}
-	mnt_drop_write(oldnd.path.mnt);
+	mnt_drop_write(old_path.mnt);
 exit2:
 	if (retry_estale(error, lookup_flags))
 		should_retry = true;
-	path_put(&newnd.path);
+	path_put(&new_path);
 	putname(to);
 exit1:
-	path_put(&oldnd.path);
+	path_put(&old_path);
 	putname(from);
 	if (should_retry) {
 		should_retry = false;
@@ -4437,18 +4477,19 @@ EXPORT_SYMBOL(readlink_copy);
  */
 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
-	struct nameidata nd;
 	void *cookie;
+	struct inode *inode = d_inode(dentry);
+	const char *link = inode->i_link;
 	int res;
 
-	nd.depth = 0;
-	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
-	if (IS_ERR(cookie))
-		return PTR_ERR(cookie);
-
-	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
-	if (dentry->d_inode->i_op->put_link)
-		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+	if (!link) {
+		link = inode->i_op->follow_link(dentry, &cookie);
+		if (IS_ERR(link))
+			return PTR_ERR(link);
+	}
+	res = readlink_copy(buffer, buflen, link);
+	if (inode->i_op->put_link)
+		inode->i_op->put_link(inode, cookie);
 	return res;
 }
 EXPORT_SYMBOL(generic_readlink);
@@ -4480,22 +4521,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 }
 EXPORT_SYMBOL(page_readlink);
 
-void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
+const char *page_follow_link_light(struct dentry *dentry, void **cookie)
 {
 	struct page *page = NULL;
-	nd_set_link(nd, page_getlink(dentry, &page));
-	return page;
+	char *res = page_getlink(dentry, &page);
+	if (!IS_ERR(res))
+		*cookie = page;
+	return res;
 }
 EXPORT_SYMBOL(page_follow_link_light);
 
-void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+void page_put_link(struct inode *unused, void *cookie)
 {
 	struct page *page = cookie;
-
-	if (page) {
-		kunmap(page);
-		page_cache_release(page);
-	}
+	kunmap(page);
+	page_cache_release(page);
 }
 EXPORT_SYMBOL(page_put_link);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 777b4819c..2b8aa15fd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -463,7 +463,6 @@ void __mnt_drop_write(struct vfsmount *mnt)
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(__mnt_drop_write);
 
 /**
  * mnt_drop_write - give up write access to a mount
@@ -591,24 +590,35 @@ static void delayed_free_vfsmnt(struct rcu_head *head)
 }
 
 /* call under rcu_read_lock */
-bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 {
 	struct mount *mnt;
 	if (read_seqretry(&mount_lock, seq))
-		return false;
+		return 1;
 	if (bastard == NULL)
-		return true;
+		return 0;
 	mnt = real_mount(bastard);
 	mnt_add_count(mnt, 1);
 	if (likely(!read_seqretry(&mount_lock, seq)))
-		return true;
+		return 0;
 	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
 		mnt_add_count(mnt, -1);
-		return false;
+		return 1;
+	}
+	return -1;
+}
+
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	int res = __legitimize_mnt(bastard, seq);
+	if (likely(!res))
+		return true;
+	if (unlikely(res < 0)) {
+		rcu_read_unlock();
+		mntput(bastard);
+		rcu_read_lock();
 	}
-	rcu_read_unlock();
-	mntput(bastard);
-	rcu_read_lock();
 	return false;
 }
 
@@ -1216,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options);
 /* iterator; we want it to have access to namespace_sem, thus here... */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct proc_mounts *p = proc_mounts(m);
+	struct proc_mounts *p = m->private;
 
 	down_read(&namespace_sem);
 	if (p->cached_event == p->ns->event) {
@@ -1237,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct proc_mounts *p = proc_mounts(m);
+	struct proc_mounts *p = m->private;
 
 	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
 	p->cached_index = *pos;
@@ -1251,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v)
 
 static int m_show(struct seq_file *m, void *v)
 {
-	struct proc_mounts *p = proc_mounts(m);
+	struct proc_mounts *p = m->private;
 	struct mount *r = list_entry(v, struct mount, mnt_list);
 	return p->show(m, &r->mnt);
 }
@@ -1793,7 +1803,6 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(iterate_mounts);
 
 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
 {
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 80021c709..93575e91a 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		case 0x00:
 			ncp_dbg(1, "renamed %pd -> %pd\n",
 				old_dentry, new_dentry);
+			ncp_d_prune(old_dentry);
+			ncp_d_prune(new_dentry);
 			break;
 		case 0x9E:
 			error = -ENAMETOOLONG;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 8d129bb73..682529c00 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
  * pg_authenticate method for nfsv4 callback threads.
  *
  * The authflavor has been negotiated, so an incorrect flavor is a server
- * bug. Drop packets with incorrect authflavor.
+ * bug. Deny packets with incorrect authflavor.
  *
  * All other checking done after NFS decoding where the nfs_client can be
  * found in nfs4_callback_compound
@@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 	switch (rqstp->rq_authop->flavour) {
 	case RPC_AUTH_NULL:
 		if (rqstp->rq_proc != CB_NULL)
-			return SVC_DROP;
+			return SVC_DENIED;
 		break;
 	case RPC_AUTH_GSS:
 		/* No RPC_AUTH_GSS support yet in NFSv4.1 */
 		 if (svc_is_backchannel(rqstp))
-			return SVC_DROP;
+			return SVC_DENIED;
 	}
 	return SVC_OK;
 }
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 197806fb8..29e3c1b01 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
 
 	/* Normal */
-	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
-		slot->seq_nr++;
+	if (likely(args->csa_sequenceid == slot->seq_nr + 1))
 		goto out_ok;
-	}
 
 	/* Replay */
 	if (args->csa_sequenceid == slot->seq_nr) {
@@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 			      struct cb_process_state *cps)
 {
 	struct nfs4_slot_table *tbl;
+	struct nfs4_slot *slot;
 	struct nfs_client *clp;
 	int i;
 	__be32 status = htonl(NFS4ERR_BADSESSION);
@@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 
 	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
 		goto out;
+
 	tbl = &clp->cl_session->bc_slot_table;
+	slot = tbl->slots + args->csa_slotid;
 
 	spin_lock(&tbl->slot_tbl_lock);
 	/* state manager is resetting the session */
 	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
-		spin_unlock(&tbl->slot_tbl_lock);
 		status = htonl(NFS4ERR_DELAY);
 		/* Return NFS4ERR_BADSESSION if we're draining the session
 		 * in order to reset it.
 		 */
 		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
 			status = htonl(NFS4ERR_BADSESSION);
-		goto out;
+		goto out_unlock;
 	}
 
-	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
-	spin_unlock(&tbl->slot_tbl_lock);
+	memcpy(&res->csr_sessionid, &args->csa_sessionid,
+	       sizeof(res->csr_sessionid));
+	res->csr_sequenceid = args->csa_sequenceid;
+	res->csr_slotid = args->csa_slotid;
+	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+	status = validate_seqid(tbl, args);
 	if (status)
-		goto out;
+		goto out_unlock;
 
 	cps->slotid = args->csa_slotid;
 
@@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	 */
 	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
 		status = htonl(NFS4ERR_DELAY);
-		goto out;
+		goto out_unlock;
 	}
 
-	memcpy(&res->csr_sessionid, &args->csa_sessionid,
-	       sizeof(res->csr_sessionid));
-	res->csr_sequenceid = args->csa_sequenceid;
-	res->csr_slotid = args->csa_slotid;
-	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+	/*
+	 * RFC5661 20.9.3
+	 * If CB_SEQUENCE returns an error, then the state of the slot
+	 * (sequence ID, cached reply) MUST NOT change.
+	 */
+	slot->seq_nr++;
+out_unlock:
+	spin_unlock(&tbl->slot_tbl_lock);
 
 out:
 	cps->clp = clp; /* put in nfs4_callback_compound */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 19ca95cdf..6b1697a01 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
 
 	status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
-	if (status == __constant_htonl(NFS4ERR_RESOURCE))
+	if (status == htonl(NFS4ERR_RESOURCE))
 		return rpc_garbage_args;
 
 	if (hdr_arg.minorversion == 0) {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 892aefff3..4a90c9bb3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -775,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -825,7 +825,6 @@ error:
  * Load up the server record from information gained in an fsinfo record
  */
 static void nfs_server_set_fsinfo(struct nfs_server *server,
-				  struct nfs_fh *mntfh,
 				  struct nfs_fsinfo *fsinfo)
 {
 	unsigned long max_rpc_payload;
@@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
 	if (error < 0)
 		goto out_error;
 
-	nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+	nfs_server_set_fsinfo(server, &fsinfo);
 
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net)
 }
 
 #ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_fs_nfs;
-
 static int nfs_server_list_open(struct inode *inode, struct file *file);
 static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
@@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 {
 	struct nfs_server *server;
 	struct nfs_client *clp;
-	char dev[8], fsid[17];
+	char dev[13];	// 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+	char fsid[34];	// 2 * 16 for %llx, 1 for ':', 1 for '\0'
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
 	/* display header on line 1 */
 	if (v == &nn->nfs_volume_list) {
-		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
+		seq_puts(m, "NV SERVER   PORT DEV          FSID"
+			    "                              FSC\n");
 		return 0;
 	}
 	/* display one transport per line on subsequent lines */
 	server = list_entry(v, struct nfs_server, master_link);
 	clp = server->nfs_client;
 
-	snprintf(dev, 8, "%u:%u",
+	snprintf(dev, sizeof(dev), "%u:%u",
 		 MAJOR(server->s_dev), MINOR(server->s_dev));
 
-	snprintf(fsid, 17, "%llx:%llx",
+	snprintf(fsid, sizeof(fsid), "%llx:%llx",
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
 	rcu_read_lock();
-	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
+	seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
 		   clp->rpc_ops->version,
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
 		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
@@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net)
  */
 int __init nfs_fs_proc_init(void)
 {
-	struct proc_dir_entry *p;
-
-	proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
-	if (!proc_fs_nfs)
+	if (!proc_mkdir("fs/nfsfs", NULL))
 		goto error_0;
 
 	/* a file of servers with which we're dealing */
-	p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
-	if (!p)
+	if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
 		goto error_1;
 
 	/* a file of volumes that we have mounted */
-	p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
-	if (!p)
-		goto error_2;
-	return 0;
+	if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+		goto error_1;
 
-error_2:
-	remove_proc_entry("servers", proc_fs_nfs);
+	return 0;
 error_1:
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("fs/nfsfs", NULL);
 error_0:
 	return -ENOMEM;
 }
@@ -1464,9 +1456,7 @@ error_0:
  */
 void nfs_fs_proc_exit(void)
 {
-	remove_proc_entry("volumes", proc_fs_nfs);
-	remove_proc_entry("servers", proc_fs_nfs);
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("fs/nfsfs", NULL);
 }
 
 #endif /* CONFIG_PROC_FS */
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b2c8b31b2..547308a5e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
 	int err;
 
-	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-		*opened |= FILE_CREATED;
-
 	err = finish_open(file, dentry, do_open, opened);
 	if (err)
 		goto out;
@@ -1771,7 +1768,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir);
 
 static void nfs_dentry_handle_enoent(struct dentry *dentry)
 {
-	if (d_really_is_positive(dentry) && !d_unhashed(dentry))
+	if (simple_positive(dentry))
 		d_delete(dentry);
 }
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8b8d83a52..cc4fa1ed6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
 	return nfs_wb_page(inode, page);
 }
 
-#ifdef CONFIG_NFS_SWAP
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 						sector_t *span)
 {
-	int ret;
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
 	*span = sis->pages;
 
-	rcu_read_lock();
-	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
-	rcu_read_unlock();
-
-	return ret;
+	return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
-	rcu_read_lock();
-	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
-	rcu_read_unlock();
+	rpc_clnt_swap_deactivate(clnt);
 }
-#endif
 
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
 	.swap_deactivate = nfs_swap_deactivate,
-#endif
 };
 
 /*
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index a46bf6de9..b34f2e228 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -32,6 +32,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 
 #include <linux/sunrpc/metrics.h>
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 6f5f0f425..b3289d701 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -20,6 +20,7 @@
 #include "../nfs4trace.h"
 #include "../iostat.h"
 #include "../nfs.h"
+#include "../nfs42.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
-	struct nfs4_ff_layout_mirror *tmp;
 	int i, j;
 
 	for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
 		for (j = i + 1; j < fls->mirror_array_cnt; j++)
 			if (fls->mirror_array[i]->efficiency <
-			    fls->mirror_array[j]->efficiency) {
-				tmp = fls->mirror_array[i];
-				fls->mirror_array[i] = fls->mirror_array[j];
-				fls->mirror_array[j] = tmp;
-			}
+			    fls->mirror_array[j]->efficiency)
+				swap(fls->mirror_array[i],
+				     fls->mirror_array[j]);
 	}
 }
 
@@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 
 		spin_lock_init(&fls->mirror_array[i]->lock);
 		fls->mirror_array[i]->ds_count = ds_count;
+		fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
 		/* deviceid */
 		rc = decode_deviceid(&stream, &devid);
@@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 			fls->mirror_array[i]->gid);
 	}
 
+	p = xdr_inline_decode(&stream, 4);
+	if (p)
+		fls->flags = be32_to_cpup(p);
+
 	ff_layout_sort_mirrors(fls);
 	rc = ff_layout_check_layout(lgr);
 	if (rc)
@@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 	return 1;
 }
 
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+	/* first IO request? */
+	if (atomic_inc_return(&timer->n_ops) == 1) {
+		timer->start_time = ktime_get();
+	}
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+	ktime_t start, now;
+
+	if (atomic_dec_return(&timer->n_ops) < 0)
+		WARN_ON_ONCE(1);
+
+	now = ktime_get();
+	start = timer->start_time;
+	timer->start_time = now;
+	return ktime_sub(now, start);
+}
+
+static ktime_t
+nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
+{
+	return ktime_sub(ktime_get(), task->tk_start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+			    struct nfs4_ff_layoutstat *layoutstat)
+{
+	static const ktime_t notime = {0};
+	ktime_t now = ktime_get();
+
+	nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+	if (ktime_equal(mirror->start_time, notime))
+		mirror->start_time = now;
+	if (ktime_equal(mirror->last_report_time, notime))
+		mirror->last_report_time = now;
+	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+			FF_LAYOUTSTATS_REPORT_INTERVAL) {
+		mirror->last_report_time = now;
+		return true;
+	}
+
+	return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+		__u64 requested)
+{
+	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+	iostat->ops_requested++;
+	iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+		__u64 requested,
+		__u64 completed,
+		ktime_t time_completed)
+{
+	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+	ktime_t timer;
+
+	iostat->ops_completed++;
+	iostat->bytes_completed += completed;
+	iostat->bytes_not_delivered += requested - completed;
+
+	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+	iostat->total_busy_time =
+			ktime_add(iostat->total_busy_time, timer);
+	iostat->aggregate_completion_time =
+			ktime_add(iostat->aggregate_completion_time, time_completed);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested)
+{
+	bool report;
+
+	spin_lock(&mirror->lock);
+	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+	spin_unlock(&mirror->lock);
+
+	if (report)
+		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+		struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested,
+		__u64 completed)
+{
+	spin_lock(&mirror->lock);
+	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+			requested, completed,
+			nfs4_ff_layout_calc_completion_time(task));
+	spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested)
+{
+	bool report;
+
+	spin_lock(&mirror->lock);
+	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+	spin_unlock(&mirror->lock);
+
+	if (report)
+		pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+		struct nfs4_ff_layout_mirror *mirror,
+		__u64 requested,
+		__u64 completed,
+		enum nfs3_stable_how committed)
+{
+	if (committed == NFS_UNSTABLE)
+		requested = completed = 0;
+
+	spin_lock(&mirror->lock);
+	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+			requested, completed,
+			nfs4_ff_layout_calc_completion_time(task));
+	spin_unlock(&mirror->lock);
+}
+
 static int
 ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 			    struct nfs_commit_info *cinfo,
@@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 	return 0;
 }
 
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+	return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
+}
+
 /*
  * We reference the rpc_cred of the first WRITE that triggers the need for
  * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
@@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
+	if (!ff_layout_need_layoutcommit(hdr->lseg))
+		return;
+
 	pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
 			hdr->mds_offset + hdr->res.count);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
@@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
 					 struct nfs_pgio_header *hdr)
 {
+	nfs4_ff_layout_stat_io_start_read(
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count);
+
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return -EIO;
@@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (ff_layout_read_prepare_common(task, hdr))
-		return;
-
 	if (ff_layout_setup_sequence(hdr->ds_clp,
 				     &hdr->args.seq_args,
 				     &hdr->res.seq_res,
 				     task))
 		return;
 
+	if (ff_layout_read_prepare_common(task, hdr))
+		return;
+
 	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
 			hdr->args.lock_context, FMODE_READ) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+	nfs4_ff_layout_stat_io_end_read(task,
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count, hdr->res.count);
+
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
 		nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 		return -EAGAIN;
 	}
 
-	if (data->verf.committed == NFS_UNSTABLE)
+	if (data->verf.committed == NFS_UNSTABLE
+	    && ff_layout_need_layoutcommit(data->lseg))
 		pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
 	return 0;
@@ -1083,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
 					  struct nfs_pgio_header *hdr)
 {
+	nfs4_ff_layout_stat_io_start_write(
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count);
+
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return -EIO;
@@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	if (ff_layout_write_prepare_common(task, hdr))
-		return;
-
 	if (ff_layout_setup_sequence(hdr->ds_clp,
 				     &hdr->args.seq_args,
 				     &hdr->res.seq_res,
 				     task))
 		return;
 
+	if (ff_layout_write_prepare_common(task, hdr))
+		return;
+
 	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
 			hdr->args.lock_context, FMODE_WRITE) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
+	nfs4_ff_layout_stat_io_end_write(task,
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count, hdr->res.count,
+			hdr->res.verf->committed);
+
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
 		nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+		struct nfs_commit_data *cdata)
+{
+	nfs4_ff_layout_stat_io_start_write(
+			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+			0);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
+	ff_layout_commit_prepare_common(task, data);
 	rpc_call_start(task);
 }
 
@@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 {
 	struct nfs_commit_data *wdata = data;
 
-	ff_layout_setup_sequence(wdata->ds_clp,
+	if (ff_layout_setup_sequence(wdata->ds_clp,
 				 &wdata->args.seq_args,
 				 &wdata->res.seq_res,
-				 task);
+				 task))
+		return;
+	ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+	struct nfs_page *req;
+	__u64 count = 0;
+
+	if (task->tk_status == 0) {
+		list_for_each_entry(req, &cdata->pages, wb_list)
+			count += req->wb_bytes;
+	}
+
+	nfs4_ff_layout_stat_io_end_write(task,
+			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+			count, count, NFS_FILE_SYNC);
+
+	pnfs_generic_write_commit_done(task, data);
 }
 
 static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_commit_prepare_v3,
-	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_call_done = ff_layout_commit_done,
 	.rpc_count_stats = ff_layout_commit_count_stats,
 	.rpc_release = pnfs_generic_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_commit_prepare_v4,
-	.rpc_call_done = pnfs_generic_write_commit_done,
+	.rpc_call_done = ff_layout_commit_done,
 	.rpc_count_stats = ff_layout_commit_count_stats,
 	.rpc_release = pnfs_generic_commit_release,
 };
@@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
 	if (fh)
 		hdr->args.fh = fh;
-
 	/*
 	 * Note that if we ever decide to split across DSes,
 	 * then we may need to handle dense-like offsets.
@@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
 	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
 	if (fh)
 		data->args.fh = fh;
+
 	return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
 				   vers == 3 ? &ff_layout_commit_call_ops_v3 :
 					       &ff_layout_commit_call_ops_v4,
@@ -1488,6 +1687,247 @@ out:
 	dprintk("%s: Return\n", __func__);
 }
 
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+	return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+			  const int buflen)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr *addr = &sin6->sin6_addr;
+
+	/*
+	 * RFC 4291, Section 2.2.2
+	 *
+	 * Shorthanded ANY address
+	 */
+	if (ipv6_addr_any(addr))
+		return snprintf(buf, buflen, "::");
+
+	/*
+	 * RFC 4291, Section 2.2.2
+	 *
+	 * Shorthanded loopback address
+	 */
+	if (ipv6_addr_loopback(addr))
+		return snprintf(buf, buflen, "::1");
+
+	/*
+	 * RFC 4291, Section 2.2.3
+	 *
+	 * Special presentation address format for mapped v4
+	 * addresses.
+	 */
+	if (ipv6_addr_v4mapped(addr))
+		return snprintf(buf, buflen, "::ffff:%pI4",
+					&addr->s6_addr32[3]);
+
+	/*
+	 * RFC 4291, Section 2.2.1
+	 */
+	return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+	struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+	char portbuf[RPCBIND_MAXUADDRPLEN];
+	char addrbuf[RPCBIND_MAXUADDRLEN];
+	char *netid;
+	unsigned short port;
+	int len, netid_len;
+	__be32 *p;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return;
+		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+		netid = "tcp";
+		netid_len = 3;
+		break;
+	case AF_INET6:
+		if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return;
+		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+		netid = "tcp6";
+		netid_len = 4;
+		break;
+	default:
+		/* we only support tcp and tcp6 */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+	len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+	p = xdr_reserve_space(xdr, 4 + netid_len);
+	xdr_encode_opaque(p, netid, netid_len);
+
+	p = xdr_reserve_space(xdr, 4 + len);
+	xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+			 ktime_t t)
+{
+	struct timespec64 ts;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 12);
+	ts = ktime_to_timespec64(t);
+	p = xdr_encode_hyper(p, ts.tv_sec);
+	*p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+			    struct nfs4_ff_io_stat *stat)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 5 * 8);
+	p = xdr_encode_hyper(p, stat->ops_requested);
+	p = xdr_encode_hyper(p, stat->bytes_requested);
+	p = xdr_encode_hyper(p, stat->ops_completed);
+	p = xdr_encode_hyper(p, stat->bytes_completed);
+	p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+	ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+	ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr,
+			     struct nfs42_layoutstat_args *args,
+			     struct nfs42_layoutstat_devinfo *devinfo)
+{
+	struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
+	struct nfs4_pnfs_ds_addr *da;
+	struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+	struct nfs_fh *fh = &mirror->fh_versions[0];
+	__be32 *p, *start;
+
+	da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+	dprintk("%s: DS %s: encoding address %s\n",
+		__func__, ds->ds_remotestr, da->da_remotestr);
+	/* layoutupdate length */
+	start = xdr_reserve_space(xdr, 4);
+	/* netaddr4 */
+	ff_layout_encode_netaddr(xdr, da);
+	/* nfs_fh4 */
+	p = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(p, fh->data, fh->size);
+	/* ff_io_latency4 read */
+	spin_lock(&mirror->lock);
+	ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+	/* ff_io_latency4 write */
+	ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+	spin_unlock(&mirror->lock);
+	/* nfstime4 */
+	ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+	/* bool */
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(false);
+
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static bool
+ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
+			       struct pnfs_layout_segment *pls,
+			       int *dev_count, int dev_limit)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs4_deviceid_node *dev;
+	struct nfs42_layoutstat_devinfo *devinfo;
+	int i;
+
+	for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
+		if (*dev_count >= dev_limit)
+			break;
+		mirror = FF_LAYOUT_COMP(pls, i);
+		if (!mirror || !mirror->mirror_ds)
+			continue;
+		dev = FF_LAYOUT_DEVID_NODE(pls, i);
+		devinfo = &args->devinfo[*dev_count];
+		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+		devinfo->offset = pls->pls_range.offset;
+		devinfo->length = pls->pls_range.length;
+		/* well, we don't really know if IO is continuous or not! */
+		devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+		devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+		devinfo->layout_type = LAYOUT_FLEX_FILES;
+		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
+		devinfo->layout_private = mirror;
+		/* lseg refcount put in cleanup_layoutstats */
+		pnfs_get_lseg(pls);
+
+		++(*dev_count);
+	}
+
+	return *dev_count < dev_limit;
+}
+
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+	struct pnfs_layout_segment *pls;
+	int dev_count = 0;
+
+	spin_lock(&args->inode->i_lock);
+	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+		dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+	}
+	spin_unlock(&args->inode->i_lock);
+	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+	if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
+		dprintk("%s: truncating devinfo to limit (%d:%d)\n",
+			__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
+		dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+	}
+	args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+	if (!args->devinfo)
+		return -ENOMEM;
+
+	dev_count = 0;
+	spin_lock(&args->inode->i_lock);
+	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+		if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
+						    PNFS_LAYOUTSTATS_MAXDEV)) {
+			break;
+		}
+	}
+	spin_unlock(&args->inode->i_lock);
+	args->num_dev = dev_count;
+
+	return 0;
+}
+
+static void
+ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	int i;
+
+	for (i = 0; i < data->args.num_dev; i++) {
+		mirror = data->args.devinfo[i].layout_private;
+		data->args.devinfo[i].layout_private = NULL;
+		pnfs_put_lseg(mirror->lseg);
+	}
+}
+
 static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.id			= LAYOUT_FLEX_FILES,
 	.name			= "LAYOUT_FLEX_FILES",
@@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
 	.alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
 	.encode_layoutreturn    = ff_layout_encode_layoutreturn,
 	.sync			= pnfs_nfs_generic_sync,
+	.prepare_layoutstats	= ff_layout_prepare_layoutstats,
+	.cleanup_layoutstats	= ff_layout_cleanup_layoutstats,
 };
 
 static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 070f20445..f92f9a0a8 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -9,12 +9,17 @@
 #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+
 #include "../pnfs.h"
 
 /* XXX: Let's filter out insanely large mirror count for now to avoid oom
  * due to network error etc. */
 #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
 
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+
 struct nfs4_ff_ds_version {
 	u32				version;
 	u32				minor_version;
@@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err {
 	struct nfs4_deviceid		deviceid;
 };
 
+struct nfs4_ff_io_stat {
+	__u64				ops_requested;
+	__u64				bytes_requested;
+	__u64				ops_completed;
+	__u64				bytes_completed;
+	__u64				bytes_not_delivered;
+	ktime_t				total_busy_time;
+	ktime_t				aggregate_completion_time;
+};
+
+struct nfs4_ff_busy_timer {
+	ktime_t start_time;
+	atomic_t n_ops;
+};
+
+struct nfs4_ff_layoutstat {
+	struct nfs4_ff_io_stat io_stat;
+	struct nfs4_ff_busy_timer busy_timer;
+};
+
 struct nfs4_ff_layout_mirror {
+	struct pnfs_layout_segment	*lseg; /* back pointer */
 	u32				ds_count;
 	u32				efficiency;
 	struct nfs4_ff_layout_ds	*mirror_ds;
 	u32				fh_versions_cnt;
 	struct nfs_fh			*fh_versions;
 	nfs4_stateid			stateid;
-	struct nfs4_string		user_name;
-	struct nfs4_string		group_name;
 	u32				uid;
 	u32				gid;
 	struct rpc_cred			*cred;
 	spinlock_t			lock;
+	struct nfs4_ff_layoutstat	read_stat;
+	struct nfs4_ff_layoutstat	write_stat;
+	ktime_t				start_time;
+	ktime_t				last_report_time;
 };
 
 struct nfs4_ff_layout_segment {
 	struct pnfs_layout_segment	generic_hdr;
 	u64				stripe_unit;
+	u32				flags;
 	u32				mirror_array_cnt;
 	struct nfs4_ff_layout_mirror	**mirror_array;
 };
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5d25b9d97..0adc7d245 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -442,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode->i_version = fattr->change_attr;
-		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
-			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+		else
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_PAGECACHE);
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			inode->i_size = nfs_size_to_loff_t(fattr->size);
 		else
@@ -678,6 +679,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+		if (S_ISDIR(inode->i_mode))
+			stat->blksize = NFS_SERVER(inode)->dtsize;
 	}
 out:
 	trace_nfs_getattr_exit(inode, err);
@@ -1689,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			inode->i_version = fattr->change_attr;
 		}
-	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
+	} else
 		nfsi->cache_validity |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
@@ -2008,17 +2011,15 @@ static int __init init_nfs_fs(void)
 	if (err)
 		goto out1;
 
-#ifdef CONFIG_PROC_FS
 	rpc_proc_register(&init_net, &nfs_rpcstat);
-#endif
-	if ((err = register_nfs_fs()) != 0)
+
+	err = register_nfs_fs();
+	if (err)
 		goto out0;
 
 	return 0;
 out0:
-#ifdef CONFIG_PROC_FS
 	rpc_proc_unregister(&init_net, "nfs");
-#endif
 	nfs_destroy_directcache();
 out1:
 	nfs_destroy_writepagecache();
@@ -2049,9 +2050,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
 	unregister_pernet_subsys(&nfs_net_ops);
-#ifdef CONFIG_PROC_FS
 	rpc_proc_unregister(&init_net, "nfs");
-#endif
 	unregister_nfs_fs();
 	nfs_fs_proc_exit();
 	nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e6475bc5..9b372b845 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -296,6 +296,22 @@ extern struct rpc_procinfo nfs4_procedures[];
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+	if (!dst || !src)
+		return NULL;
+
+	if (src->len > NFS4_MAXLABELLEN)
+		return NULL;
+
+	dst->lfs = src->lfs;
+	dst->pi = src->pi;
+	dst->len = src->len;
+	memcpy(dst->label, src->label, src->len);
+
+	return dst;
+}
 static inline void nfs4_label_free(struct nfs4_label *label)
 {
 	if (label) {
@@ -316,6 +332,11 @@ static inline void nfs4_label_free(void *label) {}
 static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
 {
 }
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+	return NULL;
+}
 #endif /* CONFIG_NFS_V4_SECURITY_LABEL */
 
 /* proc.c */
@@ -607,7 +628,7 @@ void nfs_mark_page_unstable(struct page *page)
 	struct inode *inode = page_file_mapping(page)->host;
 
 	inc_zone_page_state(page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
 	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 7afb8947d..ff66ae700 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -5,11 +5,18 @@
 #ifndef __LINUX_FS_NFS_NFS4_2_H
 #define __LINUX_FS_NFS_NFS4_2_H
 
+/*
+ * FIXME:  four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+				   struct nfs42_layoutstat_data *);
 /* nfs4.2xdr.h */
 extern struct rpc_procinfo nfs4_2_procedures[];
 
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 3a9e75235..d731bbf97 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -10,6 +10,11 @@
 #include <linux/nfs_fs.h>
 #include "nfs4_fs.h"
 #include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
 
 static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
 				fmode_t fmode)
@@ -130,7 +135,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	return err;
 }
 
-loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 {
 	struct inode *inode = file_inode(filep);
 	struct nfs42_seek_args args = {
@@ -165,3 +170,102 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 
 	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
+
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(filep));
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = _nfs42_proc_llseek(filep, offset, whence);
+		if (err == -ENOTSUPP)
+			return -EOPNOTSUPP;
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+	nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
+			     &data->res.seq_res, task);
+}
+
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+	default:
+		dprintk("%s server returns %d\n", __func__, task->tk_status);
+	}
+}
+
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+	if (nfss->pnfs_curr_ld->cleanup_layoutstats)
+		nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+
+	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+	smp_mb__after_atomic();
+	nfs_iput_and_deactive(data->inode);
+	kfree(data->args.devinfo);
+	kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+	.rpc_call_prepare = nfs42_layoutstat_prepare,
+	.rpc_call_done = nfs42_layoutstat_done,
+	.rpc_release = nfs42_layoutstat_release,
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+				   struct nfs42_layoutstat_data *data)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layoutstat_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+
+	data->inode = nfs_igrab_and_active(data->args.inode);
+	if (!data->inode) {
+		nfs42_layoutstat_release(data);
+		return -EAGAIN;
+	}
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	return 0;
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 1a25b2724..a6bd27da6 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -4,6 +4,8 @@
 #ifndef __LINUX_FS_NFS_NFS4_2XDR_H
 #define __LINUX_FS_NFS_NFS4_2XDR_H
 
+#include "nfs42.h"
+
 #define encode_fallocate_maxsz		(encode_stateid_maxsz + \
 					 2 /* offset */ + \
 					 2 /* length */)
@@ -22,6 +24,16 @@
 					 1 /* whence */ + \
 					 2 /* offset */ + \
 					 2 /* length */)
+#define encode_io_info_maxsz		4
+#define encode_layoutstats_maxsz	(op_decode_hdr_maxsz + \
+					2 /* offset */ + \
+					2 /* length */ + \
+					encode_stateid_maxsz + \
+					encode_io_info_maxsz + \
+					encode_io_info_maxsz + \
+					1 /* opaque devaddr4 length */ + \
+					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
@@ -45,6 +57,14 @@
 #define NFS4_dec_seek_sz		(compound_decode_hdr_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
 
 
 static void encode_fallocate(struct xdr_stream *xdr,
@@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
 	encode_uint32(xdr, args->sa_what);
 }
 
+static void encode_layoutstats(struct xdr_stream *xdr,
+			       struct nfs42_layoutstat_args *args,
+			       struct nfs42_layoutstat_devinfo *devinfo,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+	p = reserve_space(xdr, 8 + 8);
+	p = xdr_encode_hyper(p, devinfo->offset);
+	p = xdr_encode_hyper(p, devinfo->length);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+	p = xdr_encode_hyper(p, devinfo->read_count);
+	p = xdr_encode_hyper(p, devinfo->read_bytes);
+	p = xdr_encode_hyper(p, devinfo->write_count);
+	p = xdr_encode_hyper(p, devinfo->write_bytes);
+	p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+			NFS4_DEVICEID4_SIZE);
+	/* Encode layoutupdate4 */
+	*p++ = cpu_to_be32(devinfo->layout_type);
+	if (devinfo->layoutstats_encode != NULL)
+		devinfo->layoutstats_encode(xdr, args, devinfo);
+	else
+		encode_uint32(xdr, 0);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs42_layoutstat_args *args)
+{
+	int i;
+
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+	for (i = 0; i < args->num_dev; i++)
+		encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+	encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -169,6 +238,12 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_layoutstats(struct xdr_stream *xdr,
+			      struct nfs42_layoutstat_res *res)
+{
+	return decode_op_hdr(xdr, OP_LAYOUTSTATS);
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
 out:
 	return status;
 }
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs42_layoutstat_res *res)
+{
+	struct compound_hdr hdr;
+	int status, i;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+	for (i = 0; i < res->num_dev; i++) {
+		status = decode_layoutstats(xdr, res);
+		if (status)
+			goto out;
+	}
+out:
+	res->rpc_status = status;
+	return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fdef424b0..ea3bee919 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
 extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
 			  struct rpc_message *, struct nfs4_sequence_args *,
 			  struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index e42be52a8..3aa6a9ba5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -676,7 +676,6 @@ found:
 		break;
 	}
 
-	/* No matching nfs_client found. */
 	spin_unlock(&nn->nfs_client_lock);
 	dprintk("NFS: <-- %s status = %d\n", __func__, status);
 	nfs_put_client(prev);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index f58c17b3b..dcd39d4e2 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	dprintk("NFS: open file(%pd2)\n", dentry);
 
+	err = nfs_check_flags(openflags);
+	if (err)
+		return err;
+
 	if ((openflags & O_ACCMODE) == 3)
 		openflags--;
 
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index c0b3a16b4..039b3eb6d 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
 		goto out;
 	}
 
-	if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
-		printk(KERN_ERR "nfs4_get_rootfh:"
-		       " getroot obtained referral\n");
-		ret = -EREMOTE;
-		goto out;
-	}
-
 	memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
 out:
 	nfs_free_fattr(fsinfo.fattr);
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 2e1737c40..535dfc69c 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp)
 
 int nfs_idmap_init(void)
 {
-	int ret;
-	ret = nfs_idmap_init_keyring();
-	if (ret != 0)
-		goto out;
-out:
-	return ret;
+	return nfs_idmap_init_keyring();
 }
 
 void nfs_idmap_quit(void)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d3f205126..3acb1eb72 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
 		case 0:
 			return 0;
 		case -NFS4ERR_OPENMODE:
+		case -NFS4ERR_DELEG_REVOKED:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
 			if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
 				nfs4_inode_return_delegation(inode);
 				exception->retry = 1;
@@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
 			if (ret < 0)
 				break;
 			goto wait_on_recovery;
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_BAD_STATEID:
-			if (state == NULL)
-				break;
-			ret = nfs4_schedule_stateid_recovery(server, state);
-			if (ret < 0)
-				break;
-			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
 			if (state != NULL) {
 				ret = nfs4_schedule_stateid_recovery(server, state);
@@ -473,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
 
 static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
 {
-	do_renew_lease(server->nfs_client, timestamp);
+	struct nfs_client *clp = server->nfs_client;
+
+	if (!nfs4_has_session(clp))
+		do_renew_lease(clp, timestamp);
 }
 
 struct nfs4_call_sync_data {
@@ -482,8 +479,8 @@ struct nfs4_call_sync_data {
 	struct nfs4_sequence_res *seq_res;
 };
 
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
-			       struct nfs4_sequence_res *res, int cache_reply)
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+			struct nfs4_sequence_res *res, int cache_reply)
 {
 	args->sa_slot = NULL;
 	args->sa_cache_this = cache_reply;
@@ -622,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 		clp = session->clp;
 		do_renew_lease(clp, res->sr_timestamp);
 		/* Check sequence flags */
-		if (res->sr_status_flags != 0)
-			nfs4_schedule_lease_recovery(clp);
+		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 		nfs41_update_target_slotid(slot->table, slot, res);
 		break;
 	case 1:
@@ -916,6 +912,7 @@ struct nfs4_opendata {
 	struct nfs_open_confirmres c_res;
 	struct nfs4_string owner_name;
 	struct nfs4_string group_name;
+	struct nfs4_label *a_label;
 	struct nfs_fattr f_attr;
 	struct nfs4_label *f_label;
 	struct dentry *dir;
@@ -1019,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	if (IS_ERR(p->f_label))
 		goto err_free_p;
 
+	p->a_label = nfs4_label_alloc(server, gfp_mask);
+	if (IS_ERR(p->a_label))
+		goto err_free_f;
+
 	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
 	p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
 	if (IS_ERR(p->o_arg.seqid))
@@ -1047,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.server = server;
 	p->o_arg.bitmask = nfs4_bitmask(server, label);
 	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
-	p->o_arg.label = label;
+	p->o_arg.label = nfs4_label_copy(p->a_label, label);
 	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
 	switch (p->o_arg.claim) {
 	case NFS4_OPEN_CLAIM_NULL:
@@ -1080,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	return p;
 
 err_free_label:
+	nfs4_label_free(p->a_label);
+err_free_f:
 	nfs4_label_free(p->f_label);
 err_free_p:
 	kfree(p);
@@ -1099,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref)
 		nfs4_put_open_state(p->state);
 	nfs4_put_state_owner(p->owner);
 
+	nfs4_label_free(p->a_label);
 	nfs4_label_free(p->f_label);
 
 	dput(p->dir);
@@ -1556,6 +1560,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
 	struct nfs4_state *newstate;
 	int ret;
 
+	if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+	     opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
+	    (opendata->o_arg.u.delegation_type & fmode) != fmode)
+		/* This mode can't have been delegated, so we must have
+		 * a valid open_stateid to cover it - not need to reclaim.
+		 */
+		return 0;
 	opendata->o_arg.open_flags = 0;
 	opendata->o_arg.fmode = fmode;
 	opendata->o_arg.share_access = nfs4_map_atomic_open_share(
@@ -1687,6 +1698,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 					"%d.\n", __func__, err);
 		case 0:
 		case -ENOENT:
+		case -EAGAIN:
 		case -ESTALE:
 			break;
 		case -NFS4ERR_BADSESSION:
@@ -3358,6 +3370,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 			goto out;
 		case -NFS4ERR_MOVED:
 			err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+			if (err == -NFS4ERR_MOVED)
+				err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
 			goto out;
 		case -NFS4ERR_WRONGSEC:
 			err = -EPERM;
@@ -4958,49 +4972,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
-static unsigned int
-nfs4_init_nonuniform_client_string(struct nfs_client *clp,
-				   char *buf, size_t len)
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 {
-	unsigned int result;
+	int result;
+	size_t len;
+	char *str;
+	bool retried = false;
 
 	if (clp->cl_owner_id != NULL)
-		return strlcpy(buf, clp->cl_owner_id, len);
+		return 0;
+retry:
+	rcu_read_lock();
+	len = 10 + strlen(clp->cl_ipaddr) + 1 +
+		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+		1 +
+		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
+		1;
+	rcu_read_unlock();
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
 
 	rcu_read_lock();
-	result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
-				clp->cl_ipaddr,
-				rpc_peeraddr2str(clp->cl_rpcclient,
-							RPC_DISPLAY_ADDR),
-				rpc_peeraddr2str(clp->cl_rpcclient,
-							RPC_DISPLAY_PROTO));
+	result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+			clp->cl_ipaddr,
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
 	rcu_read_unlock();
-	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-	return result;
+
+	/* Did something change? */
+	if (result >= len) {
+		kfree(str);
+		if (retried)
+			return -EINVAL;
+		retried = true;
+		goto retry;
+	}
+	clp->cl_owner_id = str;
+	return 0;
 }
 
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
-				char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
+{
+	int result;
+	size_t len;
+	char *str;
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(nfs4_client_id_uniquifier) + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			nfs4_client_id_uniquifier,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
 {
-	const char *nodename = clp->cl_rpcclient->cl_nodename;
-	unsigned int result;
+	int result;
+	size_t len;
+	char *str;
 
 	if (clp->cl_owner_id != NULL)
-		return strlcpy(buf, clp->cl_owner_id, len);
+		return 0;
 
 	if (nfs4_client_id_uniquifier[0] != '\0')
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
-				clp->rpc_ops->version,
-				clp->cl_minorversion,
-				nfs4_client_id_uniquifier,
-				nodename);
-	else
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
-				clp->rpc_ops->version, clp->cl_minorversion,
-				nodename);
-	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-	return result;
+		return nfs4_init_uniquifier_client_string(clp);
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
 }
 
 /*
@@ -5047,7 +5140,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	struct nfs4_setclientid setclientid = {
 		.sc_verifier = &sc_verifier,
 		.sc_prog = program,
-		.sc_cb_ident = clp->cl_cb_ident,
+		.sc_clnt = clp,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5067,16 +5160,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
 	/* nfs_client_id4 */
 	nfs4_init_boot_verifier(clp, &sc_verifier);
+
 	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
-		setclientid.sc_name_len =
-				nfs4_init_uniform_client_string(clp,
-						setclientid.sc_name,
-						sizeof(setclientid.sc_name));
+		status = nfs4_init_uniform_client_string(clp);
 	else
-		setclientid.sc_name_len =
-				nfs4_init_nonuniform_client_string(clp,
-						setclientid.sc_name,
-						sizeof(setclientid.sc_name));
+		status = nfs4_init_nonuniform_client_string(clp);
+
+	if (status)
+		goto out;
+
 	/* cb_client4 */
 	setclientid.sc_netid_len =
 				nfs4_init_callback_netid(clp,
@@ -5086,9 +5178,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 				clp->cl_ipaddr, port >> 8, port & 255);
 
-	dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
+	dprintk("NFS call  setclientid auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		setclientid.sc_name_len, setclientid.sc_name);
+		clp->cl_owner_id);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task)) {
 		status = PTR_ERR(task);
@@ -5360,15 +5452,15 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
 	return err;
 }
 
-static int do_vfs_lock(struct file *file, struct file_lock *fl)
+static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
 {
 	int res = 0;
 	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
 		case FL_POSIX:
-			res = posix_lock_file_wait(file, fl);
+			res = posix_lock_inode_wait(inode, fl);
 			break;
 		case FL_FLOCK:
-			res = flock_lock_file_wait(file, fl);
+			res = flock_lock_inode_wait(inode, fl);
 			break;
 		default:
 			BUG();
@@ -5428,7 +5520,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 	switch (task->tk_status) {
 		case 0:
 			renew_lease(calldata->server, calldata->timestamp);
-			do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
+			do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
 			if (nfs4_update_lock_stateid(calldata->lsp,
 					&calldata->res.stateid))
 				break;
@@ -5536,7 +5628,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	mutex_lock(&sp->so_delegreturn_mutex);
 	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
 	down_read(&nfsi->rwsem);
-	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
+	if (do_vfs_lock(inode, request) == -ENOENT) {
 		up_read(&nfsi->rwsem);
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		goto out;
@@ -5677,7 +5769,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 				data->timestamp);
 		if (data->arg.new_lock) {
 			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
-			if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
+			if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
 				rpc_restart_call_prepare(task);
 				break;
 			}
@@ -5919,7 +6011,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (status != 0)
 		goto out;
 	request->fl_flags |= FL_ACCESS;
-	status = do_vfs_lock(request->fl_file, request);
+	status = do_vfs_lock(state->inode, request);
 	if (status < 0)
 		goto out;
 	down_read(&nfsi->rwsem);
@@ -5927,7 +6019,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		/* Yes: cache locks! */
 		/* ...but avoid races with delegation recall... */
 		request->fl_flags = fl_flags & ~FL_SLEEP;
-		status = do_vfs_lock(request->fl_file, request);
+		status = do_vfs_lock(state->inode, request);
 		up_read(&nfsi->rwsem);
 		goto out;
 	}
@@ -6849,11 +6941,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	};
 
 	nfs4_init_boot_verifier(clp, &verifier);
-	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
-							sizeof(args.id));
-	dprintk("NFS call  exchange_id auth=%s, '%.*s'\n",
+
+	status = nfs4_init_uniform_client_string(clp);
+	if (status)
+		goto out;
+
+	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		args.id_len, args.id);
+		clp->cl_owner_id);
 
 	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
 					GFP_NOFS);
@@ -6888,7 +6983,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		/* unsupported! */
 		WARN_ON_ONCE(1);
 		status = -EINVAL;
-		goto out_server_scope;
+		goto out_impl_id;
 	}
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -6916,6 +7011,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		/* use the most recent implementation id */
 		kfree(clp->cl_implid);
 		clp->cl_implid = res.impl_id;
+		res.impl_id = NULL;
 
 		if (clp->cl_serverscope != NULL &&
 		    !nfs41_same_server_scope(clp->cl_serverscope,
@@ -6929,15 +7025,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 
 		if (clp->cl_serverscope == NULL) {
 			clp->cl_serverscope = res.server_scope;
-			goto out;
+			res.server_scope = NULL;
 		}
-	} else
-		kfree(res.impl_id);
+	}
 
-out_server_owner:
-	kfree(res.server_owner);
+out_impl_id:
+	kfree(res.impl_id);
 out_server_scope:
 	kfree(res.server_scope);
+out_server_owner:
+	kfree(res.server_owner);
 out:
 	if (clp->cl_implid != NULL)
 		dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -7487,13 +7584,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	}
 	ret = rpc_wait_for_completion_task(task);
-	if (!ret) {
-		struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
-
-		if (task->tk_status == 0)
-			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+	if (!ret)
 		ret = task->tk_status;
-	}
 	rpc_put_task(task);
 out:
 	dprintk("<-- %s status=%d\n", __func__, ret);
@@ -7881,16 +7973,17 @@ static void nfs4_layoutreturn_release(void *calldata)
 {
 	struct nfs4_layoutreturn *lrp = calldata;
 	struct pnfs_layout_hdr *lo = lrp->args.layout;
+	LIST_HEAD(freeme);
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
 	if (lrp->res.lrs_present)
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
 	pnfs_clear_layoutreturn_waitbit(lo);
-	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
-	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 	lo->plh_block_lgets--;
 	spin_unlock(&lo->plh_inode->i_lock);
+	pnfs_free_lseg_list(&freeme);
 	pnfs_put_layout_hdr(lrp->args.layout);
 	nfs_iput_and_deactive(lrp->inode);
 	kfree(calldata);
@@ -8064,9 +8157,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
 	struct rpc_task *task;
 	int status = 0;
 
-	dprintk("NFS: %4d initiating layoutcommit call. sync %d "
-		"lbw: %llu inode %lu\n",
-		data->task.tk_pid, sync,
+	dprintk("NFS: initiating layoutcommit call. sync %d "
+		"lbw: %llu inode %lu\n", sync,
 		data->args.lastbytewritten,
 		data->args.inode->i_ino);
 
@@ -8505,7 +8597,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK,
 	.init_client = nfs40_init_client,
 	.shutdown_client = nfs40_shutdown_client,
@@ -8531,7 +8622,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1,
@@ -8554,13 +8644,13 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.minor_version = 2,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1
 		| NFS_CAP_ALLOCATE
 		| NFS_CAP_DEALLOCATE
-		| NFS_CAP_SEEK,
+		| NFS_CAP_SEEK
+		| NFS_CAP_LAYOUTSTATS,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ddef1dc80..f2e2ad894 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 
 	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
 		goto do_confirm;
-	nfs4_begin_drain_session(clp);
 	status = nfs4_proc_exchange_id(clp, cred);
 	if (status != 0)
 		goto out;
@@ -1832,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
 		clp->cl_mvops->reboot_recovery_ops;
 	int status;
 
+	nfs4_begin_drain_session(clp);
 	cred = nfs4_get_clid_cred(clp);
 	if (cred == NULL)
 		return -ENOENT;
@@ -2191,25 +2191,35 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 	}
 }
 
-static void nfs41_handle_state_revoked(struct nfs_client *clp)
+static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
 {
 	nfs4_reset_all_state(clp);
 	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
 }
 
+static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
+{
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+	nfs4_schedule_state_manager(clp);
+
+	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
+}
+
 static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
 {
-	/* This will need to handle layouts too */
-	nfs_expire_all_delegations(clp);
+	/* FIXME: For now, we destroy all layouts. */
+	pnfs_destroy_all_layouts(clp);
+	/* FIXME: For now, we test all delegations+open state+locks. */
+	nfs41_handle_some_state_revoked(clp);
 	dprintk("%s: Recallable state revoked on server %s!\n", __func__,
 			clp->cl_hostname);
 }
 
 static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
 {
-	nfs_expire_all_delegations(clp);
-	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
-		nfs4_schedule_state_manager(clp);
+	set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+
 	dprintk("%s: server %s declared a backchannel fault\n", __func__,
 			clp->cl_hostname);
 }
@@ -2231,10 +2241,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 
 	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
 		nfs41_handle_server_reboot(clp);
-	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
-			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
+	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED))
+		nfs41_handle_all_state_revoked(clp);
+	if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
 			    SEQ4_STATUS_ADMIN_STATE_REVOKED))
-		nfs41_handle_state_revoked(clp);
+		nfs41_handle_some_state_revoked(clp);
 	if (flags & SEQ4_STATUS_LEASE_MOVED)
 		nfs4_schedule_lease_moved_recovery(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0aea97841..558cd65db 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_setclientid_maxsz \
 				(op_encode_hdr_maxsz + \
 				XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
-				XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+				/* client name */ \
+				1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
 				1 /* sc_prog */ + \
 				1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
 				1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
@@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
 				encode_verifier_maxsz + \
 				1 /* co_ownerid.len */ + \
-				XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+				/* eia_clientowner */ \
+				1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
 				1 /* flags */ + \
 				1 /* spa_how */ + \
 				/* max is SP4_MACH_CRED (for now) */ + \
@@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
 	encode_nfs4_verifier(xdr, setclientid->sc_verifier);
 
-	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+	encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+			setclientid->sc_clnt->cl_owner_id);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
 }
 
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
 	encode_nfs4_verifier(xdr, args->verifier);
 
-	encode_string(xdr, args->id_len, args->id);
+	encode_string(xdr, strlen(args->client->cl_owner_id),
+			args->client->cl_owner_id);
 
 	encode_uint32(xdr, args->flags);
 	encode_uint32(xdr, args->state_protect.how);
@@ -7427,6 +7431,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(SEEK,		enc_seek,		dec_seek),
 	PROC(ALLOCATE,		enc_allocate,		dec_allocate),
 	PROC(DEALLOCATE,	enc_deallocate,		dec_deallocate),
+	PROC(LAYOUTSTATS,	enc_layoutstats,	dec_layoutstats),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7b4552678..4984bbe55 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 
 	hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
 
-	dprintk("NFS: %5u initiated pgio call "
+	dprintk("NFS: initiated pgio call "
 		"(req %s/%llu, %u bytes @ offset %llu)\n",
-		hdr->task.tk_pid,
 		hdr->inode->i_sb->s_id,
 		(unsigned long long)NFS_FILEID(hdr->inode),
 		hdr->args.count,
@@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 static void nfs_pgio_release(void *calldata)
 {
 	struct nfs_pgio_header *hdr = calldata;
-	if (hdr->rw_ops->rw_release)
-		hdr->rw_ops->rw_release(hdr);
 	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
 }
@@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
  * nfs_pageio_init - initialise a page io descriptor
  * @desc: pointer to descriptor
  * @inode: pointer to inode
- * @doio: pointer to io function
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
  * @bsize: io block size
  * @io_flags: extra parameters for the io function
  */
@@ -1101,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
 		mirror->pg_base = 0;
 		mirror->pg_recoalesce = 0;
 
-		desc->pg_moreio = 0;
-
 		while (!list_empty(&head)) {
 			struct nfs_page *req;
 
@@ -1189,6 +1186,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
  * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
  *				nfs_pageio_descriptor
  * @desc: pointer to io descriptor
+ * @mirror_idx: pointer to mirror index
  */
 static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
 				       u32 mirror_idx)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d47c18868..70bf706b1 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -35,6 +35,7 @@
 #include "iostat.h"
 #include "nfs4trace.h"
 #include "delegation.h"
+#include "nfs42.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
 #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -351,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
 {
 	struct pnfs_layout_segment *s;
 
-	if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+	if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
 		return false;
 
 	list_for_each_entry(s, &lo->plh_segs, pls_list)
@@ -361,6 +362,18 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
 	return true;
 }
 
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+		return false;
+	lo->plh_return_iomode = 0;
+	lo->plh_block_lgets++;
+	pnfs_get_layout_hdr(lo);
+	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
+	return true;
+}
+
 static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
 		struct pnfs_layout_hdr *lo, struct inode *inode)
 {
@@ -371,17 +384,16 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
 	if (pnfs_layout_need_return(lo, lseg)) {
 		nfs4_stateid stateid;
 		enum pnfs_iomode iomode;
+		bool send;
 
 		stateid = lo->plh_stateid;
 		iomode = lo->plh_return_iomode;
-		/* decreased in pnfs_send_layoutreturn() */
-		lo->plh_block_lgets++;
-		lo->plh_return_iomode = 0;
+		send = pnfs_prepare_layoutreturn(lo);
 		spin_unlock(&inode->i_lock);
-		pnfs_get_layout_hdr(lo);
-
-		/* Send an async layoutreturn so we dont deadlock */
-		pnfs_send_layoutreturn(lo, stateid, iomode, false);
+		if (send) {
+			/* Send an async layoutreturn so we dont deadlock */
+			pnfs_send_layoutreturn(lo, stateid, iomode, false);
+		}
 	} else
 		spin_unlock(&inode->i_lock);
 }
@@ -410,6 +422,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 		pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
 
 	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+			spin_unlock(&inode->i_lock);
+			return;
+		}
 		pnfs_get_layout_hdr(lo);
 		pnfs_layout_remove_lseg(lo, lseg);
 		spin_unlock(&inode->i_lock);
@@ -450,6 +466,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 	if (atomic_dec_and_test(&lseg->pls_refcount)) {
 		struct pnfs_layout_hdr *lo = lseg->pls_layout;
+		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+			return;
 		pnfs_get_layout_hdr(lo);
 		pnfs_layout_remove_lseg(lo, lseg);
 		pnfs_free_lseg_async(lseg);
@@ -923,6 +941,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
 	clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
 	smp_mb__after_atomic();
 	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
 
 static int
@@ -977,6 +996,7 @@ _pnfs_return_layout(struct inode *ino)
 	LIST_HEAD(tmp_list);
 	nfs4_stateid stateid;
 	int status = 0, empty;
+	bool send;
 
 	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
 
@@ -1006,17 +1026,18 @@ _pnfs_return_layout(struct inode *ino)
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
 		spin_unlock(&ino->i_lock);
-		pnfs_put_layout_hdr(lo);
 		dprintk("NFS: %s no layout segments to return\n", __func__);
-		goto out;
+		goto out_put_layout_hdr;
 	}
 
 	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-	lo->plh_block_lgets++;
+	send = pnfs_prepare_layoutreturn(lo);
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
-
-	status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+	if (send)
+		status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+out_put_layout_hdr:
+	pnfs_put_layout_hdr(lo);
 out:
 	dprintk("<-- %s status: %d\n", __func__, status);
 	return status;
@@ -1096,13 +1117,9 @@ bool pnfs_roc(struct inode *ino)
 out_noroc:
 	if (lo) {
 		stateid = lo->plh_stateid;
-		layoutreturn =
-			test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					   &lo->plh_flags);
-		if (layoutreturn) {
-			lo->plh_block_lgets++;
-			pnfs_get_layout_hdr(lo);
-		}
+		if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					   &lo->plh_flags))
+			layoutreturn = pnfs_prepare_layoutreturn(lo);
 	}
 	spin_unlock(&ino->i_lock);
 	if (layoutreturn) {
@@ -1145,15 +1162,18 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 	struct pnfs_layout_segment *lseg;
 	nfs4_stateid stateid;
 	u32 current_seqid;
-	bool found = false, layoutreturn = false;
+	bool layoutreturn = false;
 
 	spin_lock(&ino->i_lock);
-	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
-		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
-			rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-			found = true;
-			goto out;
-		}
+	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
+		if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
+			continue;
+		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+			continue;
+		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+		spin_unlock(&ino->i_lock);
+		return true;
+	}
 	lo = nfsi->layout;
 	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
@@ -1161,23 +1181,19 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 	 * a barrier, we choose the worst-case barrier.
 	 */
 	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-out:
-	if (!found) {
-		stateid = lo->plh_stateid;
-		layoutreturn =
-			test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					   &lo->plh_flags);
-		if (layoutreturn) {
-			lo->plh_block_lgets++;
-			pnfs_get_layout_hdr(lo);
-		}
-	}
+	stateid = lo->plh_stateid;
+	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+					   &lo->plh_flags))
+		layoutreturn = pnfs_prepare_layoutreturn(lo);
+	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+
 	spin_unlock(&ino->i_lock);
 	if (layoutreturn) {
-		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
 		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
+		return true;
 	}
-	return found;
+	return false;
 }
 
 /*
@@ -1694,7 +1710,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
 	spin_lock(&inode->i_lock);
 	/* set failure bit so that pnfs path will be retried later */
 	pnfs_layout_set_fail_bit(lo, iomode);
-	set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
 	if (lo->plh_return_iomode == 0)
 		lo->plh_return_iomode = range.iomode;
 	else if (lo->plh_return_iomode != range.iomode)
@@ -2206,13 +2221,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	if (ld->prepare_layoutcommit) {
 		status = ld->prepare_layoutcommit(&data->args);
 		if (status) {
+			put_rpccred(data->cred);
 			spin_lock(&inode->i_lock);
 			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
 			if (end_pos > nfsi->layout->plh_lwb)
 				nfsi->layout->plh_lwb = end_pos;
-			spin_unlock(&inode->i_lock);
-			put_rpccred(data->cred);
-			goto clear_layoutcommitting;
+			goto out_unlock;
 		}
 	}
 
@@ -2250,3 +2264,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 	}
 	return thp;
 }
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int
+pnfs_report_layoutstat(struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs42_layoutstat_data *data;
+	struct pnfs_layout_hdr *hdr;
+	int status = 0;
+
+	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
+		goto out;
+
+	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+		goto out;
+
+	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+		goto out;
+
+	spin_lock(&inode->i_lock);
+	if (!NFS_I(inode)->layout) {
+		spin_unlock(&inode->i_lock);
+		goto out;
+	}
+	hdr = NFS_I(inode)->layout;
+	pnfs_get_layout_hdr(hdr);
+	spin_unlock(&inode->i_lock);
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		status = -ENOMEM;
+		goto out_put;
+	}
+
+	data->args.fh = NFS_FH(inode);
+	data->args.inode = inode;
+	nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
+	status = ld->prepare_layoutstats(&data->args);
+	if (status)
+		goto out_free;
+
+	status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
+
+out:
+	dprintk("%s returns %d\n", __func__, status);
+	return status;
+
+out_free:
+	kfree(data);
+out_put:
+	pnfs_put_layout_hdr(hdr);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+	smp_mb__after_atomic();
+	goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
+#endif
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1e6308f82..3e6ab7bfb 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type {
 	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutcommit_args *args);
+	int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+	void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
 };
 
 struct pnfs_layout_hdr {
@@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 void pnfs_error_mark_layout_for_return(struct inode *inode,
 				       struct pnfs_layout_segment *lseg);
-
 /* nfs4_deviceid_flags */
 enum {
 	NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
@@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 
 #endif /* CONFIG_NFS_V4_1 */
 
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode);
+#else
+static inline int
+pnfs_report_layoutstat(struct inode *inode)
+{
+	return 0;
+}
+#endif
+
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f175b833b..aa62004f1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2847,7 +2847,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
 	*((unsigned int *)kp->arg) = num;
 	return 0;
 }
-static struct kernel_param_ops param_ops_portnr = {
+static const struct kernel_param_ops param_ops_portnr = {
 	.set = param_set_portnr,
 	.get = param_get_uint,
 };
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2d5620065..b6de433da 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -20,7 +20,6 @@
 #include <linux/stat.h>
 #include <linux/mm.h>
 #include <linux/string.h>
-#include <linux/namei.h>
 
 /* Symlink caching in the page cache is even more simplistic
  * and straight-forward than readdir caching.
@@ -43,7 +42,7 @@ error:
 	return -EIO;
 }
 
-static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct page *page;
@@ -51,19 +50,13 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 	err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
-		goto read_failed;
+		return err;
 	page = read_cache_page(&inode->i_data, 0,
 				(filler_t *)nfs_symlink_filler, inode);
-	if (IS_ERR(page)) {
-		err = page;
-		goto read_failed;
-	}
-	nd_set_link(nd, kmap(page));
-	return page;
-
-read_failed:
-	nd_set_link(nd, err);
-	return NULL;
+	if (IS_ERR(page))
+		return ERR_CAST(page);
+	*cookie = page;
+	return kmap(page);
 }
 
 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index daf355642..75a35a1af 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -853,7 +853,8 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
 	dec_zone_page_state(page, NR_UNSTABLE_NFS);
-	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+	dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+		    WB_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
@@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
-	/* do nothing! */
-}
-
 /*
  * Special version of should_remove_suid() that ignores capabilities.
  */
@@ -1383,24 +1379,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
 {
 	struct nfs_pgio_args *argp = &hdr->args;
 	struct nfs_pgio_res *resp = &hdr->res;
+	u64 size = argp->offset + resp->count;
 
 	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		fattr->size = size;
+	if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) {
+		fattr->valid &= ~NFS_ATTR_FATTR_SIZE;
 		return;
-	if (argp->offset + resp->count != fattr->size)
-		return;
-	if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode))
+	}
+	if (size != fattr->size)
 		return;
 	/* Set attribute barrier */
 	nfs_fattr_set_barrier(fattr);
+	/* ...and update size */
+	fattr->valid |= NFS_ATTR_FATTR_SIZE;
 }
 
 void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
 {
-	struct nfs_fattr *fattr = hdr->res.fattr;
+	struct nfs_fattr *fattr = &hdr->fattr;
 	struct inode *inode = hdr->inode;
 
-	if (fattr == NULL)
-		return;
 	spin_lock(&inode->i_lock);
 	nfs_writeback_check_extend(hdr, fattr);
 	nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
@@ -1556,7 +1555,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 	/* Set up the initial task struct.  */
 	nfs_ops->commit_setup(data, &msg);
 
-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+	dprintk("NFS: initiated commit call\n");
 
 	nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
 		NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
@@ -2013,7 +2012,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
 	.rw_mode		= FMODE_WRITE,
 	.rw_alloc_header	= nfs_writehdr_alloc,
 	.rw_free_header		= nfs_writehdr_free,
-	.rw_release		= nfs_writeback_release_common,
 	.rw_done		= nfs_writeback_done,
 	.rw_result		= nfs_writeback_result,
 	.rw_initiate		= nfs_initiate_write,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e4b2b4322..f6e7cbaba 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 
 static __be32
 compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
-		const char *name, int namlen)
+		 const char *name, int namlen, u64 ino)
 {
 	struct svc_export	*exp;
 	struct dentry		*dparent, *dchild;
@@ -830,19 +830,21 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		goto out;
 	if (d_really_is_negative(dchild))
 		goto out;
+	if (dchild->d_inode->i_ino != ino)
+		goto out;
 	rv = fh_compose(fhp, exp, dchild, &cd->fh);
 out:
 	dput(dchild);
 	return rv;
 }
 
-static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
 {
 	struct svc_fh	*fh = &cd->scratch;
 	__be32 err;
 
 	fh_init(fh, NFS3_FHSIZE);
-	err = compose_entry_fh(cd, fh, name, namlen);
+	err = compose_entry_fh(cd, fh, name, namlen, ino);
 	if (err) {
 		*p++ = 0;
 		*p++ = 0;
@@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
 		p = encode_entry_baggage(cd, p, name, namlen, ino);
 
 		if (plus)
-			p = encode_entryplus_baggage(cd, p, name, namlen);
+			p = encode_entryplus_baggage(cd, p, name, namlen, ino);
 		num_entry_words = p - cd->buffer;
 	} else if (*(page+1) != NULL) {
 		/* temporarily encode entry into next page, then move back to
@@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
 		p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
 
 		if (plus)
-			p1 = encode_entryplus_baggage(cd, p1, name, namlen);
+			p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
 
 		/* determine entry word length and lengths to go in pages */
 		num_entry_words = p1 - tmp;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 67242bf7c..eb5accf1b 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -52,10 +52,6 @@
 #define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE)
 #define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)
 
-/* We don't support these bits; insist they be neither allowed nor denied */
-#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \
-		| NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS)
-
 /* flags used to simulate posix default ACLs */
 #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
 		| NFS4_ACE_DIRECTORY_INHERIT_ACE)
@@ -64,9 +60,6 @@
 		| NFS4_ACE_INHERIT_ONLY_ACE \
 		| NFS4_ACE_IDENTIFIER_GROUP)
 
-#define MASK_EQUAL(mask1, mask2) \
-	( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
-
 static u32
 mask_from_posix(unsigned short perm, unsigned int flags)
 {
@@ -126,11 +119,6 @@ low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
 		*mode |= ACL_EXECUTE;
 }
 
-struct ace_container {
-	struct nfs4_ace  *ace;
-	struct list_head  ace_l;
-};
-
 static short ace2type(struct nfs4_ace *);
 static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
 				unsigned int);
@@ -384,7 +372,6 @@ pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
 static void
 sort_pacl_range(struct posix_acl *pacl, int start, int end) {
 	int sorted = 0, i;
-	struct posix_acl_entry tmp;
 
 	/* We just do a bubble sort; easy to do in place, and we're not
 	 * expecting acl's to be long enough to justify anything more. */
@@ -394,9 +381,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) {
 			if (pace_gt(&pacl->a_entries[i],
 				    &pacl->a_entries[i+1])) {
 				sorted = 0;
-				tmp = pacl->a_entries[i];
-				pacl->a_entries[i] = pacl->a_entries[i+1];
-				pacl->a_entries[i+1] = tmp;
+				swap(pacl->a_entries[i],
+				     pacl->a_entries[i + 1]);
 			}
 		}
 	}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5694cfb7a..a49201835 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -455,6 +455,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
 	if (unlikely(status || cb->cb_status))
 		return status;
 
+	cb->cb_update_seq_nr = true;
 	return decode_cb_sequence4resok(xdr, cb);
 }
 
@@ -875,6 +876,8 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	u32 minorversion = clp->cl_minorversion;
 
 	cb->cb_minorversion = minorversion;
+	cb->cb_update_seq_nr = false;
+	cb->cb_status = 0;
 	if (minorversion) {
 		if (!nfsd41_cb_get_slot(clp, task))
 			return;
@@ -891,9 +894,16 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 		clp->cl_minorversion);
 
 	if (clp->cl_minorversion) {
-		/* No need for lock, access serialized in nfsd4_cb_prepare */
-		if (!task->tk_status)
+		/*
+		 * No need for lock, access serialized in nfsd4_cb_prepare
+		 *
+		 * RFC5661 20.9.3
+		 * If CB_SEQUENCE returns an error, then the state of the slot
+		 * (sequence ID, cached reply) MUST NOT change.
+		 */
+		if (cb->cb_update_seq_nr)
 			++clp->cl_cb_session->se_cb_seq_nr;
+
 		clear_bit(0, &clp->cl_cb_slot_busy);
 		rpc_wake_up_next(&clp->cl_cb_waitq);
 		dprintk("%s: freed slot, new seqid=%d\n", __func__,
@@ -1090,6 +1100,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_ops = ops;
 	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
 	cb->cb_status = 0;
+	cb->cb_update_seq_nr = false;
 	cb->cb_need_restart = false;
 }
 
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 6904213a4..ebf90e487 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	BUG_ON(!ls->ls_file);
 
 	if (nfsd4_layout_setlease(ls)) {
+		fput(ls->ls_file);
 		put_nfs4_file(fp);
 		kmem_cache_free(nfs4_layout_stateid_cache, ls);
 		return NULL;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 864e2003e..90cfda753 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -760,8 +760,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	__be32 status;
 
-	/* no need to check permission - this will be done in nfsd_read() */
-
 	read->rd_filp = NULL;
 	if (read->rd_offset >= OFFSET_MAX)
 		return nfserr_inval;
@@ -778,9 +776,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
 
 	/* check stateid */
-	if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
-						 cstate, &read->rd_stateid,
-						 RD_STATE, &read->rd_filp))) {
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid,
+			RD_STATE, &read->rd_filp, &read->rd_tmp_file);
+	if (status) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
 	}
@@ -924,8 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	int err;
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
-		status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
-			&setattr->sa_stateid, WR_STATE, NULL);
+		status = nfs4_preprocess_stateid_op(rqstp, cstate,
+			&setattr->sa_stateid, WR_STATE, NULL, NULL);
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
 			return status;
@@ -986,13 +984,11 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	unsigned long cnt;
 	int nvecs;
 
-	/* no need to check permission - this will be done in nfsd_write() */
-
 	if (write->wr_offset >= OFFSET_MAX)
 		return nfserr_inval;
 
-	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
-					cstate, stateid, WR_STATE, &filp);
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE,
+			&filp, NULL);
 	if (status) {
 		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
 		return status;
@@ -1005,11 +1001,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nvecs = fill_in_write_vector(rqstp->rq_vec, write);
 	WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
 
-	status =  nfsd_write(rqstp, &cstate->current_fh, filp,
-			     write->wr_offset, rqstp->rq_vec, nvecs,
-			     &cnt, &write->wr_how_written);
-	if (filp)
-		fput(filp);
+	status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
+				write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
+				&write->wr_how_written);
+	fput(filp);
 
 	write->wr_bytes_written = cnt;
 
@@ -1023,15 +1018,13 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status = nfserr_notsupp;
 	struct file *file;
 
-	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+	status = nfs4_preprocess_stateid_op(rqstp, cstate,
 					    &fallocate->falloc_stateid,
-					    WR_STATE, &file);
+					    WR_STATE, &file, NULL);
 	if (status != nfs_ok) {
 		dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
 		return status;
 	}
-	if (!file)
-		return nfserr_bad_stateid;
 
 	status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
 				     fallocate->falloc_offset,
@@ -1064,15 +1057,13 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 	struct file *file;
 
-	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+	status = nfs4_preprocess_stateid_op(rqstp, cstate,
 					    &seek->seek_stateid,
-					    RD_STATE, &file);
+					    RD_STATE, &file, NULL);
 	if (status) {
 		dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
 		return status;
 	}
-	if (!file)
-		return nfserr_bad_stateid;
 
 	switch (seek->seek_whence) {
 	case NFS4_CONTENT_DATA:
@@ -1732,10 +1723,6 @@ encode_op:
 			be32_to_cpu(status));
 
 		nfsd4_cstate_clear_replay(cstate);
-		/* XXX Ugh, we need to get rid of this kind of special case: */
-		if (op->opnum == OP_READ && op->u.read.rd_filp)
-			fput(op->u.read.rd_filp);
-
 		nfsd4_increment_op_stats(op->opnum);
 	}
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6e13504f7..95202719a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3861,7 +3861,7 @@ static __be32
 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
 {
 	__be32 status;
-	unsigned char old_deny_bmap;
+	unsigned char old_deny_bmap = stp->st_deny_bmap;
 
 	if (!test_access(open->op_share_access, stp))
 		return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
@@ -3870,7 +3870,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
 	spin_lock(&fp->fi_lock);
 	status = nfs4_file_check_deny(fp, open->op_share_deny);
 	if (status == nfs_ok) {
-		old_deny_bmap = stp->st_deny_bmap;
 		set_deny(open->op_share_deny, stp);
 		fp->fi_share_deny |=
 				(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
@@ -4577,6 +4576,9 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 static struct file *
 nfs4_find_file(struct nfs4_stid *s, int flags)
 {
+	if (!s)
+		return NULL;
+
 	switch (s->sc_type) {
 	case NFS4_DELEG_STID:
 		if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
@@ -4605,27 +4607,63 @@ nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags)
 	return nfs4_check_openmode(ols, flags);
 }
 
+static __be32
+nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
+		struct file **filpp, bool *tmp_file, int flags)
+{
+	int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE;
+	struct file *file;
+	__be32 status;
+
+	file = nfs4_find_file(s, flags);
+	if (file) {
+		status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+				acc | NFSD_MAY_OWNER_OVERRIDE);
+		if (status) {
+			fput(file);
+			return status;
+		}
+
+		*filpp = file;
+	} else {
+		status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp);
+		if (status)
+			return status;
+
+		if (tmp_file)
+			*tmp_file = true;
+	}
+
+	return 0;
+}
+
 /*
  * Checks for stateid operations
  */
 __be32
-nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
-			   stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		int flags, struct file **filpp, bool *tmp_file)
 {
 	struct svc_fh *fhp = &cstate->current_fh;
 	struct inode *ino = d_inode(fhp->fh_dentry);
+	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-	struct nfs4_stid *s;
+	struct nfs4_stid *s = NULL;
 	__be32 status;
 
 	if (filpp)
 		*filpp = NULL;
+	if (tmp_file)
+		*tmp_file = false;
 
 	if (grace_disallows_io(net, ino))
 		return nfserr_grace;
 
-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-		return check_special_stateids(net, fhp, stateid, flags);
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
+		status = check_special_stateids(net, fhp, stateid, flags);
+		goto done;
+	}
 
 	status = nfsd4_lookup_stateid(cstate, stateid,
 				NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
@@ -4653,13 +4691,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
 		goto out;
 	status = nfs4_check_fh(fhp, s);
 
-	if (!status && filpp) {
-		*filpp = nfs4_find_file(s, flags);
-		if (!*filpp)
-			status = nfserr_serverfault;
-	}
+done:
+	if (!status && filpp)
+		status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags);
 out:
-	nfs4_put_stid(s);
+	if (s)
+		nfs4_put_stid(s);
 	return status;
 }
 
@@ -5512,7 +5549,7 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct
 	__be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 	if (!err) {
 		err = nfserrno(vfs_test_lock(file, lock));
-		nfsd_close(file);
+		fput(file);
 	}
 	return err;
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d4d84451e..75e0563c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -33,6 +33,7 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
@@ -2229,7 +2230,6 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 	u32 rdattr_err = 0;
 	__be32 status;
 	int err;
-	int aclsupport = 0;
 	struct nfs4_acl *acl = NULL;
 	void *context = NULL;
 	int contextlen;
@@ -2275,19 +2275,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 			goto out;
 		fhp = tempfh;
 	}
-	if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
-			| FATTR4_WORD0_SUPPORTED_ATTRS)) {
+	if (bmval0 & FATTR4_WORD0_ACL) {
 		err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
-		aclsupport = (err == 0);
-		if (bmval0 & FATTR4_WORD0_ACL) {
-			if (err == -EOPNOTSUPP)
-				bmval0 &= ~FATTR4_WORD0_ACL;
-			else if (err == -EINVAL) {
-				status = nfserr_attrnotsupp;
-				goto out;
-			} else if (err != 0)
-				goto out_nfserr;
-		}
+		if (err == -EOPNOTSUPP)
+			bmval0 &= ~FATTR4_WORD0_ACL;
+		else if (err == -EINVAL) {
+			status = nfserr_attrnotsupp;
+			goto out;
+		} else if (err != 0)
+			goto out_nfserr;
 	}
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
@@ -2339,7 +2335,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 		u32 word1 = nfsd_suppattrs1(minorversion);
 		u32 word2 = nfsd_suppattrs2(minorversion);
 
-		if (!aclsupport)
+		if (!IS_POSIXACL(dentry->d_inode))
 			word0 &= ~FATTR4_WORD0_ACL;
 		if (!contextsupport)
 			word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
@@ -2487,7 +2483,7 @@ out_acl:
 		p = xdr_reserve_space(xdr, 4);
 		if (!p)
 			goto out_resource;
-		*p++ = cpu_to_be32(aclsupport ?
+		*p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
 			ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
 	}
 	if (bmval0 & FATTR4_WORD0_CANSETTIME) {
@@ -3423,52 +3419,51 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	unsigned long maxcount;
 	struct xdr_stream *xdr = &resp->xdr;
 	struct file *file = read->rd_filp;
-	struct svc_fh *fhp = read->rd_fhp;
 	int starting_len = xdr->buf->len;
-	struct raparms *ra;
+	struct raparms *ra = NULL;
 	__be32 *p;
-	__be32 err;
 
 	if (nfserr)
-		return nfserr;
+		goto out;
 
 	p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
 	if (!p) {
 		WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
-		return nfserr_resource;
+		nfserr = nfserr_resource;
+		goto out;
 	}
-	if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
+	if (resp->xdr.buf->page_len &&
+	    test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
 		WARN_ON_ONCE(1);
-		return nfserr_resource;
+		nfserr = nfserr_resource;
+		goto out;
 	}
 	xdr_commit_encode(xdr);
 
 	maxcount = svc_max_payload(resp->rqstp);
-	maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+	maxcount = min_t(unsigned long, maxcount,
+			 (xdr->buf->buflen - xdr->buf->len));
 	maxcount = min_t(unsigned long, maxcount, read->rd_length);
 
-	if (read->rd_filp)
-		err = nfsd_permission(resp->rqstp, fhp->fh_export,
-				fhp->fh_dentry,
-				NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
-	else
-		err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
-						&file, &ra);
-	if (err)
-		goto err_truncate;
+	if (read->rd_tmp_file)
+		ra = nfsd_init_raparms(file);
 
-	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
-		err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+	if (file->f_op->splice_read &&
+	    test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
 	else
-		err = nfsd4_encode_readv(resp, read, file, maxcount);
+		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
 
-	if (!read->rd_filp)
-		nfsd_put_tmp_read_open(file, ra);
+	if (ra)
+		nfsd_put_raparams(file, ra);
 
-err_truncate:
-	if (err)
+	if (nfserr)
 		xdr_truncate_encode(xdr, starting_len);
-	return err;
+
+out:
+	if (file)
+		fput(file);
+	return nfserr;
 }
 
 static __be32
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index aecbcd34d..4cd78ef4c 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -59,13 +59,61 @@ static __be32
 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
 					  struct nfsd_attrstat  *resp)
 {
+	struct iattr *iap = &argp->attrs;
+	struct svc_fh *fhp;
 	__be32 nfserr;
+
 	dprintk("nfsd: SETATTR  %s, valid=%x, size=%ld\n",
 		SVCFH_fmt(&argp->fh),
 		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
 
-	fh_copy(&resp->fh, &argp->fh);
-	nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0);
+	fhp = fh_copy(&resp->fh, &argp->fh);
+
+	/*
+	 * NFSv2 does not differentiate between "set-[ac]time-to-now"
+	 * which only requires access, and "set-[ac]time-to-X" which
+	 * requires ownership.
+	 * So if it looks like it might be "set both to the same time which
+	 * is close to now", and if inode_change_ok fails, then we
+	 * convert to "set to now" instead of "set to explicit time"
+	 *
+	 * We only call inode_change_ok as the last test as technically
+	 * it is not an interface that we should be using.
+	 */
+#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
+#define	MAX_TOUCH_TIME_ERROR (30*60)
+	if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET &&
+	    iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) {
+		/*
+		 * Looks probable.
+		 *
+		 * Now just make sure time is in the right ballpark.
+		 * Solaris, at least, doesn't seem to care what the time
+		 * request is.  We require it be within 30 minutes of now.
+		 */
+		time_t delta = iap->ia_atime.tv_sec - get_seconds();
+		struct inode *inode;
+
+		nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+		if (nfserr)
+			goto done;
+		inode = d_inode(fhp->fh_dentry);
+
+		if (delta < 0)
+			delta = -delta;
+		if (delta < MAX_TOUCH_TIME_ERROR &&
+		    inode_change_ok(inode, iap) != 0) {
+			/*
+			 * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
+			 * This will cause notify_change to set these times
+			 * to "now"
+			 */
+			iap->ia_valid &= ~BOTH_TIME_SET;
+		}
+	}
+
+	nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time_t)0);
+done:
 	return nfsd_return_attrs(nfserr, resp);
 }
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index dbc4f85a5..4874ce515 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,6 +68,7 @@ struct nfsd4_callback {
 	struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
 	int cb_status;
+	bool cb_update_seq_nr;
 	bool cb_need_restart;
 };
 
@@ -582,9 +583,9 @@ enum nfsd4_cb_op {
 struct nfsd4_compound_state;
 struct nfsd_net;
 
-extern __be32 nfs4_preprocess_stateid_op(struct net *net,
-		struct nfsd4_compound_state *cstate,
-		stateid_t *stateid, int flags, struct file **filp);
+extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, stateid_t *stateid,
+		int flags, struct file **filp, bool *tmp_file);
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     stateid_t *stateid, unsigned char typemask,
 		     struct nfs4_stid **s, struct nfsd_net *nn);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 84d770be0..b5e077a6e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -302,42 +302,6 @@ commit_metadata(struct svc_fh *fhp)
 static void
 nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
 {
-	/*
-	 * NFSv2 does not differentiate between "set-[ac]time-to-now"
-	 * which only requires access, and "set-[ac]time-to-X" which
-	 * requires ownership.
-	 * So if it looks like it might be "set both to the same time which
-	 * is close to now", and if inode_change_ok fails, then we
-	 * convert to "set to now" instead of "set to explicit time"
-	 *
-	 * We only call inode_change_ok as the last test as technically
-	 * it is not an interface that we should be using.
-	 */
-#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
-#define	MAX_TOUCH_TIME_ERROR (30*60)
-	if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET &&
-	    iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) {
-		/*
-		 * Looks probable.
-		 *
-		 * Now just make sure time is in the right ballpark.
-		 * Solaris, at least, doesn't seem to care what the time
-		 * request is.  We require it be within 30 minutes of now.
-		 */
-		time_t delta = iap->ia_atime.tv_sec - get_seconds();
-		if (delta < 0)
-			delta = -delta;
-		if (delta < MAX_TOUCH_TIME_ERROR &&
-		    inode_change_ok(inode, iap) != 0) {
-			/*
-			 * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
-			 * This will cause notify_change to set these times
-			 * to "now"
-			 */
-			iap->ia_valid &= ~BOTH_TIME_SET;
-		}
-	}
-
 	/* sanitize the mode change */
 	if (iap->ia_valid & ATTR_MODE) {
 		iap->ia_mode &= S_IALLUGO;
@@ -538,16 +502,11 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			   struct file *file, loff_t offset, loff_t len,
 			   int flags)
 {
-	__be32 err;
 	int error;
 
 	if (!S_ISREG(file_inode(file)->i_mode))
 		return nfserr_inval;
 
-	err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE);
-	if (err)
-		return err;
-
 	error = vfs_fallocate(file, flags, offset, len);
 	if (!error)
 		error = commit_metadata(fhp);
@@ -744,7 +703,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 
 	host_err = ima_file_check(file, may_flags, 0);
 	if (host_err) {
-		nfsd_close(file);
+		fput(file);
 		goto out_nfserr;
 	}
 
@@ -761,23 +720,12 @@ out:
 	return err;
 }
 
-/*
- * Close a file.
- */
-void
-nfsd_close(struct file *filp)
-{
-	fput(filp);
-}
-
-/*
- * Obtain the readahead parameters for the file
- * specified by (dev, ino).
- */
-
-static inline struct raparms *
-nfsd_get_raparms(dev_t dev, ino_t ino)
+struct raparms *
+nfsd_init_raparms(struct file *file)
 {
+	struct inode *inode = file_inode(file);
+	dev_t dev = inode->i_sb->s_dev;
+	ino_t ino = inode->i_ino;
 	struct raparms	*ra, **rap, **frap = NULL;
 	int depth = 0;
 	unsigned int hash;
@@ -814,9 +762,23 @@ found:
 	ra->p_count++;
 	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
 	spin_unlock(&rab->pb_lock);
+
+	if (ra->p_set)
+		file->f_ra = ra->p_ra;
 	return ra;
 }
 
+void nfsd_put_raparams(struct file *file, struct raparms *ra)
+{
+	struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+
+	spin_lock(&rab->pb_lock);
+	ra->p_ra = file->f_ra;
+	ra->p_set = 1;
+	ra->p_count--;
+	spin_unlock(&rab->pb_lock);
+}
+
 /*
  * Grab and keep cached pages associated with a file in the svc_rqst
  * so that they can be passed to the network sendmsg/sendpage routines
@@ -945,7 +907,7 @@ static int wait_for_concurrent_writes(struct file *file)
 	return err;
 }
 
-static __be32
+__be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
 				unsigned long *cnt, int *stablep)
@@ -1009,40 +971,6 @@ out_nfserr:
 	return err;
 }
 
-__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp,
-		struct file **file, struct raparms **ra)
-{
-	struct inode *inode;
-	__be32 err;
-
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file);
-	if (err)
-		return err;
-
-	inode = file_inode(*file);
-
-	/* Get readahead parameters */
-	*ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-	if (*ra && (*ra)->p_set)
-		(*file)->f_ra = (*ra)->p_ra;
-	return nfs_ok;
-}
-
-void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra)
-{
-	/* Write back readahead params */
-	if (ra) {
-		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-		spin_lock(&rab->pb_lock);
-		ra->p_ra = file->f_ra;
-		ra->p_set = 1;
-		ra->p_count--;
-		spin_unlock(&rab->pb_lock);
-	}
-	nfsd_close(file);
-}
-
 /*
  * Read data from a file. count must contain the requested read count
  * on entry. On return, *count contains the number of bytes actually read.
@@ -1055,13 +983,15 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct raparms	*ra;
 	__be32 err;
 
-	err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra);
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 	if (err)
 		return err;
 
+	ra = nfsd_init_raparms(file);
 	err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
-
-	nfsd_put_tmp_read_open(file, ra);
+	if (ra)
+		nfsd_put_raparams(file, ra);
+	fput(file);
 
 	return err;
 }
@@ -1093,7 +1023,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		if (cnt)
 			err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
 					     cnt, stablep);
-		nfsd_close(file);
+		fput(file);
 	}
 out:
 	return err;
@@ -1138,7 +1068,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			err = nfserr_notsupp;
 	}
 
-	nfsd_close(file);
+	fput(file);
 out:
 	return err;
 }
@@ -1977,7 +1907,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
 out_close:
-	nfsd_close(file);
+	fput(file);
 out:
 	return err;
 }
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2050cb016..5be875e3e 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -71,11 +71,7 @@ __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 #endif /* CONFIG_NFSD_V3 */
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
 				int, struct file **);
-void		nfsd_close(struct file *);
 struct raparms;
-__be32		nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
-				struct file **, struct raparms **);
-void		nfsd_put_tmp_read_open(struct file *, struct raparms *);
 __be32		nfsd_splice_read(struct svc_rqst *,
 				struct file *, loff_t, unsigned long *);
 __be32		nfsd_readv(struct file *, loff_t, struct kvec *, int,
@@ -84,6 +80,10 @@ __be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
 				loff_t, struct kvec *, int, unsigned long *);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
 				loff_t, struct kvec *,int, unsigned long *, int *);
+__be32		nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+				struct file *file, loff_t offset,
+				struct kvec *vec, int vlen, unsigned long *cnt,
+				int *stablep);
 __be32		nfsd_readlink(struct svc_rqst *, struct svc_fh *,
 				char *, int *);
 __be32		nfsd_symlink(struct svc_rqst *, struct svc_fh *,
@@ -104,6 +104,9 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
+struct raparms *nfsd_init_raparms(struct file *file);
+void		nfsd_put_raparams(struct file *file, struct raparms *ra);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 2f8c092be..9f991007a 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,6 +273,7 @@ struct nfsd4_read {
 	u32		rd_length;          /* request */
 	int		rd_vlen;
 	struct file     *rd_filp;
+	bool		rd_tmp_file;
 	
 	struct svc_rqst *rd_rqstp;          /* response */
 	struct svc_fh * rd_fhp;             /* response */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0ee0bed36..6b8b92b19 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 258d9fe25..4a73d6dff 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -307,31 +307,13 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
 static ssize_t
 nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t size;
+	struct inode *inode = file_inode(iocb->ki_filp);
 
 	if (iov_iter_rw(iter) == WRITE)
 		return 0;
 
 	/* Needs synchronization with the cleaner */
-	size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
-
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again.
-	 */
-	if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) {
-		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
-
-		if (end > isize)
-			nilfs_write_failed(mapping, end);
-	}
-
-	return size;
+	return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
 }
 
 const struct address_space_operations nilfs_aops = {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 9a20e513d..aba43811d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1369,7 +1369,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case NILFS_IOCTL_SYNC:
 	case NILFS_IOCTL_RESIZE:
 	case NILFS_IOCTL_SET_ALLOC_RANGE:
-	case FITRIM:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 22180836e..37dd6b05b 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -496,8 +496,7 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 {
 	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
 
-	if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
-	     fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+	if (fh_len < NILFS_FID_SIZE_NON_CONNECTABLE ||
 	    (fh_type != FILEID_NILFS_WITH_PARENT &&
 	     fh_type != FILEID_NILFS_WITHOUT_PARENT))
 		return NULL;
@@ -510,7 +509,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
 {
 	struct nilfs_fid *fid = (struct nilfs_fid *)fh;
 
-	if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+	if (fh_len < NILFS_FID_SIZE_CONNECTABLE ||
 	    fh_type != FILEID_NILFS_WITH_PARENT)
 		return NULL;
 
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc3a9efda..42468e5ab 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
-	if (err == -EOPNOTSUPP) {
-		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		/* to be detected by nilfs_segbuf_submit_bio() */
-	}
-
 	if (!uptodate)
 		atomic_inc(&segbuf->sb_err);
 
@@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 
 	bio->bi_end_io = nilfs_end_bio_write;
 	bio->bi_private = segbuf;
-	bio_get(bio);
 	submit_bio(mode, bio);
 	segbuf->sb_nbio++;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-		bio_put(bio);
-		err = -EOPNOTSUPP;
-		goto failed;
-	}
-	bio_put(bio);
 
 	wi->bio = NULL;
 	wi->rest_blocks -= wi->end - wi->start;
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 06ca6bc4a..d16b62cb2 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,7 +22,6 @@
 #include <linux/srcu.h>
 #include <linux/rculist.h>
 #include <linux/wait.h>
-#include <linux/module.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
@@ -73,7 +72,6 @@ void fsnotify_get_group(struct fsnotify_group *group)
 {
 	atomic_inc(&group->refcnt);
 }
-EXPORT_SYMBOL(fsnotify_get_group);
 
 /*
  * Drop a reference to a group.  Free it if it's through.
@@ -83,7 +81,6 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	if (atomic_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
-EXPORT_SYMBOL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -112,7 +109,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
-EXPORT_SYMBOL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 450648697..5b1e2a497 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -26,7 +26,7 @@
 #include <linux/fs.h> /* struct inode */
 #include <linux/fsnotify_backend.h>
 #include <linux/idr.h>
-#include <linux/init.h> /* module_init */
+#include <linux/init.h> /* fs_initcall */
 #include <linux/inotify.h>
 #include <linux/kernel.h> /* roundup() */
 #include <linux/namei.h> /* LOOKUP_FOLLOW */
@@ -812,4 +812,4 @@ static int __init inotify_user_setup(void)
 
 	return 0;
 }
-module_init(inotify_user_setup);
+fs_initcall(inotify_user_setup);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39a838ad2..39ddcaf09 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -109,7 +109,6 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 		mark->free_mark(mark);
 	}
 }
-EXPORT_SYMBOL(fsnotify_put_mark);
 
 /* Calculate mask of events for a list of marks */
 u32 fsnotify_recalc_mask(struct hlist_head *head)
@@ -203,7 +202,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	fsnotify_destroy_mark_locked(mark, group);
 	mutex_unlock(&group->mark_mutex);
 }
-EXPORT_SYMBOL(fsnotify_destroy_mark);
 
 /*
  * Destroy all marks in the given list. The marks must be already detached from
@@ -378,7 +376,6 @@ err:
 
 	return ret;
 }
-EXPORT_SYMBOL(fsnotify_add_mark);
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
 		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
@@ -478,7 +475,6 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	atomic_set(&mark->refcnt, 1);
 	mark->free_mark = free_mark;
 }
-EXPORT_SYMBOL(fsnotify_init_mark);
 
 static int fsnotify_mark_destroy(void *ignored)
 {
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7bb487e66..262561fea 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
 	base_ni = ni;
 	if (NInoAttr(ni))
 		base_ni = ni->ext.base_ntfs_ino;
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (unlikely(err))
 		goto out;
 	/*
@@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 				}
 			}
 			err = add_to_page_cache_lru(*cached_page, mapping,
-					index, GFP_KERNEL);
+					index,
+					GFP_KERNEL & mapping_gfp_mask(mapping));
 			if (unlikely(err)) {
 				if (err == -EEXIST)
 					continue;
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 76b6cfb57..b3c3469de 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -239,7 +239,7 @@ typedef struct {
  */
 static inline ntfs_inode *NTFS_I(struct inode *inode)
 {
-	return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode);
+	return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
 }
 
 static inline struct inode *VFS_I(ntfs_inode *ni)
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index a44b14cbc..ab172e5f5 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 
 static inline void ntfs_free(void *addr)
 {
-	if (!is_vmalloc_addr(addr)) {
-		kfree(addr);
-		/* free_page((unsigned long)addr); */
-		return;
-	}
-	vfree(addr);
+	kvfree(addr);
 }
 
 #endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 0f35b80d1..443abecf0 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -35,7 +35,7 @@
  * ntfs_lookup - find the inode represented by a dentry in a directory inode
  * @dir_ino:	directory inode in which to look for the inode
  * @dent:	dentry representing the inode to look for
- * @nd:		lookup nameidata
+ * @flags:	lookup flags
  *
  * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
  * in the directory inode @dir_ino and if found attaches the inode to the
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2d7f76e52..5997c00a1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
 	struct ocfs2_path *right_path = NULL;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 
-	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+	if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
+		return 0;
 
 	*empty_extent_path = NULL;
 
@@ -4311,13 +4312,13 @@ out:
 	return ret;
 }
 
-static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 			       struct ocfs2_path *path,
 			       struct ocfs2_extent_list *el, int index,
-			       struct ocfs2_extent_rec *split_rec)
+			       struct ocfs2_extent_rec *split_rec,
+			       struct ocfs2_merge_ctxt *ctxt)
 {
-	int status;
+	int status = 0;
 	enum ocfs2_contig_type ret = CONTIG_NONE;
 	u32 left_cpos, right_cpos;
 	struct ocfs2_extent_rec *rec = NULL;
@@ -4336,8 +4337,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 
 		if (left_cpos != 0) {
 			left_path = ocfs2_new_path_from_path(path);
-			if (!left_path)
+			if (!left_path) {
+				status = -ENOMEM;
+				mlog_errno(status);
 				goto exit;
+			}
 
 			status = ocfs2_find_path(et->et_ci, left_path,
 						 left_cpos);
@@ -4392,8 +4396,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 			goto free_left_path;
 
 		right_path = ocfs2_new_path_from_path(path);
-		if (!right_path)
+		if (!right_path) {
+			status = -ENOMEM;
+			mlog_errno(status);
 			goto free_left_path;
+		}
 
 		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
 		if (status)
@@ -4433,7 +4440,10 @@ free_right_path:
 free_left_path:
 	ocfs2_free_path(left_path);
 exit:
-	return ret;
+	if (status == 0)
+		ctxt->c_contig_type = ret;
+
+	return status;
 }
 
 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -5039,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle,
 		goto out;
 	}
 
-	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
-							    split_index,
-							    split_rec);
+	ret = ocfs2_figure_merge_contig_type(et, path, el,
+					     split_index,
+					     split_rec,
+					     &ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	/*
 	 * The core merge / split code wants to know how much room is
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9ea701270..0f5fd9db8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 	unsigned long len = bh_result->b_size;
-	unsigned int clusters_to_alloc = 0;
+	unsigned int clusters_to_alloc = 0, contig_clusters = 0;
 
 	cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
 
@@ -560,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		/* fill hole, allocate blocks can't be larger than the size
 		 * of the hole */
 		clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
-		if (clusters_to_alloc > contig_blocks)
-			clusters_to_alloc = contig_blocks;
+		contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
+				contig_blocks);
+		if (clusters_to_alloc > contig_clusters)
+			clusters_to_alloc = contig_clusters;
 
 		/* allocate extent and insert them into the extent tree */
 		ret = ocfs2_extend_allocation(inode, cpos,
@@ -619,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 
-	if (ocfs2_iocb_is_sem_locked(iocb))
-		ocfs2_iocb_clear_sem_locked(iocb);
-
 	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
 		ocfs2_iocb_clear_unaligned_aio(iocb);
 
@@ -925,13 +924,23 @@ clean_orphan:
 		int update_isize = written > 0 ? 1 : 0;
 		loff_t end = update_isize ? offset + written : 0;
 
-		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+		tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
 				update_isize, end);
 		if (tmp_ret < 0) {
 			ret = tmp_ret;
+			mlog_errno(ret);
 			goto out;
 		}
 
+		ocfs2_inode_unlock(inode, 1);
+
 		tmp_ret = jbd2_journal_force_commit(journal);
 		if (tmp_ret < 0) {
 			ret = tmp_ret;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index dd59599b0..24e496d6b 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
 enum ocfs2_iocb_lock_bits {
 	OCFS2_IOCB_RW_LOCK = 0,
 	OCFS2_IOCB_RW_LOCK_LEVEL,
-	OCFS2_IOCB_SEM,
 	OCFS2_IOCB_UNALIGNED_IO,
 	OCFS2_IOCB_NUM_LOCKS
 };
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits {
 	clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_rw_locked_level(iocb) \
 	test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_sem_locked(iocb) \
-	set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_sem_locked(iocb) \
-	clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_sem_locked(iocb) \
-	test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 
 #define ocfs2_iocb_set_unaligned_aio(iocb) \
 	set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index af7598bff..dfe162f5f 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
 	return count;
 }
 
+void __mlog_printk(const u64 *mask, const char *func, int line,
+		   const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	const char *level;
+	const char *prefix = "";
+
+	if (!__mlog_test_u64(*mask, mlog_and_bits) ||
+	    __mlog_test_u64(*mask, mlog_not_bits))
+		return;
+
+	if (*mask & ML_ERROR) {
+		level = KERN_ERR;
+		prefix = "ERROR: ";
+	} else if (*mask & ML_NOTICE) {
+		level = KERN_NOTICE;
+	} else {
+		level = KERN_INFO;
+	}
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%s(%s,%u,%u):%s:%d %s%pV",
+	       level, current->comm, task_pid_nr(current),
+	       raw_smp_processor_id(), func, line, prefix, &vaf);
+
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(__mlog_printk);
+
 struct mlog_attribute {
 	struct attribute attr;
 	u64 mask;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7fdc25a4d..308ea0eb3 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 
 #endif
 
-/*
- * smp_processor_id() "helpfully" screams when called outside preemptible
- * regions in current kernels.  sles doesn't have the variants that don't
- * scream.  just do this instead of trying to guess which we're building
- * against.. *sigh*.
- */
-#define __mlog_cpu_guess ({		\
-	unsigned long _cpu = get_cpu();	\
-	put_cpu();			\
-	_cpu;				\
-})
+__printf(4, 5)
+void __mlog_printk(const u64 *m, const char *func, int line,
+		   const char *fmt, ...);
 
-/* In the following two macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
+/*
+ * Testing before the __mlog_printk call lets the compiler eliminate the
+ * call completely when (m & ML_ALLOWED_BITS) is 0.
  */
-#define __mlog_printk(level, fmt, args...)				\
-	printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm,		\
-	       task_pid_nr(current), __mlog_cpu_guess,			\
-	       __PRETTY_FUNCTION__, __LINE__ , ##args)
-
-#define mlog(mask, fmt, args...) do {					\
-	u64 __m = MLOG_MASK_PREFIX | (mask);				\
-	if ((__m & ML_ALLOWED_BITS) &&					\
-	    __mlog_test_u64(__m, mlog_and_bits) &&			\
-	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
-		if (__m & ML_ERROR)					\
-			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
-		else if (__m & ML_NOTICE)				\
-			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
-		else __mlog_printk(KERN_INFO, fmt , ##args);		\
-	}								\
+#define mlog(mask, fmt, ...)						\
+do {									\
+	u64 _m = MLOG_MASK_PREFIX | (mask);				\
+	if (_m & ML_ALLOWED_BITS)					\
+		__mlog_printk(&_m, __func__, __LINE__, fmt,		\
+			      ##__VA_ARGS__);				\
 } while (0)
 
 #define mlog_errno(st) ({						\
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 56c403a56..2d0acd667 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2204,7 +2204,7 @@ out:
 	kfree(o2net_hand);
 	kfree(o2net_keep_req);
 	kfree(o2net_keep_resp);
-
+	o2net_debugfs_exit();
 	o2quo_exit();
 	return -ENOMEM;
 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ccd4dcfc3..02878a83f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct ocfs2_dir_entry *de, *de1;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
 	struct super_block *sb = dir->i_sb;
-	int retval, status;
+	int retval;
 	unsigned int size = sb->s_blocksize;
 	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
@@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle,
 			}
 
 			if (insert_bh == parent_fe_bh)
-				status = ocfs2_journal_access_di(handle,
+				retval = ocfs2_journal_access_di(handle,
 								 INODE_CACHE(dir),
 								 insert_bh,
 								 OCFS2_JOURNAL_ACCESS_WRITE);
 			else {
-				status = ocfs2_journal_access_db(handle,
+				retval = ocfs2_journal_access_db(handle,
 								 INODE_CACHE(dir),
 								 insert_bh,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 
-				if (ocfs2_dir_indexed(dir)) {
-					status = ocfs2_dx_dir_insert(dir,
+				if (!retval && ocfs2_dir_indexed(dir))
+					retval = ocfs2_dx_dir_insert(dir,
 								handle,
 								lookup);
-					if (status) {
-						mlog_errno(status);
-						goto bail;
-					}
-				}
+			}
+
+			if (retval) {
+				mlog_errno(retval);
+				goto bail;
 			}
 
 			/* By now the buffer is marked for journaling */
@@ -3543,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size)
 {
 	struct ocfs2_dx_entry *entry1 = a;
 	struct ocfs2_dx_entry *entry2 = b;
-	struct ocfs2_dx_entry tmp;
 
 	BUG_ON(size != sizeof(*entry1));
 
-	tmp = *entry1;
-	*entry1 = *entry2;
-	*entry2 = tmp;
+	swap(*entry1, *entry2);
 }
 
 static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fae17c640..e88ccf8c8 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 
 /* will exit holding res->spinlock, but may drop in function */
 void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
-void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
 
 /* will exit holding res->spinlock, but may drop in function */
 static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd..719f7f4c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -37,6 +37,7 @@
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <cluster/masklog.h>
 
@@ -2250,7 +2251,7 @@ out:
 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 				    struct iov_iter *from)
 {
-	int direct_io, appending, rw_level, have_alloc_sem  = 0;
+	int direct_io, appending, rw_level;
 	int can_do_direct, has_refcount = 0;
 	ssize_t written = 0;
 	ssize_t ret;
@@ -2279,16 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 
 	mutex_lock(&inode->i_mutex);
 
-	ocfs2_iocb_clear_sem_locked(iocb);
-
 relock:
-	/* to match setattr's i_mutex -> rw_lock ordering */
-	if (direct_io) {
-		have_alloc_sem = 1;
-		/* communicate with ocfs2_dio_end_io */
-		ocfs2_iocb_set_sem_locked(iocb);
-	}
-
 	/*
 	 * Concurrent O_DIRECT writes are allowed with
 	 * mount_option "coherency=buffered".
@@ -2298,7 +2290,7 @@ relock:
 	ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto out_sems;
+		goto out_mutex;
 	}
 
 	/*
@@ -2347,7 +2339,6 @@ relock:
 	if (direct_io && !can_do_direct) {
 		ocfs2_rw_unlock(inode, rw_level);
 
-		have_alloc_sem = 0;
 		rw_level = -1;
 
 		direct_io = 0;
@@ -2416,7 +2407,6 @@ no_sync:
 	 */
 	if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
 		rw_level = -1;
-		have_alloc_sem = 0;
 		unaligned_dio = 0;
 	}
 
@@ -2429,10 +2419,7 @@ out:
 	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);
 
-out_sems:
-	if (have_alloc_sem)
-		ocfs2_iocb_clear_sem_locked(iocb);
-
+out_mutex:
 	mutex_unlock(&inode->i_mutex);
 
 	if (written)
@@ -2473,7 +2460,7 @@ bail:
 static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 				   struct iov_iter *to)
 {
-	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
+	int ret = 0, rw_level = -1, lock_level = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = file_inode(filp);
 
@@ -2490,16 +2477,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 		goto bail;
 	}
 
-	ocfs2_iocb_clear_sem_locked(iocb);
-
 	/*
 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
 	 * need locks to protect pending reads from racing with truncate.
 	 */
 	if (iocb->ki_flags & IOCB_DIRECT) {
-		have_alloc_sem = 1;
-		ocfs2_iocb_set_sem_locked(iocb);
-
 		ret = ocfs2_rw_lock(inode, 0);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -2535,13 +2517,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 	/* see ocfs2_file_write_iter */
 	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
 		rw_level = -1;
-		have_alloc_sem = 0;
 	}
 
 bail:
-	if (have_alloc_sem)
-		ocfs2_iocb_clear_sem_locked(iocb);
-
 	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);
 
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 53e6c40ed..3cb097ccc 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -980,7 +980,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
-	case FITRIM:
 		break;
 	case OCFS2_IOC_REFLINK:
 		if (copy_from_user(&args, argp, sizeof(args)))
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff5319282..7c099f703 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -108,7 +108,7 @@ struct ocfs2_replay_map {
 	unsigned char rm_replay_slots[0];
 };
 
-void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
 {
 	if (!osb->replay_map)
 		return;
@@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
 	return 0;
 }
 
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
 		enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
 	replay_map->rm_state = REPLAY_DONE;
 }
 
-void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
 {
 	struct ocfs2_replay_map *replay_map = osb->replay_map;
 
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
 	     (unsigned long)bh,
 	     (unsigned long long)bh->b_blocknr);
 
-	/* We aren't guaranteed to have the superblock here - but if we
-	 * don't, it'll just crash. */
-	ocfs2_error(bh->b_assoc_map->host->i_sb,
+	ocfs2_error(bh->b_bdev->bd_super,
 		    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
 }
 
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 	trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
-	BUG_ON(status);
+	if (status) {
+		mlog_errno(status);
+		if (!is_handle_aborted(handle)) {
+			journal_t *journal = handle->h_transaction->t_journal;
+			struct super_block *sb = bh->b_bdev->bd_super;
+
+			mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
+					"Aborting transaction and journal.\n");
+			handle->h_err = status;
+			jbd2_journal_abort_handle(handle);
+			jbd2_journal_abort(journal, status);
+			ocfs2_abort(sb, "Journal already aborted.\n");
+		}
+	}
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
  * hasn't happened.  The node queues a scan and increments the
  * sequence number in the LVB.
  */
-void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 {
 	struct ocfs2_orphan_scan *os;
 	int status, i;
@@ -1933,7 +1944,7 @@ out:
 }
 
 /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
-void ocfs2_orphan_scan_work(struct work_struct *work)
+static void ocfs2_orphan_scan_work(struct work_struct *work)
 {
 	struct ocfs2_orphan_scan *os;
 	struct ocfs2_super *osb;
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	struct inode *inode = NULL;
 	struct inode *iter;
 	struct ocfs2_inode_info *oi;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
 
 	trace_ocfs2_recover_orphans(slot);
 
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		iter = oi->ip_next_orphan;
 		oi->ip_next_orphan = NULL;
 
+		ret = ocfs2_rw_lock(inode, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto next;
+		}
 		/*
 		 * We need to take and drop the inode lock to
 		 * force read inode from disk.
 		 */
-		ret = ocfs2_inode_lock(inode, NULL, 0);
+		ret = ocfs2_inode_lock(inode, &di_bh, 1);
 		if (ret) {
 			mlog_errno(ret);
-			goto next;
+			goto unlock_rw;
 		}
-		ocfs2_inode_unlock(inode, 0);
+
+		di = (struct ocfs2_dinode *)di_bh->b_data;
 
 		if (inode->i_nlink == 0) {
 			spin_lock(&oi->ip_lock);
@@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 			 * ocfs2_delete_inode. */
 			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
 			spin_unlock(&oi->ip_lock);
-		} else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
-			struct buffer_head *di_bh = NULL;
-
-			ret = ocfs2_rw_lock(inode, 1);
-			if (ret) {
-				mlog_errno(ret);
-				goto next;
-			}
-
-			ret = ocfs2_inode_lock(inode, &di_bh, 1);
-			if (ret < 0) {
-				ocfs2_rw_unlock(inode, 1);
-				mlog_errno(ret);
-				goto next;
-			}
-
+		} else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+				(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
 			ret = ocfs2_truncate_file(inode, di_bh,
 					i_size_read(inode));
-			ocfs2_inode_unlock(inode, 1);
-			ocfs2_rw_unlock(inode, 1);
-			brelse(di_bh);
 			if (ret < 0) {
 				if (ret != -ENOSPC)
 					mlog_errno(ret);
-				goto next;
+				goto unlock_inode;
 			}
 
-			ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+			ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
 			if (ret)
 				mlog_errno(ret);
 
 			wake_up(&OCFS2_I(inode)->append_dio_wq);
 		} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-
+unlock_inode:
+		ocfs2_inode_unlock(inode, 1);
+unlock_rw:
+		ocfs2_rw_unlock(inode, 1);
 next:
 		iput(inode);
-
+		brelse(di_bh);
+		di_bh = NULL;
 		inode = iter;
 	}
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 176fe6afd..6e6abb93f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	int inode1_is_ancestor, inode2_is_ancestor;
 	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
 	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
-	struct buffer_head **tmpbh;
-	struct inode *tmpinode;
 
 	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
 				(unsigned long long)oi2->ip_blkno);
@@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 				(oi1->ip_blkno < oi2->ip_blkno &&
 				inode2_is_ancestor == 0)) {
 			/* switch id1 and id2 around */
-			tmpbh = bh2;
-			bh2 = bh1;
-			bh1 = tmpbh;
-
-			tmpinode = inode2;
-			inode2 = inode1;
-			inode1 = tmpinode;
+			swap(bh2, bh1);
+			swap(inode2, inode1);
 		}
 		/* lock id2 */
 		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
@@ -2670,30 +2663,22 @@ bail:
 }
 
 int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
-		struct inode *inode, int update_isize,
-		loff_t end)
+		struct inode *inode, struct buffer_head *di_bh,
+		int update_isize, loff_t end)
 {
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
-	struct buffer_head *di_bh = NULL;
-	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle = NULL;
 	int status = 0;
 
-	status = ocfs2_inode_lock(inode, &di_bh, 1);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 			ORPHAN_DIR_SYSTEM_INODE,
 			le16_to_cpu(di->i_dio_orphaned_slot));
 	if (!orphan_dir_inode) {
 		status = -ENOENT;
 		mlog_errno(status);
-		goto bail_unlock_inode;
+		goto bail;
 	}
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
@@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
 		mutex_unlock(&orphan_dir_inode->i_mutex);
 		iput(orphan_dir_inode);
 		mlog_errno(status);
-		goto bail_unlock_inode;
+		goto bail;
 	}
 
 	handle = ocfs2_start_trans(osb,
@@ -2749,10 +2734,6 @@ bail_unlock_orphan:
 	brelse(orphan_dir_bh);
 	iput(orphan_dir_inode);
 
-bail_unlock_inode:
-	ocfs2_inode_unlock(inode, 1);
-	brelse(di_bh);
-
 bail:
 	return status;
 }
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 5ddecce17..e173329eb 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
 		struct inode *inode);
 int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
-		struct inode *inode, int update_isize,
-		loff_t end);
+		struct inode *inode, struct buffer_head *di_bh,
+		int update_isize, loff_t end);
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 				   struct inode *new_inode,
 				   struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 460c6c37e..690ddc601 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
 	return (u64)clusters << c_to_b_bits;
 }
 
+static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
+		u64 blocks)
+{
+	int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+			sb->s_blocksize_bits;
+
+	blocks += (1 << b_to_c_bits) - 1;
+	return (u32)(blocks >> b_to_c_bits);
+}
+
 static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
 					   u64 blocks)
 {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d8c6af101..b69dd14c0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
 
 static void swap_refcount_rec(void *a, void *b, int size)
 {
-	struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+	struct ocfs2_refcount_rec *l = a, *r = b;
 
-	tmp = *l;
-	*l = *r;
-	*r = tmp;
+	swap(*l, *r);
 }
 
 /*
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03bfbf3d..889f3796a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7271,7 +7271,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
 			       name, value, size, flags);
 }
 
-int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
 		     void *fs_info)
 {
 	const struct xattr *xattr;
diff --git a/fs/open.c b/fs/open.c
index 9d175dbea..e09950bae 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -54,8 +54,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		newattrs.ia_valid |= ATTR_FILE;
 	}
 
-	/* Remove suid/sgid on truncate too */
-	ret = should_remove_suid(dentry);
+	/* Remove suid, sgid, and file capabilities on truncate too */
+	ret = dentry_needs_remove_privs(dentry);
+	if (ret < 0)
+		return ret;
 	if (ret)
 		newattrs.ia_valid |= ret | ATTR_FORCE;
 
@@ -65,7 +67,6 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	mutex_unlock(&dentry->d_inode->i_mutex);
 	return ret;
 }
-EXPORT_SYMBOL(do_truncate);
 
 long vfs_truncate(struct path *path, loff_t length)
 {
@@ -371,7 +372,7 @@ retry:
 	if (res)
 		goto out;
 
-	inode = path.dentry->d_inode;
+	inode = d_backing_inode(path.dentry);
 
 	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		/*
@@ -680,21 +681,20 @@ int open_check_o_direct(struct file *f)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(open_check_o_direct);
 
 static int do_dentry_open(struct file *f,
+			  struct inode *inode,
 			  int (*open)(struct inode *, struct file *),
 			  const struct cred *cred)
 {
 	static const struct file_operations empty_fops = {};
-	struct inode *inode;
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 
 	path_get(&f->f_path);
-	inode = f->f_inode = f->f_path.dentry->d_inode;
+	f->f_inode = inode;
 	f->f_mapping = inode->i_mapping;
 
 	if (unlikely(f->f_flags & O_PATH)) {
@@ -798,7 +798,8 @@ int finish_open(struct file *file, struct dentry *dentry,
 	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 
 	file->f_path.dentry = dentry;
-	error = do_dentry_open(file, open, current_cred());
+	error = do_dentry_open(file, d_backing_inode(dentry), open,
+			       current_cred());
 	if (!error)
 		*opened |= FILE_OPENED;
 
@@ -827,6 +828,34 @@ int finish_no_open(struct file *file, struct dentry *dentry)
 }
 EXPORT_SYMBOL(finish_no_open);
 
+char *file_path(struct file *filp, char *buf, int buflen)
+{
+	return d_path(&filp->f_path, buf, buflen);
+}
+EXPORT_SYMBOL(file_path);
+
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @file: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *file,
+	     const struct cred *cred)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *inode = dentry->d_inode;
+
+	file->f_path = *path;
+	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
+		inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+	}
+
+	return do_dentry_open(file, inode, NULL, cred);
+}
+
 struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *cred)
 {
@@ -858,26 +887,6 @@ struct file *dentry_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(dentry_open);
 
-/**
- * vfs_open - open the file at the given path
- * @path: path to open
- * @filp: newly allocated file with f_flag initialized
- * @cred: credentials to use
- */
-int vfs_open(const struct path *path, struct file *filp,
-	     const struct cred *cred)
-{
-	struct inode *inode = path->dentry->d_inode;
-
-	if (inode->i_op->dentry_open)
-		return inode->i_op->dentry_open(path->dentry, filp, cred);
-	else {
-		filp->f_path = *path;
-		return do_dentry_open(filp, NULL, cred);
-	}
-}
-EXPORT_SYMBOL(vfs_open);
-
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 04f124884..d9da5a4e9 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -140,11 +140,12 @@ struct ovl_link_data {
 	void *cookie;
 };
 
-static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
 {
-	void *ret;
 	struct dentry *realdentry;
 	struct inode *realinode;
+	struct ovl_link_data *data = NULL;
+	const char *ret;
 
 	realdentry = ovl_dentry_real(dentry);
 	realinode = realdentry->d_inode;
@@ -152,28 +153,28 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (WARN_ON(!realinode->i_op->follow_link))
 		return ERR_PTR(-EPERM);
 
-	ret = realinode->i_op->follow_link(realdentry, nd);
-	if (IS_ERR(ret))
-		return ret;
-
 	if (realinode->i_op->put_link) {
-		struct ovl_link_data *data;
-
 		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
-		if (!data) {
-			realinode->i_op->put_link(realdentry, nd, ret);
+		if (!data)
 			return ERR_PTR(-ENOMEM);
-		}
 		data->realdentry = realdentry;
-		data->cookie = ret;
+	}
 
-		return data;
-	} else {
-		return NULL;
+	ret = realinode->i_op->follow_link(realdentry, cookie);
+	if (IS_ERR_OR_NULL(ret)) {
+		kfree(data);
+		return ret;
 	}
+
+	if (data)
+		data->cookie = *cookie;
+
+	*cookie = data;
+
+	return ret;
 }
 
-static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+static void ovl_put_link(struct inode *unused, void *c)
 {
 	struct inode *realinode;
 	struct ovl_link_data *data = c;
@@ -182,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
 		return;
 
 	realinode = data->realdentry->d_inode;
-	realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+	realinode->i_op->put_link(realinode, data->cookie);
 	kfree(data);
 }
 
@@ -336,37 +337,33 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
 	return true;
 }
 
-static int ovl_dentry_open(struct dentry *dentry, struct file *file,
-		    const struct cred *cred)
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
 {
 	int err;
 	struct path realpath;
 	enum ovl_path_type type;
-	bool want_write = false;
+
+	if (d_is_dir(dentry))
+		return d_backing_inode(dentry);
 
 	type = ovl_path_real(dentry, &realpath);
-	if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
-		want_write = true;
+	if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
 		err = ovl_want_write(dentry);
 		if (err)
-			goto out;
+			return ERR_PTR(err);
 
-		if (file->f_flags & O_TRUNC)
+		if (file_flags & O_TRUNC)
 			err = ovl_copy_up_last(dentry, NULL, true);
 		else
 			err = ovl_copy_up(dentry);
+		ovl_drop_write(dentry);
 		if (err)
-			goto out_drop_write;
+			return ERR_PTR(err);
 
 		ovl_path_upper(dentry, &realpath);
 	}
 
-	err = vfs_open(&realpath, file, cred);
-out_drop_write:
-	if (want_write)
-		ovl_drop_write(dentry);
-out:
-	return err;
+	return d_backing_inode(realpath.dentry);
 }
 
 static const struct inode_operations ovl_file_inode_operations = {
@@ -377,7 +374,6 @@ static const struct inode_operations ovl_file_inode_operations = {
 	.getxattr	= ovl_getxattr,
 	.listxattr	= ovl_listxattr,
 	.removexattr	= ovl_removexattr,
-	.dentry_open	= ovl_dentry_open,
 };
 
 static const struct inode_operations ovl_symlink_inode_operations = {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 17ac5afc9..ea5a40b06 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
 		     void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
 int ovl_removexattr(struct dentry *dentry, const char *name);
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
 
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
 			    struct ovl_entry *oe);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bf8537c7f..7466ff339 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -273,8 +273,56 @@ static void ovl_dentry_release(struct dentry *dentry)
 	}
 }
 
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	unsigned int i;
+	int ret = 1;
+
+	for (i = 0; i < oe->numlower; i++) {
+		struct dentry *d = oe->lowerstack[i].dentry;
+
+		if (d->d_flags & DCACHE_OP_REVALIDATE) {
+			ret = d->d_op->d_revalidate(d, flags);
+			if (ret < 0)
+				return ret;
+			if (!ret) {
+				if (!(flags & LOOKUP_RCU))
+					d_invalidate(d);
+				return -ESTALE;
+			}
+		}
+	}
+	return 1;
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	unsigned int i;
+	int ret = 1;
+
+	for (i = 0; i < oe->numlower; i++) {
+		struct dentry *d = oe->lowerstack[i].dentry;
+
+		if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
+			ret = d->d_op->d_weak_revalidate(d, flags);
+			if (ret <= 0)
+				break;
+		}
+	}
+	return ret;
+}
+
 static const struct dentry_operations ovl_dentry_operations = {
 	.d_release = ovl_dentry_release,
+	.d_select_inode = ovl_d_select_inode,
+};
+
+static const struct dentry_operations ovl_reval_dentry_operations = {
+	.d_release = ovl_dentry_release,
+	.d_revalidate = ovl_dentry_revalidate,
+	.d_weak_revalidate = ovl_dentry_weak_revalidate,
 };
 
 static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
@@ -288,6 +336,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
 	return oe;
 }
 
+static bool ovl_dentry_remote(struct dentry *dentry)
+{
+	return dentry->d_flags &
+		(DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+static bool ovl_dentry_weird(struct dentry *dentry)
+{
+	return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+				  DCACHE_MANAGE_TRANSIT |
+				  DCACHE_OP_HASH |
+				  DCACHE_OP_COMPARE);
+}
+
 static inline struct dentry *ovl_lookup_real(struct dentry *dir,
 					     struct qstr *name)
 {
@@ -303,6 +365,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
 	} else if (!dentry->d_inode) {
 		dput(dentry);
 		dentry = NULL;
+	} else if (ovl_dentry_weird(dentry)) {
+		dput(dentry);
+		/* Don't support traversing automounts and other weirdness */
+		dentry = ERR_PTR(-EREMOTE);
 	}
 	return dentry;
 }
@@ -350,6 +416,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			goto out;
 
 		if (this) {
+			if (unlikely(ovl_dentry_remote(this))) {
+				dput(this);
+				err = -EREMOTE;
+				goto out;
+			}
 			if (ovl_is_whiteout(this)) {
 				dput(this);
 				this = NULL;
@@ -694,25 +765,6 @@ static void ovl_unescape(char *s)
 	}
 }
 
-static bool ovl_is_allowed_fs_type(struct dentry *root)
-{
-	const struct dentry_operations *dop = root->d_op;
-
-	/*
-	 * We don't support:
-	 *  - automount filesystems
-	 *  - filesystems with revalidate (FIXME for lower layer)
-	 *  - filesystems with case insensitive names
-	 */
-	if (dop &&
-	    (dop->d_manage || dop->d_automount ||
-	     dop->d_revalidate || dop->d_weak_revalidate ||
-	     dop->d_compare || dop->d_hash)) {
-		return false;
-	}
-	return true;
-}
-
 static int ovl_mount_dir_noesc(const char *name, struct path *path)
 {
 	int err = -EINVAL;
@@ -727,7 +779,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
 		goto out;
 	}
 	err = -EINVAL;
-	if (!ovl_is_allowed_fs_type(path->dentry)) {
+	if (ovl_dentry_weird(path->dentry)) {
 		pr_err("overlayfs: filesystem on '%s' not supported\n", name);
 		goto out_put;
 	}
@@ -751,13 +803,21 @@ static int ovl_mount_dir(const char *name, struct path *path)
 	if (tmp) {
 		ovl_unescape(tmp);
 		err = ovl_mount_dir_noesc(tmp, path);
+
+		if (!err)
+			if (ovl_dentry_remote(path->dentry)) {
+				pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
+				       tmp);
+				path_put(path);
+				err = -EINVAL;
+			}
 		kfree(tmp);
 	}
 	return err;
 }
 
 static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
-			 int *stack_depth)
+			 int *stack_depth, bool *remote)
 {
 	int err;
 	struct kstatfs statfs;
@@ -774,6 +834,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
 	*namelen = max(*namelen, statfs.f_namelen);
 	*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
 
+	if (ovl_dentry_remote(path->dentry))
+		*remote = true;
+
 	return 0;
 
 out_put:
@@ -827,6 +890,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned int numlower;
 	unsigned int stacklen = 0;
 	unsigned int i;
+	bool remote = false;
 	int err;
 
 	err = -ENOMEM;
@@ -900,7 +964,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	lower = lowertmp;
 	for (numlower = 0; numlower < stacklen; numlower++) {
 		err = ovl_lower_dir(lower, &stack[numlower],
-				    &ufs->lower_namelen, &sb->s_stack_depth);
+				    &ufs->lower_namelen, &sb->s_stack_depth,
+				    &remote);
 		if (err)
 			goto out_put_lowerpath;
 
@@ -958,7 +1023,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	if (!ufs->upper_mnt)
 		sb->s_flags |= MS_RDONLY;
 
-	sb->s_d_op = &ovl_dentry_operations;
+	if (remote)
+		sb->s_d_op = &ovl_reval_dentry_operations;
+	else
+		sb->s_d_op = &ovl_dentry_operations;
 
 	err = -ENOMEM;
 	oe = ovl_alloc_entry(numlower);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 84bb65b83..4fb17ded7 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode,
 		struct posix_acl **default_acl, struct posix_acl **acl)
 {
 	struct posix_acl *p;
+	struct posix_acl *clone;
 	int ret;
 
+	*acl = NULL;
+	*default_acl = NULL;
+
 	if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
-		goto no_acl;
+		return 0;
 
 	p = get_acl(dir, ACL_TYPE_DEFAULT);
-	if (IS_ERR(p)) {
-		if (p == ERR_PTR(-EOPNOTSUPP))
-			goto apply_umask;
-		return PTR_ERR(p);
+	if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+		*mode &= ~current_umask();
+		return 0;
 	}
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 
-	if (!p)
-		goto apply_umask;
-
-	*acl = posix_acl_clone(p, GFP_NOFS);
-	if (!*acl)
+	clone = posix_acl_clone(p, GFP_NOFS);
+	if (!clone)
 		goto no_mem;
 
-	ret = posix_acl_create_masq(*acl, mode);
+	ret = posix_acl_create_masq(clone, mode);
 	if (ret < 0)
 		goto no_mem_clone;
 
-	if (ret == 0) {
-		posix_acl_release(*acl);
-		*acl = NULL;
-	}
+	if (ret == 0)
+		posix_acl_release(clone);
+	else
+		*acl = clone;
 
-	if (!S_ISDIR(*mode)) {
+	if (!S_ISDIR(*mode))
 		posix_acl_release(p);
-		*default_acl = NULL;
-	} else {
+	else
 		*default_acl = p;
-	}
-	return 0;
 
-apply_umask:
-	*mode &= ~current_umask();
-no_acl:
-	*default_acl = NULL;
-	*acl = NULL;
 	return 0;
 
 no_mem_clone:
-	posix_acl_release(*acl);
+	posix_acl_release(clone);
 no_mem:
 	posix_acl_release(p);
 	return -ENOMEM;
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 2183fcf41..1ade1206b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -71,3 +71,13 @@ config PROC_PAGE_MONITOR
 	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
 	  /proc/kpagecount, and /proc/kpageflags. Disabling these
           interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_CHILDREN
+	bool "Include /proc/<pid>/task/<tid>/children file"
+	default n
+	help
+	  Provides a fast way to retrieve first level children pids of a task. See
+	  <file:Documentation/filesystems/proc.txt> for more information.
+
+	  Say Y if you are running any user-space software which takes benefit from
+	  this interface. For example, rkt is such a piece of software.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd02a9ebf..ce065cf31 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk)
 {
 	unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
 
+	/*
+	 * Parked tasks do not run; they sit in __kthread_parkme().
+	 * Without this check, we would report them as running, which is
+	 * clearly wrong, so we report them as sleeping instead.
+	 */
+	if (tsk->state == TASK_PARKED)
+		state = TASK_INTERRUPTIBLE;
+
 	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
 
 	return task_state_array[fls(state)];
@@ -569,7 +577,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
 static struct pid *
 get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
 {
@@ -692,4 +700,4 @@ const struct file_operations proc_tid_children_operations = {
 	.llseek  = seq_lseek,
 	.release = children_seq_release,
 };
-#endif /* CONFIG_CHECKPOINT_RESTORE */
+#endif /* CONFIG_PROC_CHILDREN */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c439a9dcc..aa50d1ac2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -196,18 +196,210 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
-			    struct pid *pid, struct task_struct *task)
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+				     size_t _count, loff_t *pos)
 {
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	char *page;
+	unsigned long count = _count;
+	unsigned long arg_start, arg_end, env_start, env_end;
+	unsigned long len1, len2, len;
+	unsigned long p;
+	char c;
+	ssize_t rv;
+
+	BUG_ON(*pos < 0);
+
+	tsk = get_proc_task(file_inode(file));
+	if (!tsk)
+		return -ESRCH;
+	mm = get_task_mm(tsk);
+	put_task_struct(tsk);
+	if (!mm)
+		return 0;
+	/* Check if process spawned far enough to have cmdline. */
+	if (!mm->env_end) {
+		rv = 0;
+		goto out_mmput;
+	}
+
+	page = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!page) {
+		rv = -ENOMEM;
+		goto out_mmput;
+	}
+
+	down_read(&mm->mmap_sem);
+	arg_start = mm->arg_start;
+	arg_end = mm->arg_end;
+	env_start = mm->env_start;
+	env_end = mm->env_end;
+	up_read(&mm->mmap_sem);
+
+	BUG_ON(arg_start > arg_end);
+	BUG_ON(env_start > env_end);
+
+	len1 = arg_end - arg_start;
+	len2 = env_end - env_start;
+
+	/* Empty ARGV. */
+	if (len1 == 0) {
+		rv = 0;
+		goto out_free_page;
+	}
 	/*
-	 * Rely on struct seq_operations::show() being called once
-	 * per internal buffer allocation. See single_open(), traverse().
+	 * Inherently racy -- command line shares address space
+	 * with code and data.
 	 */
-	BUG_ON(m->size < PAGE_SIZE);
-	m->count += get_cmdline(task, m->buf, PAGE_SIZE);
-	return 0;
+	rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+	if (rv <= 0)
+		goto out_free_page;
+
+	rv = 0;
+
+	if (c == '\0') {
+		/* Command line (set of strings) occupies whole ARGV. */
+		if (len1 <= *pos)
+			goto out_free_page;
+
+		p = arg_start + *pos;
+		len = len1 - *pos;
+		while (count > 0 && len > 0) {
+			unsigned int _count;
+			int nr_read;
+
+			_count = min3(count, len, PAGE_SIZE);
+			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			if (nr_read < 0)
+				rv = nr_read;
+			if (nr_read <= 0)
+				goto out_free_page;
+
+			if (copy_to_user(buf, page, nr_read)) {
+				rv = -EFAULT;
+				goto out_free_page;
+			}
+
+			p	+= nr_read;
+			len	-= nr_read;
+			buf	+= nr_read;
+			count	-= nr_read;
+			rv	+= nr_read;
+		}
+	} else {
+		/*
+		 * Command line (1 string) occupies ARGV and maybe
+		 * extends into ENVP.
+		 */
+		if (len1 + len2 <= *pos)
+			goto skip_argv_envp;
+		if (len1 <= *pos)
+			goto skip_argv;
+
+		p = arg_start + *pos;
+		len = len1 - *pos;
+		while (count > 0 && len > 0) {
+			unsigned int _count, l;
+			int nr_read;
+			bool final;
+
+			_count = min3(count, len, PAGE_SIZE);
+			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			if (nr_read < 0)
+				rv = nr_read;
+			if (nr_read <= 0)
+				goto out_free_page;
+
+			/*
+			 * Command line can be shorter than whole ARGV
+			 * even if last "marker" byte says it is not.
+			 */
+			final = false;
+			l = strnlen(page, nr_read);
+			if (l < nr_read) {
+				nr_read = l;
+				final = true;
+			}
+
+			if (copy_to_user(buf, page, nr_read)) {
+				rv = -EFAULT;
+				goto out_free_page;
+			}
+
+			p	+= nr_read;
+			len	-= nr_read;
+			buf	+= nr_read;
+			count	-= nr_read;
+			rv	+= nr_read;
+
+			if (final)
+				goto out_free_page;
+		}
+skip_argv:
+		/*
+		 * Command line (1 string) occupies ARGV and
+		 * extends into ENVP.
+		 */
+		if (len1 <= *pos) {
+			p = env_start + *pos - len1;
+			len = len1 + len2 - *pos;
+		} else {
+			p = env_start;
+			len = len2;
+		}
+		while (count > 0 && len > 0) {
+			unsigned int _count, l;
+			int nr_read;
+			bool final;
+
+			_count = min3(count, len, PAGE_SIZE);
+			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			if (nr_read < 0)
+				rv = nr_read;
+			if (nr_read <= 0)
+				goto out_free_page;
+
+			/* Find EOS. */
+			final = false;
+			l = strnlen(page, nr_read);
+			if (l < nr_read) {
+				nr_read = l;
+				final = true;
+			}
+
+			if (copy_to_user(buf, page, nr_read)) {
+				rv = -EFAULT;
+				goto out_free_page;
+			}
+
+			p	+= nr_read;
+			len	-= nr_read;
+			buf	+= nr_read;
+			count	-= nr_read;
+			rv	+= nr_read;
+
+			if (final)
+				goto out_free_page;
+		}
+skip_argv_envp:
+		;
+	}
+
+out_free_page:
+	free_page((unsigned long)page);
+out_mmput:
+	mmput(mm);
+	if (rv > 0)
+		*pos += rv;
+	return rv;
 }
 
+static const struct file_operations proc_pid_cmdline_ops = {
+	.read	= proc_pid_cmdline_read,
+	.llseek	= generic_file_llseek,
+};
+
 static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
 			 struct pid *pid, struct task_struct *task)
 {
@@ -304,15 +496,18 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
 /*
  * Provides /proc/PID/schedstat
  */
 static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 			      struct pid *pid, struct task_struct *task)
 {
-	seq_printf(m, "%llu %llu %lu\n",
-		   (unsigned long long)tsk_seruntime(task),
+	if (unlikely(!sched_info_on()))
+		seq_printf(m, "0 0 0\n");
+	else
+		seq_printf(m, "%llu %llu %lu\n",
+		   (unsigned long long)task->se.sum_exec_runtime,
 		   (unsigned long long)task->sched_info.run_delay,
 		   task->sched_info.pcount);
 
@@ -1380,7 +1575,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 		return -ENOENT;
 }
 
-static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct path path;
@@ -1394,7 +1589,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (error)
 		goto out;
 
-	nd_jump_link(nd, &path);
+	nd_jump_link(&path);
 	return NULL;
 out:
 	return ERR_PTR(error);
@@ -1744,7 +1939,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
 	down_read(&mm->mmap_sem);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma_pr_or_file(vma)->f_path;
+		*path = vma->vm_file->f_path;
 		path_get(path);
 		rc = 0;
 	}
@@ -2572,7 +2767,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	ONE("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
-	ONE("cmdline",    S_IRUGO, proc_pid_cmdline),
+	REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
 	ONE("statm",      S_IRUGO, proc_pid_statm),
 	REG("maps",       S_IRUGO, proc_pid_maps_operations),
@@ -2600,7 +2795,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_STACKTRACE
 	ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
 	ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -2918,11 +3113,11 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	ONE("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
-	ONE("cmdline",   S_IRUGO, proc_pid_cmdline),
+	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
 	REG("maps",      S_IRUGO, proc_tid_maps_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
 	REG("children",  S_IRUGO, proc_tid_children_operations),
 #endif
 #ifdef CONFIG_NUMA
@@ -2948,7 +3143,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_STACKTRACE
 	ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
 	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e3eb55246..bd95b9fde 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,7 +23,6 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/magic.h>
-#include <linux/namei.h>
 
 #include <asm/uaccess.h>
 
@@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
 
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct proc_dir_entry *pde = PDE(d_inode(dentry));
 	if (unlikely(!use_pde(pde)))
 		return ERR_PTR(-EINVAL);
-	nd_set_link(nd, pde->data);
-	return pde;
+	*cookie = pde;
+	return pde->data;
 }
 
-static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+static void proc_put_link(struct inode *unused, void *p)
 {
 	unuse_pde(p);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426..92e6726f6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size, 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size;
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index e512642db..f6e8354b8 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = {
 	&mntns_operations,
 };
 
-static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
@@ -45,7 +45,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (ptrace_may_access(task, PTRACE_MODE_READ)) {
 		error = ns_get_path(&ns_path, task, ns_ops);
 		if (!error)
-			nd_jump_link(nd, &ns_path);
+			nd_jump_link(&ns_path);
 	}
 	put_task_struct(task);
 	return error;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 139718132..f8595e8b5 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -45,10 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode;
-
-		file = vmr_pr_or_file(region);
-		inode = file_inode(file);
+		struct inode *inode = file_inode(region->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
@@ -67,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "");
+		seq_file_path(m, file, "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 6195b4a7c..113b8d061 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,5 +1,4 @@
 #include <linux/sched.h>
-#include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
 #include "internal.h"
@@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char *name = ERR_PTR(-ENOENT);
-	if (tgid) {
-		/* 11 for max length of signed int in decimal + NULL term */
-		name = kmalloc(12, GFP_KERNEL);
-		if (!name)
-			name = ERR_PTR(-ENOMEM);
-		else
-			sprintf(name, "%d", tgid);
-	}
-	nd_set_link(nd, name);
-	return NULL;
+	char *name;
+
+	if (!tgid)
+		return ERR_PTR(-ENOENT);
+	/* 11 for max length of signed int in decimal + NULL term */
+	name = kmalloc(12, GFP_KERNEL);
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+	sprintf(name, "%d", tgid);
+	return *cookie = name;
 }
 
 static const struct inode_operations proc_self_inode_operations = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9afa35d04..ca1e09188 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -279,10 +279,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	const char *name = NULL;
 
 	if (file) {
-		struct inode *inode;
-
-		file = vma_pr_or_file(vma);
-		inode = file_inode(file);
+		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -313,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	 */
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "\n");
+		seq_file_path(m, file, "\n");
 		goto done;
 	}
 
@@ -1482,7 +1479,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
-	struct file *file = vma_pr_or_file(vma);
+	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {
 		.hugetlb_entry = gather_hugetlb_stats,
@@ -1512,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_path(m, &file->f_path, "\n\t= ");
+		seq_file_path(m, file, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_puts(m, " heap");
 	} else {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 174020784..e0d64c92e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -160,10 +160,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	file = vma->vm_file;
 
 	if (file) {
-		struct inode *inode;
-
-		file = vma_pr_or_file(vma);
-		inode = file_inode(file);
+		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -183,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "");
+		seq_file_path(m, file, "");
 	} else if (mm) {
 		pid_t tid = pid_of_stack(priv, vma, is_pid);
 
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a8371993b..947b0f4fd 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,5 +1,4 @@
 #include <linux/sched.h>
-#include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
 #include "internal.h"
@@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	pid_t pid = task_pid_nr_ns(current, ns);
-	char *name = ERR_PTR(-ENOENT);
-	if (pid) {
-		name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
-		if (!name)
-			name = ERR_PTR(-ENOMEM);
-		else
-			sprintf(name, "%d/task/%d", tgid, pid);
-	}
-	nd_set_link(nd, name);
-	return NULL;
+	char *name;
+
+	if (!pid)
+		return ERR_PTR(-ENOENT);
+	name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+	sprintf(name, "%d/task/%d", tgid, pid);
+	return *cookie = name;
 }
 
 static const struct inode_operations proc_thread_self_inode_operations = {
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8db932da4..8ebd9a334 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -17,7 +17,8 @@
 
 static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
-	struct proc_mounts *p = proc_mounts(file->private_data);
+	struct seq_file *m = file->private_data;
+	struct proc_mounts *p = m->private;
 	struct mnt_namespace *ns = p->ns;
 	unsigned res = POLLIN | POLLRDNORM;
 	int event;
@@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &p->ns->poll, wait);
 
 	event = ACCESS_ONCE(ns->event);
-	if (p->m.poll_event != event) {
-		p->m.poll_event = event;
+	if (m->poll_event != event) {
+		m->poll_event = event;
 		res |= POLLERR | POLLPRI;
 	}
 
@@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb)
 
 static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
 {
-	struct proc_mounts *p = proc_mounts(m);
+	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
 	int err = 0;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -126,7 +127,7 @@ out:
 
 static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 {
-	struct proc_mounts *p = proc_mounts(m);
+	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
 	struct super_block *sb = mnt->mnt_sb;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -186,7 +187,7 @@ out:
 
 static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
 {
-	struct proc_mounts *p = proc_mounts(m);
+	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
 	struct super_block *sb = mnt_path.dentry->d_sb;
@@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	struct mnt_namespace *ns = NULL;
 	struct path root;
 	struct proc_mounts *p;
+	struct seq_file *m;
 	int ret = -EINVAL;
 
 	if (!task)
@@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	task_unlock(task);
 	put_task_struct(task);
 
-	ret = -ENOMEM;
-	p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
-	if (!p)
+	ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts));
+	if (ret)
 		goto err_put_path;
 
-	file->private_data = &p->m;
-	ret = seq_open(file, &mounts_op);
-	if (ret)
-		goto err_free;
+	m = file->private_data;
+	m->poll_event = ns->event;
 
+	p = m->private;
 	p->ns = ns;
 	p->root = root;
-	p->m.poll_event = ns->event;
 	p->show = show;
 	p->cached_event = ~0ULL;
 
 	return 0;
 
- err_free:
-	kfree(p);
  err_put_path:
 	path_put(&root);
  err_put_ns:
@@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 
 static int mounts_release(struct inode *inode, struct file *file)
 {
-	struct proc_mounts *p = proc_mounts(file->private_data);
+	struct seq_file *m = file->private_data;
+	struct proc_mounts *p = m->private;
 	path_put(&p->root);
 	put_mnt_ns(p->ns);
-	return seq_release(inode, file);
+	return seq_release_private(inode, file);
 }
 
 static int mounts_open(struct inode *inode, struct file *file)
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index c4c9a10c5..791743dee 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -299,7 +299,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		bool compressed;
 		size_t total_len;
 
-		if (big_oops_buf) {
+		if (big_oops_buf && is_locked) {
 			dst = big_oops_buf;
 			hsize = sprintf(dst, "%s#%d Part%u\n", why,
 							oopscount, part);
@@ -456,6 +456,12 @@ int pstore_register(struct pstore_info *psi)
 		add_timer(&pstore_timer);
 	}
 
+	/*
+	 * Update the module parameter backend, so it is visible
+	 * through /sys/module/pstore/parameters/backend
+	 */
+	backend = psi->name;
+
 	pr_info("Registered %s as persistent store backend\n", psi->name);
 
 	return 0;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 44a549bee..6c26c4daa 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -186,12 +186,34 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 	ssize_t size;
 	ssize_t ecc_notice_size;
 	struct ramoops_context *cxt = psi->data;
-	struct persistent_ram_zone *prz;
-	int header_length;
+	struct persistent_ram_zone *prz = NULL;
+	int header_length = 0;
+
+	/* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
+	 * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
+	 * valid time stamps, so it is initialized to zero.
+	 */
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	*compressed = false;
+
+	/* Find the next valid persistent_ram_zone for DMESG */
+	while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
+		prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
+					   cxt->max_dump_cnt, id, type,
+					   PSTORE_TYPE_DMESG, 1);
+		if (!prz_ok(prz))
+			continue;
+		header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
+						      time, compressed);
+		/* Clear and skip this DMESG record if it has no valid header */
+		if (!header_length) {
+			persistent_ram_free_old(prz);
+			persistent_ram_zap(prz);
+			prz = NULL;
+		}
+	}
 
-	prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
-				   cxt->max_dump_cnt, id, type,
-				   PSTORE_TYPE_DMESG, 1);
 	if (!prz_ok(prz))
 		prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
 					   1, id, type, PSTORE_TYPE_CONSOLE, 0);
@@ -204,13 +226,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 	if (!prz_ok(prz))
 		return 0;
 
-	if (!persistent_ram_old(prz))
-		return 0;
-
-	size = persistent_ram_old_size(prz);
-	header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time,
-			compressed);
-	size -= header_length;
+	size = persistent_ram_old_size(prz) - header_length;
 
 	/* ECC correction notice */
 	ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
@@ -394,18 +410,16 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
 	}
 
 	for (i = 0; i < cxt->max_dump_cnt; i++) {
-		size_t sz = cxt->record_size;
-
-		cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
+		cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
 						  &cxt->ecc_info,
 						  cxt->memtype);
 		if (IS_ERR(cxt->przs[i])) {
 			err = PTR_ERR(cxt->przs[i]);
 			dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
-				sz, (unsigned long long)*paddr, err);
+				cxt->record_size, (unsigned long long)*paddr, err);
 			goto fail_prz;
 		}
-		*paddr += sz;
+		*paddr += cxt->record_size;
 	}
 
 	return 0;
@@ -608,7 +622,7 @@ static void ramoops_register_dummy(void)
 
 	dummy_data->mem_size = mem_size;
 	dummy_data->mem_address = mem_address;
-	dummy_data->mem_type = 0;
+	dummy_data->mem_type = mem_type;
 	dummy_data->record_size = record_size;
 	dummy_data->console_size = ramoops_console_size;
 	dummy_data->ftrace_size = ramoops_ftrace_size;
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8d64bb536..e1f37278c 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
 	return page;
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static unsigned last_entry(struct inode *inode, unsigned long page_nr)
 {
 	unsigned long last_byte = inode->i_size;
diff --git a/fs/read_write.c b/fs/read_write.c
index 8ace6ec15..819ef3faf 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -494,30 +494,6 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 }
 EXPORT_SYMBOL(__vfs_write);
 
-vfs_readf_t vfs_readf(struct file *file)
-{
-	const struct file_operations *fop = file->f_op;
-
-	if (fop->read)
-		return fop->read;
-	if (fop->read_iter)
-		return new_sync_read;
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(vfs_readf);
-
-vfs_writef_t vfs_writef(struct file *file)
-{
-	const struct file_operations *fop = file->f_op;
-
-	if (fop->write)
-		return fop->write;
-	if (fop->write_iter)
-		return new_sync_write;
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(vfs_writef);
-
 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 {
 	mm_segment_t old_fs;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0111ad046..0e4cf7281 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
 #include "xattr.h"
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/quotaops.h>
@@ -588,8 +589,7 @@ static struct kmem_cache *reiserfs_inode_cachep;
 static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 {
 	struct reiserfs_inode_info *ei;
-	ei = (struct reiserfs_inode_info *)
-	    kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+	ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	atomic_set(&ei->openers, 0);
diff --git a/fs/select.c b/fs/select.c
index f684c750e..015547330 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	 * doesn't imply write barrier and the users expect write
 	 * barrier semantics on wakeup functions.  The following
 	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-	 * and is paired with set_mb() in poll_schedule_timeout.
+	 * and is paired with smp_store_mb() in poll_schedule_timeout.
 	 */
 	smp_wmb();
 	pwq->triggered = 1;
@@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 	/*
 	 * Prepare for the next iteration.
 	 *
-	 * The following set_mb() serves two purposes.  First, it's
+	 * The following smp_store_mb() serves two purposes.  First, it's
 	 * the counterpart rmb of the wmb in pollwake() such that data
 	 * written before wake up is always visible after wake up.
 	 * Second, the full barrier guarantees that triggered clearing
@@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 	 * this problem doesn't exist for the first iteration as
 	 * add_wait_queue() has full barrier semantics.
 	 */
-	set_mb(pwq->triggered, 0);
+	smp_store_mb(pwq->triggered, 0);
 
 	return rc;
 }
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155..ce9e39fd5 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,18 +48,21 @@ static void *seq_buf_alloc(unsigned long size)
  *	ERR_PTR(error).  In the end of sequence they return %NULL. ->show()
  *	returns 0 in case of success and negative number in case of error.
  *	Returning SEQ_SKIP means "discard this element and move on".
+ *	Note: seq_open() will allocate a struct seq_file and store its
+ *	pointer in @file->private_data. This pointer should not be modified.
  */
 int seq_open(struct file *file, const struct seq_operations *op)
 {
-	struct seq_file *p = file->private_data;
+	struct seq_file *p;
+
+	WARN_ON(file->private_data);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	file->private_data = p;
 
-	if (!p) {
-		p = kmalloc(sizeof(*p), GFP_KERNEL);
-		if (!p)
-			return -ENOMEM;
-		file->private_data = p;
-	}
-	memset(p, 0, sizeof(*p));
 	mutex_init(&p->lock);
 	p->op = op;
 #ifdef CONFIG_USER_NS
@@ -487,6 +490,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc)
 }
 EXPORT_SYMBOL(seq_path);
 
+/**
+ * seq_file_path - seq_file interface to print a pathname of a file
+ * @m: the seq_file handle
+ * @file: the struct file to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path to the file.
+ */
+int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
+{
+	return seq_path(m, &file->f_path, esc);
+}
+EXPORT_SYMBOL(seq_file_path);
+
 /*
  * Same as seq_path, but relative to supplied root.
  */
@@ -538,6 +555,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
 
 	return res;
 }
+EXPORT_SYMBOL(seq_dentry);
 
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
diff --git a/fs/splice.c b/fs/splice.c
index bfb3324fc..5fc1e50a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -261,6 +261,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(splice_to_pipe);
 
 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 {
@@ -359,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
-						GFP_KERNEL);
+					GFP_KERNEL & mapping_gfp_mask(mapping));
 			if (unlikely(error)) {
 				page_cache_release(page);
 				if (error == -EEXIST)
@@ -1101,8 +1102,8 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 /*
  * Attempt to initiate a splice from pipe to file.
  */
-long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-		    loff_t *ppos, size_t len, unsigned int flags)
+static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			   loff_t *ppos, size_t len, unsigned int flags)
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
@@ -1114,14 +1115,13 @@ long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 
 	return splice_write(pipe, out, ppos, len, flags);
 }
-EXPORT_SYMBOL(do_splice_from);
 
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-long do_splice_to(struct file *in, loff_t *ppos,
-		  struct pipe_inode_info *pipe, size_t len,
-		  unsigned int flags)
+static long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags)
 {
 	ssize_t (*splice_read)(struct file *, loff_t *,
 			       struct pipe_inode_info *, size_t, unsigned int);
@@ -1141,7 +1141,6 @@ long do_splice_to(struct file *in, loff_t *ppos,
 
 	return splice_read(in, ppos, pipe, len, flags);
 }
-EXPORT_SYMBOL(do_splice_to);
 
 /**
  * splice_direct_to_actor - splices data directly between two non-pipes
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 73588e770..d09fcd6fb 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -49,6 +49,6 @@ struct squashfs_inode_info {
 
 static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
 {
-	return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+	return container_of(inode, struct squashfs_inode_info, vfs_inode);
 }
 #endif
diff --git a/fs/super.c b/fs/super.c
index 6fb221e8d..b61372354 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,7 +36,7 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
+static LIST_HEAD(super_blocks);
 static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
@@ -842,7 +842,7 @@ int get_anon_bdev(dev_t *p)
 	else if (error)
 		return -EAGAIN;
 
-	if (dev == (1 << MINORBITS)) {
+	if (dev >= (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
 		ida_remove(&unnamed_dev_ida, dev);
 		if (unnamed_dev_start > dev)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 7c2867b44..6c95628ea 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -90,7 +90,7 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
 		return 0;
 
 	if (size) {
-		if (pos > size)
+		if (pos >= size)
 			return 0;
 		if (pos + count > size)
 			count = size - pos;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index b400c0437..39a019936 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -135,7 +135,7 @@ static int internal_create_group(struct kobject *kobj, int update,
  * This function creates a group for the first time.  It will explicitly
  * warn and error if any of the attribute files being created already exist.
  *
- * Returns 0 on success or error.
+ * Returns 0 on success or error code on failure.
  */
 int sysfs_create_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
@@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_group);
  * It will explicitly warn and error if any of the attribute files being
  * created already exist.
  *
- * Returns 0 on success or error code from sysfs_create_group on error.
+ * Returns 0 on success or error code from sysfs_create_group on failure.
  */
 int sysfs_create_groups(struct kobject *kobj,
 			const struct attribute_group **groups)
@@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_groups);
  * The primary use for this function is to call it after making a change
  * that affects group visibility.
  *
- * Returns 0 on success or error.
+ * Returns 0 on success or error code on failure.
  */
 int sysfs_update_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile
index 3591f9d7a..7a75e70a4 100644
--- a/fs/sysv/Makefile
+++ b/fs/sysv/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_SYSV_FS) += sysv.o
 
 sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
-	     namei.o super.o symlink.o
+	     namei.o super.o
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 8f3555f00..63c1bcb22 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
 	struct address_space *mapping = page->mapping;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 88956309c..590ad9206 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -166,8 +166,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
 			inode->i_op = &sysv_symlink_inode_operations;
 			inode->i_mapping->a_ops = &sysv_aops;
 		} else {
-			inode->i_op = &sysv_fast_symlink_inode_operations;
-			nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+			inode->i_op = &simple_symlink_inode_operations;
+			inode->i_link = (char *)SYSV_I(inode)->i_data;
+			nd_terminate_link(inode->i_link, inode->i_size,
 				sizeof(SYSV_I(inode)->i_data) - 1);
 		}
 	} else
diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c
deleted file mode 100644
index d3fa0d703..000000000
--- a/fs/sysv/symlink.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  linux/fs/sysv/symlink.c
- *
- *  Handling of System V filesystem fast symlinks extensions.
- *  Aug 2001, Christoph Hellwig (hch@infradead.org)
- */
-
-#include "sysv.h"
-#include <linux/namei.h>
-
-static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data);
-	return NULL;
-}
-
-const struct inode_operations sysv_fast_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= sysv_follow_link,
-};
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 69d488986..6c212288a 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -73,7 +73,7 @@ struct sysv_inode_info {
 
 static inline struct sysv_inode_info *SYSV_I(struct inode *inode)
 {
-	return list_entry(inode, struct sysv_inode_info, vfs_inode);
+	return container_of(inode, struct sysv_inode_info, vfs_inode);
 }
 
 static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb)
@@ -161,7 +161,6 @@ extern ino_t sysv_inode_by_name(struct dentry *);
 
 extern const struct inode_operations sysv_file_inode_operations;
 extern const struct inode_operations sysv_dir_inode_operations;
-extern const struct inode_operations sysv_fast_symlink_inode_operations;
 extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
 extern const struct address_space_operations sysv_aops;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index a43df11a1..cbc8d5d27 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare
 	return dentry;
 }
 
-static inline int tracefs_positive(struct dentry *dentry)
-{
-	return dentry->d_inode && !d_unhashed(dentry);
-}
-
 static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
-	if (tracefs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		if (dentry->d_inode) {
 			dget(dentry);
 			switch (dentry->d_inode->i_mode & S_IFMT) {
@@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 	 */
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!tracefs_positive(child))
+		if (!simple_positive(child))
 			continue;
 
 		/* perhaps simple_empty(child) makes more sense */
@@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 		 * from d_subdirs. When releasing the parent->d_lock we can
 		 * no longer trust that the next pointer is valid.
 		 * Restart the loop. We'll skip this one with the
-		 * tracefs_positive() check.
+		 * simple_positive() check.
 		 */
 		goto loop;
 	}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 27060fc85..5c27c66c2 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -889,6 +889,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 
 	memcpy(ui->data, symname, len);
 	((char *)ui->data)[len] = '\0';
+	inode->i_link = ui->data;
 	/*
 	 * The terminating zero byte is not written to the flash media and it
 	 * is put just to make later in-memory string processing simpler. Thus,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 35efc103c..a3dfe2ae7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -51,7 +51,6 @@
 
 #include "ubifs.h"
 #include <linux/mount.h>
-#include <linux/namei.h>
 #include <linux/slab.h>
 
 static int read_block(struct inode *inode, void *addr, unsigned int block,
@@ -1300,14 +1299,6 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
 	ClearPageChecked(page);
 }
 
-static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct ubifs_inode *ui = ubifs_inode(d_inode(dentry));
-
-	nd_set_link(nd, ui->data);
-	return NULL;
-}
-
 int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -1570,7 +1561,7 @@ const struct inode_operations ubifs_file_inode_operations = {
 
 const struct inode_operations ubifs_symlink_inode_operations = {
 	.readlink    = generic_readlink,
-	.follow_link = ubifs_follow_link,
+	.follow_link = simple_follow_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 	.setxattr    = ubifs_setxattr,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 75e6f04bb..9547a2786 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -195,6 +195,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 		}
 		memcpy(ui->data, ino->data, ui->data_len);
 		((char *)ui->data)[ui->data_len] = '\0';
+		inode->i_link = ui->data;
 		break;
 	case S_IFBLK:
 	case S_IFCHR:
@@ -2245,7 +2246,9 @@ static int __init ubifs_init(void)
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
-	register_shrinker(&ubifs_shrinker_info);
+	err = register_shrinker(&ubifs_shrinker_info);
+	if (err)
+		goto out_slab;
 
 	err = ubifs_compressors_init();
 	if (err)
@@ -2269,6 +2272,7 @@ out_compr:
 	ubifs_compressors_exit();
 out_shrinker:
 	unregister_shrinker(&ubifs_shrinker_info);
+out_slab:
 	kmem_cache_destroy(ubifs_inode_slab);
 	return err;
 }
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 541a12b57..541d9c650 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -168,7 +168,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
 		}
 
 		flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
-		if (!flen)
+		if (flen < 0)
 			continue;
 
 		tloc = lelb_to_cpu(cfi.icb.extLocation);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7a95b8fed..bddf3d071 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -152,8 +152,6 @@ out:
 	mutex_unlock(&inode->i_mutex);
 
 	if (retval > 0) {
-		ssize_t err;
-
 		mark_inode_dirty(inode);
 		err = generic_write_sync(file, iocb->ki_pos - retval, retval);
 		if (err < 0)
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6afac3d56..8d0b3ade0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1652,17 +1652,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
 		use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
-		use->descTag.tagLocation =
-				cpu_to_le32(iinfo->i_location.logicalBlockNum);
-		crclen = sizeof(struct unallocSpaceEntry) +
-				iinfo->i_lenAlloc - sizeof(struct tag);
-		use->descTag.descCRCLength = cpu_to_le16(crclen);
-		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-							   sizeof(struct tag),
-							   crclen));
-		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
+		crclen = sizeof(struct unallocSpaceEntry);
 
-		goto out;
+		goto finish;
 	}
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1782,6 +1774,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
 		crclen = sizeof(struct extendedFileEntry);
 	}
+
+finish:
 	if (iinfo->i_strat4096) {
 		fe->icbTag.strategyType = cpu_to_le16(4096);
 		fe->icbTag.strategyParameter = cpu_to_le16(1);
@@ -1791,7 +1785,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		fe->icbTag.numEntries = cpu_to_le16(1);
 	}
 
-	if (S_ISDIR(inode->i_mode))
+	if (iinfo->i_use)
+		fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE;
+	else if (S_ISDIR(inode->i_mode))
 		fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY;
 	else if (S_ISREG(inode->i_mode))
 		fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR;
@@ -1828,7 +1824,6 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 						  crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
-out:
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 5c03f0dfb..c97b5a8d1 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -138,6 +138,25 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 	return 0;
 }
 
+/**
+ * udf_find_entry - find entry in given directory.
+ *
+ * @dir:	directory inode to search in
+ * @child:	qstr of the name
+ * @fibh:	buffer head / inode with file identifier descriptor we found
+ * @cfi:	found file identifier descriptor with given name
+ *
+ * This function searches in the directory @dir for a file name @child. When
+ * found, @fibh points to the buffer head(s) (bh is NULL for in ICB
+ * directories) containing the file identifier descriptor (FID). In that case
+ * the function returns pointer to the FID in the buffer or inode - but note
+ * that FID may be split among two buffers (blocks) so accessing it via that
+ * pointer isn't easily possible. This pointer can be used only as an iterator
+ * for other directory manipulation functions. For inspection of the FID @cfi
+ * can be used - the found FID is copied there.
+ *
+ * Returns pointer to FID, NULL when nothing found, or error code.
+ */
 static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 					    const struct qstr *child,
 					    struct udf_fileident_bh *fibh,
@@ -167,8 +186,11 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1);
 	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos,
-		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
+		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
+			fi = ERR_PTR(-EIO);
 			goto out_err;
+		}
+
 		block = udf_get_lb_pblock(sb, &eloc, offset);
 		if ((++offset << sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -179,19 +201,25 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 			offset = 0;
 
 		fibh->sbh = fibh->ebh = udf_tread(sb, block);
-		if (!fibh->sbh)
+		if (!fibh->sbh) {
+			fi = ERR_PTR(-EIO);
 			goto out_err;
+		}
 	}
 
 	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
-	if (!fname)
+	if (!fname) {
+		fi = ERR_PTR(-ENOMEM);
 		goto out_err;
+	}
 
 	while (f_pos < size) {
 		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
 					&elen, &offset);
-		if (!fi)
+		if (!fi) {
+			fi = ERR_PTR(-EIO);
 			goto out_err;
+		}
 
 		liu = le16_to_cpu(cfi->lengthOfImpUse);
 		lfi = cfi->lengthFileIdent;
@@ -234,12 +262,17 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 			continue;
 
 		flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
-		if (flen && udf_match(flen, fname, child->len, child->name))
+		if (flen < 0) {
+			fi = ERR_PTR(flen);
+			goto out_err;
+		}
+
+		if (udf_match(flen, fname, child->len, child->name))
 			goto out_ok;
 	}
 
-out_err:
 	fi = NULL;
+out_err:
 	if (fibh->sbh != fibh->ebh)
 		brelse(fibh->ebh);
 	brelse(fibh->sbh);
@@ -256,6 +289,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	struct fileIdentDesc cfi;
 	struct udf_fileident_bh fibh;
+	struct fileIdentDesc *fi;
 
 	if (dentry->d_name.len > UDF_NAME_LEN - 2)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -275,7 +309,11 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 	} else
 #endif /* UDF_RECOVERY */
 
-	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
+	if (IS_ERR(fi))
+		return ERR_CAST(fi);
+
+	if (fi) {
 		struct kernel_lb_addr loc;
 
 		if (fibh.sbh != fibh.ebh)
@@ -774,8 +812,11 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -ENOENT;
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
-	if (!fi)
+	if (IS_ERR_OR_NULL(fi)) {
+		if (fi)
+			retval = PTR_ERR(fi);
 		goto out;
+	}
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
@@ -817,8 +858,12 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -ENOENT;
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
-	if (!fi)
+
+	if (IS_ERR_OR_NULL(fi)) {
+		if (fi)
+			retval = PTR_ERR(fi);
 		goto out;
+	}
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
@@ -1049,24 +1094,30 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
 	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
-	if (ofi) {
-		if (ofibh.sbh != ofibh.ebh)
-			brelse(ofibh.ebh);
-		brelse(ofibh.sbh);
+	if (IS_ERR(ofi)) {
+		retval = PTR_ERR(ofi);
+		goto end_rename;
 	}
+
+	if (ofibh.sbh != ofibh.ebh)
+		brelse(ofibh.ebh);
+
+	brelse(ofibh.sbh);
 	tloc = lelb_to_cpu(ocfi.icb.extLocation);
 	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
 	    != old_inode->i_ino)
 		goto end_rename;
 
 	nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
-	if (nfi) {
-		if (!new_inode) {
-			if (nfibh.sbh != nfibh.ebh)
-				brelse(nfibh.ebh);
-			brelse(nfibh.sbh);
-			nfi = NULL;
-		}
+	if (IS_ERR(nfi)) {
+		retval = PTR_ERR(nfi);
+		goto end_rename;
+	}
+	if (nfi && !new_inode) {
+		if (nfibh.sbh != nfibh.ebh)
+			brelse(nfibh.ebh);
+		brelse(nfibh.sbh);
+		nfi = NULL;
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		int offset = udf_ext0_offset(old_inode);
@@ -1221,7 +1272,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 static struct dentry *udf_fh_to_dentry(struct super_block *sb,
 				       struct fid *fid, int fh_len, int fh_type)
 {
-	if ((fh_len != 3 && fh_len != 5) ||
+	if (fh_len < 3 ||
 	    (fh_type != FILEID_UDF_WITH_PARENT &&
 	     fh_type != FILEID_UDF_WITHOUT_PARENT))
 		return NULL;
@@ -1233,7 +1284,7 @@ static struct dentry *udf_fh_to_dentry(struct super_block *sb,
 static struct dentry *udf_fh_to_parent(struct super_block *sb,
 				       struct fid *fid, int fh_len, int fh_type)
 {
-	if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT)
+	if (fh_len < 5 || fh_type != FILEID_UDF_WITH_PARENT)
 		return NULL;
 
 	return udf_nfs_get_inode(sb, fid->udf.parent_block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6299f3419..b96f190bc 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -927,17 +927,23 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 #endif
 	}
 
-	if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
-		if (udf_CS0toUTF8(outstr, instr)) {
-			strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
-				outstr->u_len > 31 ? 31 : outstr->u_len);
-			udf_debug("volIdent[] = '%s'\n",
-				  UDF_SB(sb)->s_volume_ident);
-		}
+	if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) {
+		ret = udf_CS0toUTF8(outstr, instr);
+		if (ret < 0)
+			goto out_bh;
+
+		strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
+			outstr->u_len > 31 ? 31 : outstr->u_len);
+		udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
+	}
 
-	if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
-		if (udf_CS0toUTF8(outstr, instr))
-			udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
+	if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) {
+		ret = udf_CS0toUTF8(outstr, instr);
+		if (ret < 0)
+			goto out_bh;
+
+		udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
+	}
 
 	ret = 0;
 out_bh:
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 8dfbc4025..862535b3b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -82,6 +82,9 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
 			comp_len = udf_get_filename(sb, pc->componentIdent,
 						    pc->lengthComponentIdent,
 						    p, tolen);
+			if (comp_len < 0)
+				return comp_len;
+
 			p += comp_len;
 			tolen -= comp_len;
 			if (tolen == 0)
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index b5cd8ed2a..b1b9a63d8 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -56,7 +56,7 @@ struct udf_inode_info {
 
 static inline struct udf_inode_info *UDF_I(struct inode *inode)
 {
-	return list_entry(inode, struct udf_inode_info, vfs_inode);
+	return container_of(inode, struct udf_inode_info, vfs_inode);
 }
 
 #endif /* _UDF_I_H) */
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index b84fee372..ab478e62b 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -68,21 +68,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
 /*
  * udf_build_ustr_exact
  */
-static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
+static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
 {
-	if ((!dest) || (!ptr) || (!exactsize))
-		return -1;
-
 	memset(dest, 0, sizeof(struct ustr));
 	dest->u_cmpID = ptr[0];
 	dest->u_len = exactsize - 1;
 	memcpy(dest->u_name, ptr + 1, exactsize - 1);
-
-	return 0;
 }
 
 /*
- * udf_ocu_to_utf8
+ * udf_CS0toUTF8
  *
  * PURPOSE
  *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
@@ -94,7 +89,7 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  * 				both of type "struct ustr *"
  *
  * POST-CONDITIONS
- *	<return>		Zero on success.
+ *	<return>		>= 0 on success.
  *
  * HISTORY
  *	November 12, 1997 - Andrew E. Mileski
@@ -117,7 +112,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
 		memset(utf_o, 0, sizeof(struct ustr));
 		pr_err("unknown compression code (%d) stri=%s\n",
 		       cmp_id, ocu_i->u_name);
-		return 0;
+		return -EINVAL;
 	}
 
 	ocu = ocu_i->u_name;
@@ -154,7 +149,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
 
 /*
  *
- * udf_utf8_to_ocu
+ * udf_UTF8toCS0
  *
  * PURPOSE
  *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
@@ -270,7 +265,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 		memset(utf_o, 0, sizeof(struct ustr));
 		pr_err("unknown compression code (%d) stri=%s\n",
 		       cmp_id, ocu_i->u_name);
-		return 0;
+		return -EINVAL;
 	}
 
 	ocu = ocu_i->u_name;
@@ -338,43 +333,51 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
 		     uint8_t *dname, int dlen)
 {
 	struct ustr *filename, *unifilename;
-	int len = 0;
+	int ret;
+
+	if (!slen)
+		return -EIO;
 
 	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
 	if (!filename)
-		return 0;
+		return -ENOMEM;
 
 	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
-	if (!unifilename)
+	if (!unifilename) {
+		ret = -ENOMEM;
 		goto out1;
+	}
 
-	if (udf_build_ustr_exact(unifilename, sname, slen))
-		goto out2;
-
+	udf_build_ustr_exact(unifilename, sname, slen);
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-		if (!udf_CS0toUTF8(filename, unifilename)) {
+		ret = udf_CS0toUTF8(filename, unifilename);
+		if (ret < 0) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
 			goto out2;
 		}
 	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
-				  unifilename)) {
+		ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+				   unifilename);
+		if (ret < 0) {
 			udf_debug("Failed in udf_get_filename: sname = %s\n",
 				  sname);
 			goto out2;
 		}
 	} else
-		goto out2;
+		BUG();
 
-	len = udf_translate_to_linux(dname, dlen,
+	ret = udf_translate_to_linux(dname, dlen,
 				     filename->u_name, filename->u_len,
 				     unifilename->u_name, unifilename->u_len);
+	/* Zero length filename isn't valid... */
+	if (ret == 0)
+		ret = -EINVAL;
 out2:
 	kfree(unifilename);
 out1:
 	kfree(filename);
-	return len;
+	return ret;
 }
 
 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1bfe8cabf..74f2e8028 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long ufs_dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
 	ino_t res = 0;
@@ -87,7 +82,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 
 /* Releases the page */
 void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
-		  struct page *page, struct inode *inode)
+		  struct page *page, struct inode *inode,
+		  bool update_times)
 {
 	loff_t pos = page_offset(page) +
 			(char *) de - (char *) page_address(page);
@@ -103,7 +99,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 
 	err = ufs_commit_chunk(page, pos, len);
 	ufs_put_page(page);
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (update_times)
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
 }
 
@@ -256,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
 	int namelen = qstr->len;
 	unsigned reclen = UFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
-	unsigned long npages = ufs_dir_pages(dir);
+	unsigned long npages = dir_pages(dir);
 	struct page *page = NULL;
 	struct ufs_inode_info *ui = UFS_I(dir);
 	struct ufs_dir_entry *de;
@@ -320,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
 	unsigned short rec_len, name_len;
 	struct page *page = NULL;
 	struct ufs_dir_entry *de;
-	unsigned long npages = ufs_dir_pages(dir);
+	unsigned long npages = dir_pages(dir);
 	unsigned long n;
 	char *kaddr;
 	loff_t pos;
@@ -437,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
-	unsigned long npages = ufs_dir_pages(inode);
+	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
 	int need_revalidate = file->f_version != inode->i_version;
 	unsigned flags = UFS_SB(sb)->s_flags;
@@ -608,7 +605,7 @@ int ufs_empty_dir(struct inode * inode)
 {
 	struct super_block *sb = inode->i_sb;
 	struct page *page = NULL;
-	unsigned long i, npages = ufs_dir_pages(inode);
+	unsigned long i, npages = dir_pages(inode);
 
 	for (i = 0; i < npages; i++) {
 		char *kaddr;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2d93ab07d..f913a6924 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -572,9 +572,10 @@ static void ufs_set_inode_ops(struct inode *inode)
 		inode->i_fop = &ufs_dir_operations;
 		inode->i_mapping->a_ops = &ufs_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (!inode->i_blocks)
+		if (!inode->i_blocks) {
 			inode->i_op = &ufs_fast_symlink_inode_operations;
-		else {
+			inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+		} else {
 			inode->i_op = &ufs_symlink_inode_operations;
 			inode->i_mapping->a_ops = &ufs_aops;
 		}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 60ee32249..479665543 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi
 	if (dentry->d_name.len > UFS_MAXNAMLEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	lock_ufs(dir->i_sb);
 	ino = ufs_inode_by_name(dir, &dentry->d_name);
 	if (ino)
 		inode = ufs_iget(dir->i_sb, ino);
-	unlock_ufs(dir->i_sb);
 	return d_splice_alias(inode, dentry);
 }
 
@@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
 		bool excl)
 {
 	struct inode *inode;
-	int err;
-
-	UFSD("BEGIN\n");
 
 	inode = ufs_new_inode(dir, mode);
-	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	if (!IS_ERR(inode)) {
-		inode->i_op = &ufs_file_inode_operations;
-		inode->i_fop = &ufs_file_operations;
-		inode->i_mapping->a_ops = &ufs_aops;
-		mark_inode_dirty(inode);
-		lock_ufs(dir->i_sb);
-		err = ufs_add_nondir(dentry, inode);
-		unlock_ufs(dir->i_sb);
-	}
-	UFSD("END: err=%d\n", err);
-	return err;
+	inode->i_op = &ufs_file_inode_operations;
+	inode->i_fop = &ufs_file_operations;
+	inode->i_mapping->a_ops = &ufs_aops;
+	mark_inode_dirty(inode);
+	return ufs_add_nondir(dentry, inode);
 }
 
 static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
@@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev
 		init_special_inode(inode, mode, rdev);
 		ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
 		mark_inode_dirty(inode);
-		lock_ufs(dir->i_sb);
 		err = ufs_add_nondir(dentry, inode);
-		unlock_ufs(dir->i_sb);
 	}
 	return err;
 }
@@ -121,18 +109,17 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	const char * symname)
 {
 	struct super_block * sb = dir->i_sb;
-	int err = -ENAMETOOLONG;
+	int err;
 	unsigned l = strlen(symname)+1;
 	struct inode * inode;
 
 	if (l > sb->s_blocksize)
-		goto out_notlocked;
+		return -ENAMETOOLONG;
 
-	lock_ufs(dir->i_sb);
 	inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out;
+		return err;
 
 	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
 		/* slow symlink */
@@ -144,22 +131,19 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	} else {
 		/* fast symlink */
 		inode->i_op = &ufs_fast_symlink_inode_operations;
-		memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
+		inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+		memcpy(inode->i_link, symname, l);
 		inode->i_size = l-1;
 	}
 	mark_inode_dirty(inode);
 
-	err = ufs_add_nondir(dentry, inode);
-out:
-	unlock_ufs(dir->i_sb);
-out_notlocked:
-	return err;
+	return ufs_add_nondir(dentry, inode);
 
 out_fail:
 	inode_dec_link_count(inode);
 	unlock_new_inode(inode);
 	iput(inode);
-	goto out;
+	return err;
 }
 
 static int ufs_link (struct dentry * old_dentry, struct inode * dir,
@@ -168,8 +152,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 	struct inode *inode = d_inode(old_dentry);
 	int error;
 
-	lock_ufs(dir->i_sb);
-
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
 	ihold(inode);
@@ -180,7 +162,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 		iput(inode);
 	} else
 		d_instantiate(dentry, inode);
-	unlock_ufs(dir->i_sb);
 	return error;
 }
 
@@ -189,7 +170,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 	struct inode * inode;
 	int err;
 
-	lock_ufs(dir->i_sb);
 	inode_inc_link_count(dir);
 
 	inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -210,12 +190,10 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 	err = ufs_add_link(dentry, inode);
 	if (err)
 		goto out_fail;
-	unlock_ufs(dir->i_sb);
 
 	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
-out:
-	return err;
+	return 0;
 
 out_fail:
 	inode_dec_link_count(inode);
@@ -224,8 +202,7 @@ out_fail:
 	iput (inode);
 out_dir:
 	inode_dec_link_count(dir);
-	unlock_ufs(dir->i_sb);
-	goto out;
+	return err;
 }
 
 static int ufs_unlink(struct inode *dir, struct dentry *dentry)
@@ -255,7 +232,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	struct inode * inode = d_inode(dentry);
 	int err= -ENOTEMPTY;
 
-	lock_ufs(dir->i_sb);
 	if (ufs_empty_dir (inode)) {
 		err = ufs_unlink(dir, dentry);
 		if (!err) {
@@ -264,7 +240,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 			inode_dec_link_count(dir);
 		}
 	}
-	unlock_ufs(dir->i_sb);
 	return err;
 }
 
@@ -302,7 +277,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		ufs_set_link(new_dir, new_de, new_page, old_inode);
+		ufs_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			drop_nlink(new_inode);
@@ -325,7 +300,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
-		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
+		if (old_dir != new_dir)
+			ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+		else {
+			kunmap(dir_page);
+			page_cache_release(dir_page);
+		}
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index dc33f9416..250579a80 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -80,6 +80,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index 5b537e2fd..874480bb4 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -25,23 +25,12 @@
  *  ext2 symlink handling code
  */
 
-#include <linux/fs.h>
-#include <linux/namei.h>
-
 #include "ufs_fs.h"
 #include "ufs.h"
 
-
-static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct ufs_inode_info *p = UFS_I(d_inode(dentry));
-	nd_set_link(nd, (char*)p->i_u1.i_symlink);
-	return NULL;
-}
-
 const struct inode_operations ufs_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= ufs_follow_link,
+	.follow_link	= simple_follow_link,
 	.setattr	= ufs_setattr,
 };
 
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index cf6368d42..2e31ea2e3 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -106,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page
 extern int ufs_empty_dir (struct inode *);
 extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
 extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
-			 struct page *page, struct inode *inode);
+			 struct page *page, struct inode *inode, bool update_times);
 
 /* file.c */
 extern const struct inode_operations ufs_file_inode_operations;
diff --git a/fs/xattr.c b/fs/xattr.c
index 6bb630399..072fee125 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -207,7 +207,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
 	*xattr_value = value;
 	return error;
 }
-EXPORT_SYMBOL(vfs_getxattr_alloc);
 
 /* Compare an extended attribute value with the given value */
 int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
@@ -299,18 +298,18 @@ vfs_removexattr(struct dentry *dentry, const char *name)
 
 	mutex_lock(&inode->i_mutex);
 	error = security_inode_removexattr(dentry, name);
-	if (error) {
-		mutex_unlock(&inode->i_mutex);
-		return error;
-	}
+	if (error)
+		goto out;
 
 	error = inode->i_op->removexattr(dentry, name);
-	mutex_unlock(&inode->i_mutex);
 
 	if (!error) {
 		fsnotify_xattr(dentry);
 		evm_inode_post_removexattr(dentry, name);
 	}
+
+out:
+	mutex_unlock(&inode->i_mutex);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_removexattr);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 516162be1..f9e9ffe6f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
 {
 	xfs_agblock_t	bno;
 	xfs_extlen_t	len;
+	xfs_extlen_t	diff;
 
 	/* Trim busy sections out of found extent */
 	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
 
+	/*
+	 * If we have a largish extent that happens to start before min_agbno,
+	 * see if we can shift it into range...
+	 */
+	if (bno < args->min_agbno && bno + len > args->min_agbno) {
+		diff = args->min_agbno - bno;
+		if (len > diff) {
+			bno += diff;
+			len -= diff;
+		}
+	}
+
 	if (args->alignment > 1 && len >= args->minlen) {
 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
-		xfs_extlen_t	diff = aligned_bno - bno;
+
+		diff = aligned_bno - bno;
 
 		*resbno = aligned_bno;
 		*reslen = diff >= len ? 0 : len - diff;
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
 		 * The good extent is closer than this one.
 		 */
 		if (!dir) {
+			if (*sbnoa > args->max_agbno)
+				goto out_use_good;
 			if (*sbnoa >= args->agbno + gdiff)
 				goto out_use_good;
 		} else {
+			if (*sbnoa < args->min_agbno)
+				goto out_use_good;
 			if (*sbnoa <= args->agbno - gdiff)
 				goto out_use_good;
 		}
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
 	dofirst = prandom_u32() & 1;
 #endif
 
+	/* handle unitialized agbno range so caller doesn't have to */
+	if (!args->min_agbno && !args->max_agbno)
+		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+	ASSERT(args->min_agbno <= args->max_agbno);
+
+	/* clamp agbno to the range if it's outside */
+	if (args->agbno < args->min_agbno)
+		args->agbno = args->min_agbno;
+	if (args->agbno > args->max_agbno)
+		args->agbno = args->max_agbno;
+
 restart:
 	bno_cur_lt = NULL;
 	bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
 						  &ltbnoa, &ltlena);
 			if (ltlena < args->minlen)
 				continue;
+			if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			xfs_alloc_compute_aligned(args, ltbno, ltlen,
 						  &ltbnoa, &ltlena);
-			if (ltlena >= args->minlen)
+			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
 				goto error0;
-			if (!i) {
+			if (!i || ltbnoa < args->min_agbno) {
 				xfs_btree_del_cursor(bno_cur_lt,
 						     XFS_BTREE_NOERROR);
 				bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			xfs_alloc_compute_aligned(args, gtbno, gtlen,
 						  &gtbnoa, &gtlena);
-			if (gtlena >= args->minlen)
+			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
 				goto error0;
-			if (!i) {
+			if (!i || gtbnoa > args->max_agbno) {
 				xfs_btree_del_cursor(bno_cur_gt,
 						     XFS_BTREE_NOERROR);
 				bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
 	ASSERT(ltnew >= ltbno);
 	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
 	args->agbno = ltnew;
 
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels(
 xfs_extlen_t
 xfs_alloc_longest_free_extent(
 	struct xfs_mount	*mp,
-	struct xfs_perag	*pag)
+	struct xfs_perag	*pag,
+	xfs_extlen_t		need)
 {
-	xfs_extlen_t		need, delta = 0;
+	xfs_extlen_t		delta = 0;
 
-	need = XFS_MIN_FREELIST_PAG(pag, mp);
 	if (need > pag->pagf_flcount)
 		delta = need - pag->pagf_flcount;
 
@@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent(
 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
 }
 
+unsigned int
+xfs_alloc_min_freelist(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag)
+{
+	unsigned int		min_free;
+
+	/* space needed by-bno freespace btree */
+	min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+				       mp->m_ag_maxlevels);
+	/* space needed by-size freespace btree */
+	min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+				       mp->m_ag_maxlevels);
+
+	return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		min_free,
+	int			flags)
+{
+	struct xfs_perag	*pag = args->pag;
+	xfs_extlen_t		longest;
+	int			available;
+
+	if (flags & XFS_ALLOC_FLAG_FREEING)
+		return true;
+
+	/* do we have enough contiguous free space for the allocation? */
+	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+		return false;
+
+	/* do have enough free space remaining for the allocation? */
+	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+			  min_free - args->total);
+	if (available < (int)args->minleft)
+		return false;
+
+	return true;
+}
+
 /*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
  */
 STATIC int			/* error */
 xfs_alloc_fix_freelist(
-	xfs_alloc_arg_t	*args,	/* allocation argument structure */
-	int		flags)	/* XFS_ALLOC_FLAG_... */
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	int			flags)	/* XFS_ALLOC_FLAG_... */
 {
-	xfs_buf_t	*agbp;	/* agf buffer pointer */
-	xfs_agf_t	*agf;	/* a.g. freespace structure pointer */
-	xfs_buf_t	*agflbp;/* agfl buffer pointer */
-	xfs_agblock_t	bno;	/* freelist block */
-	xfs_extlen_t	delta;	/* new blocks needed in freelist */
-	int		error;	/* error result code */
-	xfs_extlen_t	longest;/* longest extent in allocation group */
-	xfs_mount_t	*mp;	/* file system mount point structure */
-	xfs_extlen_t	need;	/* total blocks needed in freelist */
-	xfs_perag_t	*pag;	/* per-ag information structure */
-	xfs_alloc_arg_t	targs;	/* local allocation arguments */
-	xfs_trans_t	*tp;	/* transaction pointer */
-
-	mp = args->mp;
+	struct xfs_mount	*mp = args->mp;
+	struct xfs_perag	*pag = args->pag;
+	struct xfs_trans	*tp = args->tp;
+	struct xfs_buf		*agbp = NULL;
+	struct xfs_buf		*agflbp = NULL;
+	struct xfs_alloc_arg	targs;	/* local allocation arguments */
+	xfs_agblock_t		bno;	/* freelist block */
+	xfs_extlen_t		need;	/* total blocks needed in freelist */
+	int			error;
 
-	pag = args->pag;
-	tp = args->tp;
 	if (!pag->pagf_init) {
-		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-				&agbp)))
-			return error;
+		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+		if (error)
+			goto out_no_agbp;
 		if (!pag->pagf_init) {
 			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
 			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-			args->agbp = NULL;
-			return 0;
+			goto out_agbp_relse;
 		}
-	} else
-		agbp = NULL;
+	}
 
 	/*
-	 * If this is a metadata preferred pag and we are user data
-	 * then try somewhere else if we are not being asked to
-	 * try harder at this point
+	 * If this is a metadata preferred pag and we are user data then try
+	 * somewhere else if we are not being asked to try harder at this
+	 * point
 	 */
 	if (pag->pagf_metadata && args->userdata &&
 	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
 		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-		args->agbp = NULL;
-		return 0;
+		goto out_agbp_relse;
 	}
 
-	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-		/*
-		 * If it looks like there isn't a long enough extent, or enough
-		 * total blocks, reject it.
-		 */
-		need = XFS_MIN_FREELIST_PAG(pag, mp);
-		longest = xfs_alloc_longest_free_extent(mp, pag);
-		if ((args->minlen + args->alignment + args->minalignslop - 1) >
-				longest ||
-		    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
-			   need - args->total) < (int)args->minleft)) {
-			if (agbp)
-				xfs_trans_brelse(tp, agbp);
-			args->agbp = NULL;
-			return 0;
-		}
-	}
+	need = xfs_alloc_min_freelist(mp, pag);
+	if (!xfs_alloc_space_available(args, need, flags))
+		goto out_agbp_relse;
 
 	/*
 	 * Get the a.g. freespace buffer.
 	 * Can fail if we're not blocking on locks, and it's held.
 	 */
-	if (agbp == NULL) {
-		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-				&agbp)))
-			return error;
-		if (agbp == NULL) {
+	if (!agbp) {
+		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+		if (error)
+			goto out_no_agbp;
+		if (!agbp) {
 			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
 			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-			args->agbp = NULL;
-			return 0;
-		}
-	}
-	/*
-	 * Figure out how many blocks we should have in the freelist.
-	 */
-	agf = XFS_BUF_TO_AGF(agbp);
-	need = XFS_MIN_FREELIST(agf, mp);
-	/*
-	 * If there isn't enough total or single-extent, reject it.
-	 */
-	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-		delta = need > be32_to_cpu(agf->agf_flcount) ?
-			(need - be32_to_cpu(agf->agf_flcount)) : 0;
-		longest = be32_to_cpu(agf->agf_longest);
-		longest = (longest > delta) ? (longest - delta) :
-			(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
-		if ((args->minlen + args->alignment + args->minalignslop - 1) >
-				longest ||
-		    ((int)(be32_to_cpu(agf->agf_freeblks) +
-		     be32_to_cpu(agf->agf_flcount) - need - args->total) <
-				(int)args->minleft)) {
-			xfs_trans_brelse(tp, agbp);
-			args->agbp = NULL;
-			return 0;
+			goto out_no_agbp;
 		}
 	}
+
+	/* If there isn't enough total space or single-extent, reject it. */
+	need = xfs_alloc_min_freelist(mp, pag);
+	if (!xfs_alloc_space_available(args, need, flags))
+		goto out_agbp_relse;
+
 	/*
 	 * Make the freelist shorter if it's too long.
+	 *
+	 * Note that from this point onwards, we will always release the agf and
+	 * agfl buffers on error. This handles the case where we error out and
+	 * the buffers are clean or may not have been joined to the transaction
+	 * and hence need to be released manually. If they have been joined to
+	 * the transaction, then xfs_trans_brelse() will handle them
+	 * appropriately based on the recursion count and dirty state of the
+	 * buffer.
+	 *
+	 * XXX (dgc): When we have lots of free space, does this buy us
+	 * anything other than extra overhead when we need to put more blocks
+	 * back on the free list? Maybe we should only do this when space is
+	 * getting low or the AGFL is more than half full?
 	 */
-	while (be32_to_cpu(agf->agf_flcount) > need) {
-		xfs_buf_t	*bp;
+	while (pag->pagf_flcount > need) {
+		struct xfs_buf	*bp;
 
 		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
 		if (error)
-			return error;
-		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
-			return error;
+			goto out_agbp_relse;
+		error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+		if (error)
+			goto out_agbp_relse;
 		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
 		xfs_trans_binval(tp, bp);
 	}
-	/*
-	 * Initialize the args structure.
-	 */
+
 	memset(&targs, 0, sizeof(targs));
 	targs.tp = tp;
 	targs.mp = mp;
@@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist(
 	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
 	targs.type = XFS_ALLOCTYPE_THIS_AG;
 	targs.pag = pag;
-	if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
-		return error;
-	/*
-	 * Make the freelist longer if it's too short.
-	 */
-	while (be32_to_cpu(agf->agf_flcount) < need) {
+	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+	if (error)
+		goto out_agbp_relse;
+
+	/* Make the freelist longer if it's too short. */
+	while (pag->pagf_flcount < need) {
 		targs.agbno = 0;
-		targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
-		/*
-		 * Allocate as many blocks as possible at once.
-		 */
-		if ((error = xfs_alloc_ag_vextent(&targs))) {
-			xfs_trans_brelse(tp, agflbp);
-			return error;
-		}
+		targs.maxlen = need - pag->pagf_flcount;
+
+		/* Allocate as many blocks as possible at once. */
+		error = xfs_alloc_ag_vextent(&targs);
+		if (error)
+			goto out_agflbp_relse;
+
 		/*
 		 * Stop if we run out.  Won't happen if callers are obeying
 		 * the restrictions correctly.  Can happen for free calls
@@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist(
 		if (targs.agbno == NULLAGBLOCK) {
 			if (flags & XFS_ALLOC_FLAG_FREEING)
 				break;
-			xfs_trans_brelse(tp, agflbp);
-			args->agbp = NULL;
-			return 0;
+			goto out_agflbp_relse;
 		}
 		/*
 		 * Put each allocated block on the list.
@@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist(
 			error = xfs_alloc_put_freelist(tp, agbp,
 							agflbp, bno, 0);
 			if (error)
-				return error;
+				goto out_agflbp_relse;
 		}
 	}
 	xfs_trans_brelse(tp, agflbp);
 	args->agbp = agbp;
 	return 0;
+
+out_agflbp_relse:
+	xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+	if (agbp)
+		xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+	args->agbp = NULL;
+	return error;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a5c..ca1c81683 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
 	xfs_extlen_t	total;		/* total blocks needed in xaction */
 	xfs_extlen_t	alignment;	/* align answer to multiple of this */
 	xfs_extlen_t	minalignslop;	/* slop for minlen+alignment calcs */
+	xfs_agblock_t	min_agbno;	/* set an agbno range for NEAR allocs */
+	xfs_agblock_t	max_agbno;	/* ... */
 	xfs_extlen_t	len;		/* output: actual size of extent */
 	xfs_alloctype_t	type;		/* allocation type XFS_ALLOCTYPE_... */
 	xfs_alloctype_t	otype;		/* original allocation type */
@@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg {
 #define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
 
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+		struct xfs_perag *pag, xfs_extlen_t need);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
 		struct xfs_perag *pag);
 
 /*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 0a472fbe0..3349c9a1e 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -266,7 +266,7 @@ xfs_attr_set(
 	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
 	error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
 	if (error) {
-		xfs_trans_cancel(args.trans, 0);
+		xfs_trans_cancel(args.trans);
 		return error;
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +276,7 @@ xfs_attr_set(
 				       XFS_QMOPT_RES_REGBLKS);
 	if (error) {
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-		xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_trans_cancel(args.trans);
 		return error;
 	}
 
@@ -320,8 +320,7 @@ xfs_attr_set(
 				xfs_trans_ichgtime(args.trans, dp,
 							XFS_ICHGTIME_CHG);
 			}
-			err2 = xfs_trans_commit(args.trans,
-						 XFS_TRANS_RELEASE_LOG_RES);
+			err2 = xfs_trans_commit(args.trans);
 			xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 			return error ? error : err2;
@@ -383,16 +382,14 @@ xfs_attr_set(
 	 * Commit the last in the sequence of transactions.
 	 */
 	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(args.trans);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 	return error;
 
 out:
-	if (args.trans) {
-		xfs_trans_cancel(args.trans,
-			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	}
+	if (args.trans)
+		xfs_trans_cancel(args.trans);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
@@ -462,7 +459,7 @@ xfs_attr_remove(
 	error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
 				  XFS_ATTRRM_SPACE_RES(mp), 0);
 	if (error) {
-		xfs_trans_cancel(args.trans, 0);
+		xfs_trans_cancel(args.trans);
 		return error;
 	}
 
@@ -501,16 +498,14 @@ xfs_attr_remove(
 	 * Commit the last in the sequence of transactions.
 	 */
 	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(args.trans);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 	return error;
 
 out:
-	if (args.trans) {
-		xfs_trans_cancel(args.trans,
-			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	}
+	if (args.trans)
+		xfs_trans_cancel(args.trans);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f1026e86d..63e05b663 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork(
 	int			committed;	/* xaction was committed */
 	int			logflags;	/* logging flags */
 	int			error;		/* error return value */
-	int			cancel_flags = 0;
 
 	ASSERT(XFS_IFORK_Q(ip) == 0);
 
@@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork(
 		tp->t_flags |= XFS_TRANS_RESERVE;
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
 			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
 			XFS_QMOPT_RES_REGBLKS);
 	if (error)
 		goto trans_cancel;
-	cancel_flags |= XFS_TRANS_ABORT;
 	if (XFS_IFORK_Q(ip))
 		goto trans_cancel;
 	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork(
 	error = xfs_bmap_finish(&tp, &flist, &committed);
 	if (error)
 		goto bmap_cancel;
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 
 bmap_cancel:
 	xfs_bmap_cancel(&flist);
 trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
@@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent(
 		}
 	}
 
-	longest = xfs_alloc_longest_free_extent(mp, pag);
+	longest = xfs_alloc_longest_free_extent(mp, pag,
+					xfs_alloc_min_freelist(mp, pag));
 	if (*blen < longest)
 		*blen = longest;
 
@@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten(
 	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
 			&bma->cur, mval, bma->firstblock, bma->flist,
 			&tmp_logflags);
-	bma->logflags |= tmp_logflags;
+	/*
+	 * Log the inode core unconditionally in the unwritten extent conversion
+	 * path because the conversion might not have done so (e.g., if the
+	 * extent count hasn't changed). We need to make sure the inode is dirty
+	 * in the transaction for the sake of fsync(), even if nothing has
+	 * changed, because fsync() will not force the log for this transaction
+	 * unless it sees the inode pinned.
+	 */
+	bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
 	if (error)
 		return error;
 
@@ -5918,7 +5924,7 @@ xfs_bmap_split_extent(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
 			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -5936,10 +5942,9 @@ xfs_bmap_split_extent(
 	if (error)
 		goto out;
 
-	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
+	return xfs_trans_commit(tp);
 
 out:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 4daaa6623..a0ae57205 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
 	__uint32_t	sb_features_log_incompat;
 
 	__uint32_t	sb_crc;		/* superblock crc */
-	__uint32_t	sb_pad;
+	xfs_extlen_t	sb_spino_align;	/* sparse inode chunk alignment */
 
 	xfs_ino_t	sb_pquotino;	/* project quota inode */
 	xfs_lsn_t	sb_lsn;		/* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
 	__be32		sb_features_log_incompat;
 
 	__le32		sb_crc;		/* superblock crc */
-	__be32		sb_pad;
+	__be32		sb_spino_align;	/* sparse inode chunk alignment */
 
 	__be64		sb_pquotino;	/* project quota inode */
 	__be64		sb_lsn;		/* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
 }
 
 #define XFS_SB_FEAT_INCOMPAT_FTYPE	(1 << 0)	/* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES	(1 << 1)	/* sparse inode chunks */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
-		(XFS_SB_FEAT_INCOMPAT_FTYPE)
+		(XFS_SB_FEAT_INCOMPAT_FTYPE|	\
+		 XFS_SB_FEAT_INCOMPAT_SPINODES)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
 		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
 }
 
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
 /*
  * end of superblock version macros
  */
@@ -758,19 +766,6 @@ typedef struct xfs_agfl {
 
 #define XFS_AGFL_CRC_OFF	offsetof(struct xfs_agfl, agfl_crc)
 
-
-#define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
-#define	XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
-	(MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define	XFS_MIN_FREELIST(a,mp)		\
-	(XFS_MIN_FREELIST_RAW(		\
-		be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
-		be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define	XFS_MIN_FREELIST_PAG(pag,mp)	\
-	(XFS_MIN_FREELIST_RAW(		\
-		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
 #define XFS_AGB_TO_FSB(mp,agno,agbno)	\
 	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
 #define	XFS_FSB_TO_AGNO(mp,fsbno)	\
@@ -1216,26 +1211,54 @@ typedef	__uint64_t	xfs_inofree_t;
 #define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
 #define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
 
+#define XFS_INOBT_HOLEMASK_FULL		0	/* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS		(NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT	\
+	(XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
 	return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
 }
 
 /*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes wth the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
  */
 typedef struct xfs_inobt_rec {
 	__be32		ir_startino;	/* starting inode number */
-	__be32		ir_freecount;	/* count of free inodes (set bits) */
+	union {
+		struct {
+			__be32	ir_freecount;	/* count of free inodes */
+		} f;
+		struct {
+			__be16	ir_holemask;/* hole mask for sparse chunks */
+			__u8	ir_count;	/* total inode count */
+			__u8	ir_freecount;	/* count of free inodes */
+		} sp;
+	} ir_u;
 	__be64		ir_free;	/* free inode mask */
 } xfs_inobt_rec_t;
 
 typedef struct xfs_inobt_rec_incore {
 	xfs_agino_t	ir_startino;	/* starting inode number */
-	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
+	__uint16_t	ir_holemask;	/* hole mask for sparse chunks */
+	__uint8_t	ir_count;	/* total inode count */
+	__uint8_t	ir_freecount;	/* count of free inodes (set bits) */
 	xfs_inofree_t	ir_free;	/* free inode mask */
 } xfs_inobt_rec_incore_t;
 
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+	/* non-zero holemask represents a sparse rec. */
+	return holemask;
+}
 
 /*
  * Key structure
@@ -1453,8 +1476,8 @@ struct xfs_acl {
 		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE		"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca..89689c6a4 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_V5SB	0x8000	/* version 5 superblock */
 #define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks	*/
 
 /*
  * Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 1c9e75521..66efc7024 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
 	int			*stat)	/* success/failure */
 {
 	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_holemask = 0;
+	cur->bc_rec.i.ir_count = 0;
 	cur->bc_rec.i.ir_freecount = 0;
 	cur->bc_rec.i.ir_free = 0;
 	return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
 	union xfs_btree_rec	rec;
 
 	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
-	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+		rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+		rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+		rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+	} else {
+		/* ir_holemask/ir_count not supported on-disk */
+		rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	}
 	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
 	return xfs_btree_update(cur, &rec);
 }
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
 	int			error;
 
 	error = xfs_btree_get_rec(cur, &rec, stat);
-	if (!error && *stat == 1) {
-		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
-		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
-		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+	if (error || *stat == 0)
+		return error;
+
+	irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+	if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+		irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+		irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+		irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+	} else {
+		/*
+		 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+		 * values for full inode chunks.
+		 */
+		irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+		irec->ir_count = XFS_INODES_PER_CHUNK;
+		irec->ir_freecount =
+				be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
 	}
-	return error;
+	irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+	return 0;
 }
 
 /*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
 STATIC int
 xfs_inobt_insert_rec(
 	struct xfs_btree_cur	*cur,
+	__uint16_t		holemask,
+	__uint8_t		count,
 	__int32_t		freecount,
 	xfs_inofree_t		free,
 	int			*stat)
 {
+	cur->bc_rec.i.ir_holemask = holemask;
+	cur->bc_rec.i.ir_count = count;
 	cur->bc_rec.i.ir_freecount = freecount;
 	cur->bc_rec.i.ir_free = free;
 	return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
 		}
 		ASSERT(i == 0);
 
-		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+		error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+					     XFS_INODES_PER_CHUNK,
+					     XFS_INODES_PER_CHUNK,
 					     XFS_INOBT_ALL_FREE, &i);
 		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
 	struct list_head	*buffer_list,
+	int			icount,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno,
 	xfs_agblock_t		length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
 		 * they track in the AIL as if they were physically logged.
 		 */
 		if (tp)
-			xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+			xfs_icreate_log(tp, agno, agbno, icount,
 					mp->m_sb.sb_inodesize, length, gen);
 	} else
 		version = 2;
@@ -347,6 +378,214 @@ xfs_ialloc_inode_init(
 }
 
 /*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+	struct xfs_mount		*mp,
+	xfs_agino_t			*startino,
+	uint16_t			*allocmask)
+{
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			mod;
+	int				offset;
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+	mod = agbno % mp->m_sb.sb_inoalignmt;
+	if (!mod)
+		return;
+
+	/* calculate the inode offset and align startino */
+	offset = mod << mp->m_sb.sb_inopblog;
+	*startino -= offset;
+
+	/*
+	 * Since startino has been aligned down, left shift allocmask such that
+	 * it continues to represent the same physical inodes relative to the
+	 * new startino.
+	 */
+	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
+	struct xfs_inobt_rec_incore	*srec)	/* src record */
+{
+	uint64_t			talloc;
+	uint64_t			salloc;
+
+	/* records must cover the same inode range */
+	if (trec->ir_startino != srec->ir_startino)
+		return false;
+
+	/* both records must be sparse */
+	if (!xfs_inobt_issparse(trec->ir_holemask) ||
+	    !xfs_inobt_issparse(srec->ir_holemask))
+		return false;
+
+	/* both records must track some inodes */
+	if (!trec->ir_count || !srec->ir_count)
+		return false;
+
+	/* can't exceed capacity of a full record */
+	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+		return false;
+
+	/* verify there is no allocation overlap */
+	talloc = xfs_inobt_irec_to_allocmask(trec);
+	salloc = xfs_inobt_irec_to_allocmask(srec);
+	if (talloc & salloc)
+		return false;
+
+	return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* target */
+	struct xfs_inobt_rec_incore	*srec)	/* src */
+{
+	ASSERT(trec->ir_startino == srec->ir_startino);
+
+	/* combine the counts */
+	trec->ir_count += srec->ir_count;
+	trec->ir_freecount += srec->ir_freecount;
+
+	/*
+	 * Merge the holemask and free mask. For both fields, 0 bits refer to
+	 * allocated inodes. We combine the allocated ranges with bitwise AND.
+	 */
+	trec->ir_holemask &= srec->ir_holemask;
+	trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	int				btnum,
+	struct xfs_inobt_rec_incore	*nrec,	/* in/out: new/merged rec. */
+	bool				merge)	/* merge or replace */
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	int				error;
+	int				i;
+	struct xfs_inobt_rec_incore	rec;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	/* the new record is pre-aligned so we know where to look */
+	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	/* if nothing there, insert a new record and return */
+	if (i == 0) {
+		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+					     nrec->ir_count, nrec->ir_freecount,
+					     nrec->ir_free, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+		goto out;
+	}
+
+	/*
+	 * A record exists at this startino. Merge or replace the record
+	 * depending on what we've been asked to do.
+	 */
+	if (merge) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+					rec.ir_startino == nrec->ir_startino,
+					error);
+
+		/*
+		 * This should never fail. If we have coexisting records that
+		 * cannot merge, something is seriously wrong.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+					error);
+
+		trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+					 rec.ir_holemask, nrec->ir_startino,
+					 nrec->ir_holemask);
+
+		/* merge to nrec to output the updated record */
+		__xfs_inobt_rec_merge(nrec, &rec);
+
+		trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+					  nrec->ir_holemask);
+
+		error = xfs_inobt_rec_check_count(mp, nrec);
+		if (error)
+			goto error;
+	}
+
+	error = xfs_inobt_update(cur, nrec);
+	if (error)
+		goto error;
+
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
  */
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	newlen;		/* new number of inodes */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	uint16_t	allocmask = (uint16_t) -1; /* init. to full chunk */
+	struct xfs_inobt_rec_incore rec;
 	struct xfs_perag *pag;
+	int		do_sparse = 0;
 
 	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
+	args.fsbno = NULLFSBLOCK;
+
+#ifdef DEBUG
+	/* randomly do sparse inode allocations */
+	if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+		do_sparse = prandom_u32() & 1;
+#endif
 
 	/*
 	 * Locking will ensure that we don't have two callers in here
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
 	agno = be32_to_cpu(agi->agi_seqno);
 	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 		     args.mp->m_ialloc_blks;
+	if (do_sparse)
+		goto sparse_alloc;
 	if (likely(newino != NULLAGINO &&
 		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
 		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
 		 * subsequent requests.
 		 */
 		args.minalignslop = 0;
-	} else
-		args.fsbno = NULLFSBLOCK;
+	}
 
 	if (unlikely(args.fsbno == NULLFSBLOCK)) {
 		/*
@@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc(
 			return error;
 	}
 
+	/*
+	 * Finally, try a sparse allocation if the filesystem supports it and
+	 * the sparse allocation length is smaller than a full chunk.
+	 */
+	if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+	    args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = args.mp->m_sb.sb_spino_align;
+		args.prod = 1;
+
+		args.minlen = args.mp->m_ialloc_min_blks;
+		args.maxlen = args.minlen;
+
+		/*
+		 * The inode record will be aligned to full chunk size. We must
+		 * prevent sparse allocation from AG boundaries that result in
+		 * invalid inode records, such as records that start at agbno 0
+		 * or extend beyond the AG.
+		 *
+		 * Set min agbno to the first aligned, non-zero agbno and max to
+		 * the last aligned agbno that is at least one full chunk from
+		 * the end of the AG.
+		 */
+		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+					    args.mp->m_sb.sb_inoalignmt) -
+				 args.mp->m_ialloc_blks;
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+
+		newlen = args.len << args.mp->m_sb.sb_inopblog;
+		ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+	}
+
 	if (args.fsbno == NULLFSBLOCK) {
 		*alloc = 0;
 		return 0;
@@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc(
 	 * rather than a linear progression to prevent the next generation
 	 * number from being easily guessable.
 	 */
-	error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
-			args.len, prandom_u32());
+	error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+			args.agbno, args.len, prandom_u32());
 
 	if (error)
 		return error;
@@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc(
 	 * Convert the results.
 	 */
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+	if (xfs_inobt_issparse(~allocmask)) {
+		/*
+		 * We've allocated a sparse chunk. Align the startino and mask.
+		 */
+		xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+		rec.ir_startino = newino;
+		rec.ir_holemask = ~allocmask;
+		rec.ir_count = newlen;
+		rec.ir_freecount = newlen;
+		rec.ir_free = XFS_INOBT_ALL_FREE;
+
+		/*
+		 * Insert the sparse record into the inobt and allow for a merge
+		 * if necessary. If a merge does occur, rec is updated to the
+		 * merged record.
+		 */
+		error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+					       &rec, true);
+		if (error == -EFSCORRUPTED) {
+			xfs_alert(args.mp,
+	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+				  XFS_AGINO_TO_INO(args.mp, agno,
+						   rec.ir_startino),
+				  rec.ir_holemask, rec.ir_count);
+			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+		}
+		if (error)
+			return error;
+
+		/*
+		 * We can't merge the part we've just allocated as for the inobt
+		 * due to finobt semantics. The original record may or may not
+		 * exist independent of whether physical inodes exist in this
+		 * sparse chunk.
+		 *
+		 * We must update the finobt record based on the inobt record.
+		 * rec contains the fully merged and up to date inobt record
+		 * from the previous call. Set merge false to replace any
+		 * existing record with this one.
+		 */
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+						       XFS_BTNUM_FINO, &rec,
+						       false);
+			if (error)
+				return error;
+		}
+	} else {
+		/* full chunk - insert new records to both btrees */
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_INO);
+		if (error)
+			return error;
+
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+						 newlen, XFS_BTNUM_FINO);
+			if (error)
+				return error;
+		}
+	}
+
+	/*
+	 * Update AGI counts and newino.
+	 */
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
 	pag = xfs_perag_get(args.mp, agno);
@@ -512,20 +871,6 @@ xfs_ialloc_ag_alloc(
 	agi->agi_newino = cpu_to_be32(newino);
 
 	/*
-	 * Insert records describing the new inode chunk into the btrees.
-	 */
-	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-				 XFS_BTNUM_INO);
-	if (error)
-		return error;
-
-	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-					 XFS_BTNUM_FINO);
-		if (error)
-			return error;
-	}
-	/*
 	 * Log allocation group header fields
 	 */
 	xfs_ialloc_log_agi(tp, agbp,
@@ -645,7 +990,7 @@ xfs_ialloc_ag_select(
 		 * if we fail allocation due to alignment issues then it is most
 		 * likely a real ENOSPC condition.
 		 */
-		ineed = mp->m_ialloc_blks;
+		ineed = mp->m_ialloc_min_blks;
 		if (flags && ineed > 1)
 			ineed += xfs_ialloc_cluster_alignment(mp);
 		longest = pag->pagf_longest;
@@ -732,6 +1077,27 @@ xfs_ialloc_get_rec(
 }
 
 /*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+	struct xfs_inobt_rec_incore	*rec)
+{
+	xfs_inofree_t			realfree;
+
+	/* if there are no holes, return the first available offset */
+	if (!xfs_inobt_issparse(rec->ir_holemask))
+		return xfs_lowbit64(rec->ir_free);
+
+	realfree = xfs_inobt_irec_to_allocmask(rec);
+	realfree &= rec->ir_free;
+
+	return xfs_lowbit64(realfree);
+}
+
+/*
  * Allocate an inode using the inobt-only algorithm.
  */
 STATIC int
@@ -961,7 +1327,7 @@ newino:
 	}
 
 alloc_inode:
-	offset = xfs_lowbit64(rec.ir_free);
+	offset = xfs_inobt_first_free_inode(&rec);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1576,7 @@ xfs_dialloc_ag(
 	if (error)
 		goto error_cur;
 
-	offset = xfs_lowbit64(rec.ir_free);
+	offset = xfs_inobt_first_free_inode(&rec);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1805,83 @@ out_error:
 	return error;
 }
 
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*rec,
+	struct xfs_bmap_free		*flist)
+{
+	xfs_agblock_t	sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+	int		startidx, endidx;
+	int		nextbit;
+	xfs_agblock_t	agbno;
+	int		contigblk;
+	DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+	if (!xfs_inobt_issparse(rec->ir_holemask)) {
+		/* not sparse, calculate extent info directly */
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+				  XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+				  mp->m_ialloc_blks, flist, mp);
+		return;
+	}
+
+	/* holemask is only 16-bits (fits in an unsigned long) */
+	ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+	holemask[0] = rec->ir_holemask;
+
+	/*
+	 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+	 * holemask and convert the start/end index of each range to an extent.
+	 * We start with the start and end index both pointing at the first 0 in
+	 * the mask.
+	 */
+	startidx = endidx = find_first_zero_bit(holemask,
+						XFS_INOBT_HOLEMASK_BITS);
+	nextbit = startidx + 1;
+	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+					     nextbit);
+		/*
+		 * If the next zero bit is contiguous, update the end index of
+		 * the current range and continue.
+		 */
+		if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+		    nextbit == endidx + 1) {
+			endidx = nextbit;
+			goto next;
+		}
+
+		/*
+		 * nextbit is not contiguous with the current end index. Convert
+		 * the current start/end to an extent and add it to the free
+		 * list.
+		 */
+		agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+				  mp->m_sb.sb_inopblock;
+		contigblk = ((endidx - startidx + 1) *
+			     XFS_INODES_PER_HOLEMASK_BIT) /
+			    mp->m_sb.sb_inopblock;
+
+		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+				  flist, mp);
+
+		/* reset range to current bit and carry on... */
+		startidx = endidx = nextbit;
+
+next:
+		nextbit++;
+	}
+}
+
 STATIC int
 xfs_difree_inobt(
 	struct xfs_mount		*mp,
@@ -1446,8 +1889,7 @@ xfs_difree_inobt(
 	struct xfs_buf			*agbp,
 	xfs_agino_t			agino,
 	struct xfs_bmap_free		*flist,
-	int				*deleted,
-	xfs_ino_t			*first_ino,
+	struct xfs_icluster		*xic,
 	struct xfs_inobt_rec_incore	*orec)
 {
 	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1943,23 @@ xfs_difree_inobt(
 	rec.ir_freecount++;
 
 	/*
-	 * When an inode cluster is free, it becomes eligible for removal
+	 * When an inode chunk is free, it becomes eligible for removal. Don't
+	 * remove the chunk if the block size is large enough for multiple inode
+	 * chunks (that might not be free).
 	 */
 	if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
-	    (rec.ir_freecount == mp->m_ialloc_inos)) {
-
-		*deleted = 1;
-		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+	    rec.ir_free == XFS_INOBT_ALL_FREE &&
+	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+		xic->deleted = 1;
+		xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+		xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
 
 		/*
 		 * Remove the inode cluster from the AGI B+Tree, adjust the
 		 * AGI and Superblock inode counts, and mark the disk space
 		 * to be freed when the transaction is committed.
 		 */
-		ilen = mp->m_ialloc_inos;
+		ilen = rec.ir_freecount;
 		be32_add_cpu(&agi->agi_count, -ilen);
 		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1975,9 @@ xfs_difree_inobt(
 			goto error0;
 		}
 
-		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-				  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
-				  mp->m_ialloc_blks, flist, mp);
+		xfs_difree_inode_chunk(mp, agno, &rec, flist);
 	} else {
-		*deleted = 0;
+		xic->deleted = 0;
 
 		error = xfs_inobt_update(cur, &rec);
 		if (error) {
@@ -1599,7 +2042,9 @@ xfs_difree_finobt(
 		 */
 		XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
 
-		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+		error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+					     ibtrec->ir_count,
+					     ibtrec->ir_freecount,
 					     ibtrec->ir_free, &i);
 		if (error)
 			goto error;
@@ -1634,8 +2079,13 @@ xfs_difree_finobt(
 	 * free inode. Hence, if all of the inodes are free and we aren't
 	 * keeping inode chunks permanently on disk, remove the record.
 	 * Otherwise, update the record with the new information.
+	 *
+	 * Note that we currently can't free chunks when the block size is large
+	 * enough for multiple chunks. Leave the finobt record to remain in sync
+	 * with the inobt.
 	 */
-	if (rec.ir_freecount == mp->m_ialloc_inos &&
+	if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
 	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 		error = xfs_btree_delete(cur, &i);
 		if (error)
@@ -1671,8 +2121,7 @@ xfs_difree(
 	struct xfs_trans	*tp,		/* transaction pointer */
 	xfs_ino_t		inode,		/* inode to be freed */
 	struct xfs_bmap_free	*flist,		/* extents to free */
-	int			*deleted,/* set if inode cluster was deleted */
-	xfs_ino_t		*first_ino)/* first inode in deleted cluster */
+	struct xfs_icluster	*xic)	/* cluster info if deleted */
 {
 	/* REFERENCED */
 	xfs_agblock_t		agbno;	/* block number containing inode */
@@ -1723,8 +2172,7 @@ xfs_difree(
 	/*
 	 * Fix up the inode allocation btree.
 	 */
-	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
-				 &rec);
+	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
 	if (error)
 		goto error0;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56..6e450df29 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
 /* Move inodes in clusters of this size */
 #define	XFS_INODE_BIG_CLUSTER_SIZE	8192
 
+struct xfs_icluster {
+	bool		deleted;	/* record is deleted */
+	xfs_ino_t	first_ino;	/* first inode number */
+	uint64_t	alloc;		/* inode phys. allocation bitmap for
+					 * sparse chunks */
+};
+
 /* Calculate and return the number of filesystem blocks per inode cluster */
 static inline int
 xfs_icluster_size_fsb(
@@ -44,8 +51,7 @@ xfs_icluster_size_fsb(
 static inline struct xfs_dinode *
 xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 {
-	return (struct xfs_dinode *)
-		(xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+	return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
 }
 
 /*
@@ -90,8 +96,7 @@ xfs_difree(
 	struct xfs_trans *tp,		/* transaction pointer */
 	xfs_ino_t	inode,		/* inode to be freed */
 	struct xfs_bmap_free *flist,	/* extents to free */
-	int		*deleted,	/* set if inode cluster was deleted */
-	xfs_ino_t	*first_ino);	/* first inode in deleted cluster */
+	struct xfs_icluster *ifree);	/* cluster info if deleted */
 
 /*
  * Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
  * Inode chunk initialisation routine
  */
 int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
-			  struct list_head *buffer_list,
+			  struct list_head *buffer_list, int icount,
 			  xfs_agnumber_t agno, xfs_agblock_t agbno,
 			  xfs_agblock_t length, unsigned int gen);
 
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca..674ad8f76 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
 	union xfs_btree_rec	*rec)
 {
 	rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-	rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+		rec->inobt.ir_u.sp.ir_holemask =
+					cpu_to_be16(cur->bc_rec.i.ir_holemask);
+		rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+		rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+	} else {
+		/* ir_holemask/ir_count not supported on-disk */
+		rec->inobt.ir_u.f.ir_freecount =
+					cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	}
 	rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
 }
 
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
 		return blocklen / sizeof(xfs_inobt_rec_t);
 	return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
 }
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+	struct xfs_inobt_rec_incore	*rec)
+{
+	uint64_t			bitmap = 0;
+	uint64_t			inodespbit;
+	int				nextbit;
+	uint				allocbitmap;
+
+	/*
+	 * The holemask has 16-bits for a 64 inode record. Therefore each
+	 * holemask bit represents multiple inodes. Create a mask of bits to set
+	 * in the allocmask for each holemask bit.
+	 */
+	inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+	/*
+	 * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+	 * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+	 * anything beyond the 16 holemask bits since this casts to a larger
+	 * type.
+	 */
+	allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+	/*
+	 * allocbitmap is the inverted holemask so every set bit represents
+	 * allocated inodes. To expand from 16-bit holemask granularity to
+	 * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+	 * bitmap for every holemask bit.
+	 */
+	nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+	while (nextbit != -1) {
+		ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+		bitmap |= (inodespbit <<
+			   (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+		nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+	}
+
+	return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+	struct xfs_mount		*mp,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int				inocount = 0;
+	int				nextbit = 0;
+	uint64_t			allocbmap;
+	int				wordsz;
+
+	wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+	allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+	nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+	while (nextbit != -1) {
+		inocount++;
+		nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+				       nextbit + 1);
+	}
+
+	if (inocount != rec->ir_count)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+#endif	/* DEBUG */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c..bd8845321 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
 		xfs_btnum_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+			      struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec)	0
+#endif	/* DEBUG */
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 002b6b3a1..6526e7696 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -46,8 +46,7 @@ xfs_inobp_check(
 	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 
 	for (i = 0; i < j; i++) {
-		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-					i * mp->m_sb.sb_inodesize);
+		dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
 		if (!dip->di_next_unlinked)  {
 			xfs_alert(mp,
 	"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
@@ -86,8 +85,7 @@ xfs_inode_buf_verify(
 		int		di_ok;
 		xfs_dinode_t	*dip;
 
-		dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-					(i << mp->m_sb.sb_inodelog));
+		dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
 		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
 			    XFS_DINODE_GOOD_VERSION(dip->di_version);
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
@@ -186,7 +184,7 @@ xfs_imap_to_bp(
 	}
 
 	*bpp = bp;
-	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+	*dipp = xfs_buf_offset(bp, imap->im_boffset);
 	return 0;
 }
 
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index dc4bfc5d8..df9851c46 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
 			return -EFSCORRUPTED;
 	}
 
+	/*
+	 * Full inode chunks must be aligned to inode chunk size when
+	 * sparse inodes are enabled to support the sparse chunk
+	 * allocation algorithm and prevent overlapping inode records.
+	 */
+	if (xfs_sb_version_hassparseinodes(sbp)) {
+		uint32_t	align;
+
+		xfs_alert(mp,
+	"EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
+		align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+				>> sbp->sb_blocklog;
+		if (sbp->sb_inoalignmt != align) {
+			xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+				 sbp->sb_inoalignmt, align);
+			return -EINVAL;
+		}
+	}
+
 	if (unlikely(
 	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
 		xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
 				be32_to_cpu(from->sb_features_log_incompat);
 	/* crc is only used on disk, not in memory; just init to 0 here. */
 	to->sb_crc = 0;
-	to->sb_pad = 0;
+	to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
 	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
 	to->sb_lsn = be64_to_cpu(from->sb_lsn);
 	/* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
 				cpu_to_be32(from->sb_features_incompat);
 		to->sb_features_log_incompat =
 				cpu_to_be32(from->sb_features_log_incompat);
-		to->sb_pad = 0;
+		to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
 		to->sb_lsn = cpu_to_be64(from->sb_lsn);
 	}
 }
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
 	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
 					sbp->sb_inopblock);
 	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+	if (sbp->sb_spino_align)
+		mp->m_ialloc_min_blks = sbp->sb_spino_align;
+	else
+		mp->m_ialloc_min_blks = mp->m_ialloc_blks;
 }
 
 /*
@@ -792,12 +818,12 @@ xfs_sync_sb(
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
 	xfs_log_sb(tp);
 	if (wait)
 		xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp);
 }
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 8dda4b321..5be529707 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -182,12 +182,6 @@ int	xfs_log_calc_minimum_size(struct xfs_mount *);
 #define XFS_TRANS_FREEZE_PROT	0x40	/* Transaction has elevated writer
 					   count in superblock */
 /*
- * Values for call flags parameter.
- */
-#define	XFS_TRANS_RELEASE_LOG_RES	0x4
-#define	XFS_TRANS_ABORT			0x8
-
-/*
  * Field values for xfs_trans_mod_sb.
  */
 #define	XFS_TRANS_SB_ICOUNT		0x00000001
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 2d5bdfce6..797815012 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,9 +73,9 @@ struct xfs_trans_resv {
  * 2 trees * (2 blocks/level * max depth - 1) * block size
  */
 #define	XFS_ALLOCFREE_LOG_RES(mp,nx) \
-	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
 #define	XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+	((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
 
 /*
  * Per-directory log reservation for any directory change.
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c45793..41e0428d8 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -67,7 +67,7 @@
 #define	XFS_DIOSTRAT_SPACE_RES(mp, v)	\
 	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
 #define	XFS_GROWFS_SPACE_RES(mp)	\
-	(2 * XFS_AG_MAXLEVELS(mp))
+	(2 * (mp)->m_ag_maxlevels)
 #define	XFS_GROWFSRT_SPACE_RES(mp,b)	\
 	((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
 #define	XFS_LINK_SPACE_RES(mp,nl)	\
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a56960dd1..3859f5e27 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc(
 
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -145,7 +145,7 @@ xfs_setfilesize(
 	isize = xfs_new_eof(ip, offset + size);
 	if (!isize) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return 0;
 	}
 
@@ -155,7 +155,7 @@ xfs_setfilesize(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	return xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp);
 }
 
 STATIC int
@@ -356,7 +356,6 @@ xfs_end_bio(
 {
 	xfs_ioend_t		*ioend = bio->bi_private;
 
-	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 
 	/* Toss bio and pass work off to an xfsdatad thread */
@@ -1349,7 +1348,7 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	int			direct)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1414,6 +1413,7 @@ __xfs_get_blocks(
 			if (error)
 				return error;
 			new = 1;
+
 		} else {
 			/*
 			 * Delalloc reservations do not require a transaction,
@@ -1508,49 +1508,29 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
+static void
+__xfs_end_io_direct_write(
+	struct inode		*inode,
+	struct xfs_ioend	*ioend,
 	loff_t			offset,
-	ssize_t			size,
-	void			*private)
+	ssize_t			size)
 {
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ioend	*ioend = private;
-
-	trace_xfs_gbmap_direct_endio(ip, offset, size,
-				     ioend ? ioend->io_type : 0, NULL);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 
-	if (!ioend) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
 		goto out_end_io;
 
 	/*
@@ -1587,10 +1567,10 @@ xfs_end_io_direct_write(
 	 * here can result in EOF moving backwards and Bad Things Happen when
 	 * that occurs.
 	 */
-	spin_lock(&ip->i_flags_lock);
+	spin_lock(&XFS_I(inode)->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
-	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1587,98 @@ out_end_io:
 	return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	__xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	struct xfs_ioend	*ioend = bh->b_private;
+	struct inode		*inode = ioend->io_inode;
+	ssize_t			size = ioend->io_size;
+
+	ASSERT(IS_DAX(ioend->io_inode));
+
+	/* if there was an error zeroing, then don't convert it */
+	if (!uptodate)
+		ioend->io_error = -EIO;
+
+	/*
+	 * Trim update to EOF, so we don't extend EOF during unwritten extent
+	 * conversion of partial EOF blocks.
+	 */
+	spin_lock(&XFS_I(inode)->i_flags_lock);
+	if (ioend->io_offset + size > i_size_read(inode))
+		size = i_size_read(inode) - ioend->io_offset;
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+	struct inode		*inode,
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset,
+	void			(*endio)(struct kiocb	*iocb,
+					 loff_t		offset,
+					 ssize_t	size,
+					 void		*private),
+	int			flags)
+{
+	struct block_device	*bdev;
+
+	if (IS_DAX(inode))
+		return dax_do_io(iocb, inode, iter, offset,
+				 xfs_get_blocks_direct, endio, 0);
+
+	bdev = xfs_find_bdev_for_inode(inode);
+	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				     xfs_get_blocks_direct, endio, NULL, flags);
+}
+
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
@@ -1614,16 +1686,11 @@ xfs_vm_direct_IO(
 	loff_t			offset)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
-	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 
-	if (iov_iter_rw(iter) == WRITE) {
-		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-					    xfs_get_blocks_direct,
-					    xfs_end_io_direct_write, NULL,
-					    DIO_ASYNC_EXTEND);
-	}
-	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-				    xfs_get_blocks_direct, NULL, NULL, 0);
+	if (iov_iter_rw(iter) == WRITE)
+		return xfs_vm_do_dio(inode, iocb, iter, offset,
+				     xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
 }
 
 /*
@@ -1874,6 +1941,7 @@ xfs_vm_set_page_dirty(
 	loff_t			end_offset;
 	loff_t			offset;
 	int			newly_dirty;
+	struct mem_cgroup	*memcg;
 
 	if (unlikely(!mapping))
 		return !TestSetPageDirty(page);
@@ -1893,6 +1961,11 @@ xfs_vm_set_page_dirty(
 			offset += 1 << inode->i_blkbits;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
@@ -1903,13 +1976,15 @@ xfs_vm_set_page_dirty(
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		if (page->mapping) {	/* Race with truncate? */
 			WARN_ON_ONCE(!PageUptodate(page));
-			account_page_dirtied(page, mapping);
+			account_page_dirtied(page, mapping, memcg);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
+	mem_cgroup_end_page_stat(memcg);
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return newly_dirty;
 }
 
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index ac644e013..86afd1ac7 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int	xfs_get_blocks(struct inode *inode, sector_t offset,
+		       struct buffer_head *map_bh, int create);
+int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+			      struct buffer_head *map_bh, int create);
+void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 73e75a87a..2bb959ada 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -394,7 +394,6 @@ xfs_attr_inactive(
 {
 	struct xfs_trans	*trans;
 	struct xfs_mount	*mp;
-	int			cancel_flags = 0;
 	int			lock_mode = XFS_ILOCK_SHARED;
 	int			error = 0;
 
@@ -423,7 +422,6 @@ xfs_attr_inactive(
 		goto out_cancel;
 
 	lock_mode = XFS_ILOCK_EXCL;
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT;
 	xfs_ilock(dp, lock_mode);
 
 	if (!XFS_IFORK_Q(dp))
@@ -455,12 +453,12 @@ xfs_attr_inactive(
 	/* Reset the attribute fork - this also destroys the in-core fork */
 	xfs_attr_fork_remove(dp, trans);
 
-	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(trans);
 	xfs_iunlock(dp, lock_mode);
 	return error;
 
 out_cancel:
-	xfs_trans_cancel(trans, cancel_flags);
+	xfs_trans_cancel(trans);
 out_destroy_fork:
 	/* kill the in-core attr fork before we drop the inode lock */
 	if (dp->i_afp)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a52bbd3ab..0f34886cf 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,28 +75,20 @@ xfs_bmap_finish(
 	xfs_efi_log_item_t	*efi;		/* extent free intention */
 	int			error;		/* error return value */
 	xfs_bmap_free_item_t	*free;		/* free extent item */
-	struct xfs_trans_res	tres;		/* new log reservation */
 	xfs_mount_t		*mp;		/* filesystem mount structure */
 	xfs_bmap_free_item_t	*next;		/* next item on free list */
-	xfs_trans_t		*ntp;		/* new transaction pointer */
 
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 	if (flist->xbf_count == 0) {
 		*committed = 0;
 		return 0;
 	}
-	ntp = *tp;
-	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	efi = xfs_trans_get_efi(*tp, flist->xbf_count);
 	for (free = flist->xbf_first; free; free = free->xbfi_next)
-		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+		xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
 			free->xbfi_blockcount);
 
-	tres.tr_logres = ntp->t_log_res;
-	tres.tr_logcount = ntp->t_log_count;
-	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-	ntp = xfs_trans_dup(*tp);
-	error = xfs_trans_commit(*tp, 0);
-	*tp = ntp;
+	error = xfs_trans_roll(tp, NULL);
 	*committed = 1;
 	/*
 	 * We have a new transaction, so we should return committed=1,
@@ -105,19 +97,10 @@ xfs_bmap_finish(
 	if (error)
 		return error;
 
-	/*
-	 * transaction commit worked ok so we can drop the extra ticket
-	 * reference that we gained in xfs_trans_dup()
-	 */
-	xfs_log_ticket_put(ntp->t_ticket);
-
-	error = xfs_trans_reserve(ntp, &tres, 0, 0);
-	if (error)
-		return error;
-	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
 	for (free = flist->xbf_first; free != NULL; free = next) {
 		next = free->xbfi_next;
-		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+		if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
 				free->xbfi_blockcount))) {
 			/*
 			 * The bmap free list will be cleaned up at a
@@ -127,7 +110,7 @@ xfs_bmap_finish(
 			 * happens, since this transaction may not be
 			 * dirty yet.
 			 */
-			mp = ntp->t_mountp;
+			mp = (*tp)->t_mountp;
 			if (!XFS_FORCED_SHUTDOWN(mp))
 				xfs_force_shutdown(mp,
 						   (error == -EFSCORRUPTED) ?
@@ -135,7 +118,7 @@ xfs_bmap_finish(
 						   SHUTDOWN_META_IO_ERROR);
 			return error;
 		}
-		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+		xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
 			free->xbfi_blockcount);
 		xfs_bmap_del_free(flist, NULL, free);
 	}
@@ -878,7 +861,7 @@ xfs_free_eofblocks(
 
 		if (need_iolock) {
 			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-				xfs_trans_cancel(tp, 0);
+				xfs_trans_cancel(tp);
 				return -EAGAIN;
 			}
 		}
@@ -886,7 +869,7 @@ xfs_free_eofblocks(
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 		if (error) {
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp, 0);
+			xfs_trans_cancel(tp);
 			if (need_iolock)
 				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 			return error;
@@ -908,12 +891,9 @@ xfs_free_eofblocks(
 			 * If we get an error at this point we simply don't
 			 * bother truncating the file.
 			 */
-			xfs_trans_cancel(tp,
-					 (XFS_TRANS_RELEASE_LOG_RES |
-					  XFS_TRANS_ABORT));
+			xfs_trans_cancel(tp);
 		} else {
-			error = xfs_trans_commit(tp,
-						XFS_TRANS_RELEASE_LOG_RES);
+			error = xfs_trans_commit(tp);
 			if (!error)
 				xfs_inode_clear_eofblocks_tag(ip);
 		}
@@ -1026,7 +1006,7 @@ xfs_alloc_file_space(
 			 * Free the transaction structure.
 			 */
 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp, 0);
+			xfs_trans_cancel(tp);
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1053,7 +1033,7 @@ xfs_alloc_file_space(
 			goto error0;
 		}
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error) {
 			break;
@@ -1077,7 +1057,7 @@ error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
 
 error1:	/* Just cancel transaction */
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
@@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes(
 			break;
 		ASSERT(imap.br_blockcount >= 1);
 		ASSERT(imap.br_startoff == offset_fsb);
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+		if (imap.br_startblock == HOLESTARTBLOCK ||
+		    imap.br_state == XFS_EXT_UNWRITTEN) {
+			/* skip the entire extent */
+			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+						      imap.br_blockcount) - 1;
+			continue;
+		}
+
 		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
 		if (lastoffset > endoff)
 			lastoffset = endoff;
-		if (imap.br_startblock == HOLESTARTBLOCK)
-			continue;
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+		/* DAX can just zero the backing device directly */
+		if (IS_DAX(VFS_I(ip))) {
+			error = dax_zero_page_range(VFS_I(ip), offset,
+						    lastoffset - offset + 1,
+						    xfs_get_blocks_direct);
+			if (error)
+				return error;
 			continue;
+		}
 
 		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1284,7 @@ xfs_free_file_space(
 			 * Free the transaction structure.
 			 */
 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp, 0);
+			xfs_trans_cancel(tp);
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1315,7 @@ xfs_free_file_space(
 			goto error0;
 		}
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
@@ -1330,7 +1325,7 @@ xfs_free_file_space(
  error0:
 	xfs_bmap_cancel(&free_list);
  error1:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	goto out;
 }
@@ -1462,7 +1457,7 @@ xfs_shift_file_space(
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
 				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
 		if (error) {
-			xfs_trans_cancel(tp, 0);
+			xfs_trans_cancel(tp);
 			break;
 		}
 
@@ -1492,13 +1487,13 @@ xfs_shift_file_space(
 		if (error)
 			goto out;
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_trans_commit(tp);
 	}
 
 	return error;
 
 out:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	return error;
 }
 
@@ -1718,7 +1713,7 @@ xfs_swap_extents(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		goto out_unlock;
 	}
 
@@ -1901,7 +1896,7 @@ xfs_swap_extents(
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
 
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 
 	trace_xfs_swap_extent_after(ip, 0);
 	trace_xfs_swap_extent_after(tip, 1);
@@ -1915,6 +1910,6 @@ out_unlock:
 	goto out;
 
 out_trans_cancel:
-	xfs_trans_cancel(tp, 0);
+	xfs_trans_cancel(tp);
 	goto out;
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1790b00be..a4b7d92e9 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1419,9 +1419,9 @@ xfs_buf_submit_wait(
 	return error;
 }
 
-xfs_caddr_t
+void *
 xfs_buf_offset(
-	xfs_buf_t		*bp,
+	struct xfs_buf		*bp,
 	size_t			offset)
 {
 	struct page		*page;
@@ -1431,7 +1431,7 @@ xfs_buf_offset(
 
 	offset += bp->b_offset;
 	page = bp->b_pages[offset >> PAGE_SHIFT];
-	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+	return page_address(page) + (offset & (PAGE_SIZE-1));
 }
 
 /*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 75ff5d5a7..331c1ccf8 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
 /* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
 
 /* Delayed Write Buffer Routines */
 extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 02c01bbbc..4143dc75d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -568,8 +568,6 @@ xfs_qm_dqread(
 	struct xfs_buf		*bp;
 	struct xfs_trans	*tp = NULL;
 	int			error;
-	int			cancelflags = 0;
-
 
 	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
 
@@ -617,7 +615,6 @@ xfs_qm_dqread(
 					  XFS_QM_DQALLOC_SPACE_RES(mp), 0);
 		if (error)
 			goto error1;
-		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
 	}
 
 	/*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
 		 * allocate (ENOENT).
 		 */
 		trace_xfs_dqread_fail(dqp);
-		cancelflags |= XFS_TRANS_ABORT;
 		goto error1;
 	}
 
@@ -670,7 +666,7 @@ xfs_qm_dqread(
 	xfs_trans_brelse(tp, bp);
 
 	if (tp) {
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_trans_commit(tp);
 		if (error)
 			goto error0;
 	}
@@ -680,7 +676,7 @@ xfs_qm_dqread(
 
 error1:
 	if (tp)
-		xfs_trans_cancel(tp, cancelflags);
+		xfs_trans_cancel(tp);
 error0:
 	xfs_qm_dqdestroy(dqp);
 	*O_dqpp = NULL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 338e50bbf..74d0e5966 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -127,7 +127,7 @@ xfs_error_report(
 	struct xfs_mount	*mp,
 	const char		*filename,
 	int			linenum,
-	inst_t			*ra)
+	void			*ra)
 {
 	if (level <= xfs_error_level) {
 		xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
@@ -146,7 +146,7 @@ xfs_corruption_error(
 	void			*p,
 	const char		*filename,
 	int			linenum,
-	inst_t			*ra)
+	void			*ra)
 {
 	if (level <= xfs_error_level)
 		xfs_hex_dump(p, 64);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c0394ed12..4ed3042a0 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -21,10 +21,10 @@
 struct xfs_mount;
 
 extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
-			const char *filename, int linenum, inst_t *ra);
+			const char *filename, int linenum, void *ra);
 extern void xfs_corruption_error(const char *tag, int level,
 			struct xfs_mount *mp, void *p, const char *filename,
-			int linenum, inst_t *ra);
+			int linenum, void *ra);
 extern void xfs_verifier_error(struct xfs_buf *bp);
 
 #define	XFS_ERROR_REPORT(e, lvl, mp)	\
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index cb7fe64cd..adc8f8fdd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -239,7 +239,7 @@ xfs_efi_init(
 
 	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
-	efip->efi_format.efi_id = (__psint_t)(void*)efip;
+	efip->efi_format.efi_id = (uintptr_t)(void *)efip;
 	atomic_set(&efip->efi_next_extent, 0);
 	atomic_set(&efip->efi_refcount, 2);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3b7591224..db4acc1c3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
+#include <linux/backing-dev.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -79,14 +80,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *	xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *	xfs_iozero clears the specified range of buffer supplied,
- *	and marks all the affected blocks as valid and modified.  If
- *	an affected block is not allocated, it will be allocated.  If
- *	an affected block is not completely overwritten, and is not
- *	valid before the operation, it will be read from disk before
- *	being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -96,7 +98,8 @@ xfs_iozero(
 {
 	struct page		*page;
 	struct address_space	*mapping;
-	int			status;
+	int			status = 0;
+
 
 	mapping = VFS_I(ip)->i_mapping;
 	do {
@@ -108,20 +111,27 @@ xfs_iozero(
 		if (bytes > count)
 			bytes = count;
 
-		status = pagecache_write_begin(NULL, mapping, pos, bytes,
-					AOP_FLAG_UNINTERRUPTIBLE,
-					&page, &fsdata);
-		if (status)
-			break;
+		if (IS_DAX(VFS_I(ip))) {
+			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+						     xfs_get_blocks_direct);
+			if (status)
+				break;
+		} else {
+			status = pagecache_write_begin(NULL, mapping, pos, bytes,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+			if (status)
+				break;
 
-		zero_user(page, offset, bytes);
+			zero_user(page, offset, bytes);
 
-		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-					page, fsdata);
-		WARN_ON(status <= 0); /* can't return less than zero! */
+			status = pagecache_write_end(NULL, mapping, pos, bytes,
+						bytes, page, fsdata);
+			WARN_ON(status <= 0); /* can't return less than zero! */
+			status = 0;
+		}
 		pos += bytes;
 		count -= bytes;
-		status = 0;
 	} while (count);
 
 	return status;
@@ -138,7 +148,7 @@ xfs_update_prealloc_flags(
 	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
 	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -160,7 +170,7 @@ xfs_update_prealloc_flags(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	if (flags & XFS_PREALLOC_SYNC)
 		xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp);
 }
 
 /*
@@ -284,7 +294,7 @@ xfs_file_read_iter(
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +388,11 @@ xfs_file_splice_read(
 
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	/* for dax, we need to avoid the page cache */
+	if (IS_DAX(VFS_I(ip)))
+		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+	else
+		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -563,6 +577,13 @@ restart:
 	if (error)
 		return error;
 
+	/* For changing security info in file_remove_privs() we need i_mutex */
+	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+		xfs_rw_iunlock(ip, *iolock);
+		*iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, *iolock);
+		goto restart;
+	}
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -623,7 +644,9 @@ restart:
 	 * setgid bits if the process is not being run by root.  This keeps
 	 * people from modifying setuid and setgid binaries.
 	 */
-	return file_remove_suid(file);
+	if (!IS_NOSEC(inode))
+		return file_remove_privs(file);
+	return 0;
 }
 
 /*
@@ -672,7 +695,7 @@ xfs_file_dio_aio_write(
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if ((pos | count) & target->bt_logical_sectormask)
+	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -758,8 +781,11 @@ xfs_file_dio_aio_write(
 out:
 	xfs_rw_iunlock(ip, iolock);
 
-	/* No fallback to buffered IO on errors for XFS. */
-	ASSERT(ret < 0 || ret == count);
+	/*
+	 * No fallback to buffered IO on errors for XFS. DAX can result in
+	 * partial writes, but direct IO will either complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
 	return ret;
 }
 
@@ -842,7 +868,7 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1063,17 +1089,6 @@ xfs_file_readdir(
 	return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
-{
-	vma->vm_ops = &xfs_file_vm_ops;
-
-	file_accessed(filp);
-	return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1469,92 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
 STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
+	struct inode		*inode = file_inode(vma->vm_file);
+	int			ret;
 
-	trace_xfs_filemap_fault(ip);
+	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = filemap_fault(vma, vmf);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-	return error;
+	if (IS_DAX(inode)) {
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+				    xfs_end_io_dax_write);
+	} else {
+		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite_return(ret);
+	}
+
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
+
+	return ret;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
+	struct inode		*inode = file_inode(vma->vm_file);
+	int			ret;
 
-	trace_xfs_filemap_page_mkwrite(ip);
+	trace_xfs_filemap_fault(XFS_I(inode));
 
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	/* DAX can shortcut the normal fault path on write faults! */
+	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
+		return xfs_filemap_page_mkwrite(vma, vmf);
 
-	return error;
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	if (IS_DAX(inode)) {
+		/*
+		 * we do not want to trigger unwritten extent conversion on read
+		 * faults - that is unnecessary overhead and would also require
+		 * changes to xfs_get_blocks_direct() to map unwritten extent
+		 * ioend for conversion on read-only mappings.
+		 */
+		ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+	} else
+		ret = filemap_fault(vma, vmf);
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+	return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	file_accessed(filp);
+	vma->vm_ops = &xfs_file_vm_ops;
+	if (IS_DAX(file_inode(filp)))
+		vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
 }
 
 const struct file_operations xfs_file_operations = {
@@ -1526,9 +1585,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
 	.fsync		= xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= xfs_filemap_fault,
-	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_filemap_page_mkwrite,
-};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index da82f1cb4..c4c130f9b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -196,7 +196,8 @@ xfs_filestream_pick_ag(
 			goto next_ag;
 		}
 
-		longest = xfs_alloc_longest_free_extent(mp, pag);
+		longest = xfs_alloc_longest_free_extent(mp, pag,
+					xfs_alloc_min_freelist(mp, pag));
 		if (((minlen && longest >= minlen) ||
 		     (!minlen && pag->pagf_freeblks >= minfree)) &&
 		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cb7e8a29d..9b3438a76 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
 			(xfs_sb_version_hasftype(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
 			(xfs_sb_version_hasfinobt(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+				XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
+			(xfs_sb_version_hassparseinodes(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 				mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
 				  XFS_GROWFS_SPACE_RES(mp), 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 	if (error)
 		return error;
 
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
 	return saved_error ? saved_error : error;
 
  error0:
-	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 539a85fdd..3da9f4da4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -905,7 +905,6 @@ xfs_dir_ialloc(
 
 {
 	xfs_trans_t	*tp;
-	xfs_trans_t	*ntp;
 	xfs_inode_t	*ip;
 	xfs_buf_t	*ialloc_context = NULL;
 	int		code;
@@ -954,8 +953,6 @@ xfs_dir_ialloc(
 	 * to succeed the second time.
 	 */
 	if (ialloc_context) {
-		struct xfs_trans_res tres;
-
 		/*
 		 * Normally, xfs_trans_commit releases all the locks.
 		 * We call bhold to hang on to the ialloc_context across
@@ -964,12 +961,6 @@ xfs_dir_ialloc(
 		 * allocation group.
 		 */
 		xfs_trans_bhold(tp, ialloc_context);
-		/*
-		 * Save the log reservation so we can use
-		 * them in the next transaction.
-		 */
-		tres.tr_logres = xfs_trans_get_log_res(tp);
-		tres.tr_logcount = xfs_trans_get_log_count(tp);
 
 		/*
 		 * We want the quota changes to be associated with the next
@@ -985,35 +976,9 @@ xfs_dir_ialloc(
 			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
 		}
 
-		ntp = xfs_trans_dup(tp);
-		code = xfs_trans_commit(tp, 0);
-		tp = ntp;
-		if (committed != NULL) {
+		code = xfs_trans_roll(&tp, 0);
+		if (committed != NULL)
 			*committed = 1;
-		}
-		/*
-		 * If we get an error during the commit processing,
-		 * release the buffer that is still held and return
-		 * to the caller.
-		 */
-		if (code) {
-			xfs_buf_relse(ialloc_context);
-			if (dqinfo) {
-				tp->t_dqinfo = dqinfo;
-				xfs_trans_free_dqinfo(tp);
-			}
-			*tpp = ntp;
-			*ipp = NULL;
-			return code;
-		}
-
-		/*
-		 * transaction commit worked ok so we can drop the extra ticket
-		 * reference that we gained in xfs_trans_dup()
-		 */
-		xfs_log_ticket_put(tp->t_ticket);
-		tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-		code = xfs_trans_reserve(tp, &tres, 0, 0);
 
 		/*
 		 * Re-attach the quota info that we detached from prev trx.
@@ -1025,7 +990,7 @@ xfs_dir_ialloc(
 
 		if (code) {
 			xfs_buf_relse(ialloc_context);
-			*tpp = ntp;
+			*tpp = tp;
 			*ipp = NULL;
 			return code;
 		}
@@ -1127,7 +1092,6 @@ xfs_create(
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	bool                    unlock_dp_on_error = false;
-	uint			cancel_flags;
 	int			committed;
 	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
@@ -1164,8 +1128,6 @@ xfs_create(
 		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	}
 
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
 	/*
 	 * Initially assume that the file does not exist and
 	 * reserve the resources for that case.  If that is not
@@ -1183,10 +1145,9 @@ xfs_create(
 		resblks = 0;
 		error = xfs_trans_reserve(tp, tres, 0, 0);
 	}
-	if (error) {
-		cancel_flags = 0;
+	if (error)
 		goto out_trans_cancel;
-	}
+
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
@@ -1217,7 +1178,7 @@ xfs_create(
 	if (error) {
 		if (error == -ENOSPC)
 			goto out_trans_cancel;
-		goto out_trans_abort;
+		goto out_trans_cancel;
 	}
 
 	/*
@@ -1235,7 +1196,7 @@ xfs_create(
 					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != -ENOSPC);
-		goto out_trans_abort;
+		goto out_trans_cancel;
 	}
 	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1269,7 +1230,7 @@ xfs_create(
 	if (error)
 		goto out_bmap_cancel;
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_release_inode;
 
@@ -1282,10 +1243,8 @@ xfs_create(
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
- out_trans_abort:
-	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
  out_release_inode:
 	/*
 	 * Wait until after the current transaction is aborted to finish the
@@ -1317,7 +1276,6 @@ xfs_create_tmpfile(
 	struct xfs_inode	*ip = NULL;
 	struct xfs_trans	*tp = NULL;
 	int			error;
-	uint			cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	prid_t                  prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
@@ -1350,10 +1308,8 @@ xfs_create_tmpfile(
 		resblks = 0;
 		error = xfs_trans_reserve(tp, tres, 0, 0);
 	}
-	if (error) {
-		cancel_flags = 0;
+	if (error)
 		goto out_trans_cancel;
-	}
 
 	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
 						pdqp, resblks, 1, 0);
@@ -1365,7 +1321,7 @@ xfs_create_tmpfile(
 	if (error) {
 		if (error == -ENOSPC)
 			goto out_trans_cancel;
-		goto out_trans_abort;
+		goto out_trans_cancel;
 	}
 
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -1381,9 +1337,9 @@ xfs_create_tmpfile(
 	ip->i_d.di_nlink--;
 	error = xfs_iunlink(tp, ip);
 	if (error)
-		goto out_trans_abort;
+		goto out_trans_cancel;
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_release_inode;
 
@@ -1394,10 +1350,8 @@ xfs_create_tmpfile(
 	*ipp = ip;
 	return 0;
 
- out_trans_abort:
-	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
  out_release_inode:
 	/*
 	 * Wait until after the current transaction is aborted to finish the
@@ -1427,7 +1381,6 @@ xfs_link(
 	int			error;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
-	int			cancel_flags;
 	int			committed;
 	int			resblks;
 
@@ -1447,17 +1400,14 @@ xfs_link(
 		goto std_return;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
 	if (error == -ENOSPC) {
 		resblks = 0;
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
 	}
-	if (error) {
-		cancel_flags = 0;
+	if (error)
 		goto error_return;
-	}
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
@@ -1486,19 +1436,19 @@ xfs_link(
 	if (sip->i_d.di_nlink == 0) {
 		error = xfs_iunlink_remove(tp, sip);
 		if (error)
-			goto abort_return;
+			goto error_return;
 	}
 
 	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error)
-		goto abort_return;
+		goto error_return;
 	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
 	if (error)
-		goto abort_return;
+		goto error_return;
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -1512,15 +1462,13 @@ xfs_link(
 	error = xfs_bmap_finish (&tp, &free_list, &committed);
 	if (error) {
 		xfs_bmap_cancel(&free_list);
-		goto abort_return;
+		goto error_return;
 	}
 
-	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	return xfs_trans_commit(tp);
 
- abort_return:
-	cancel_flags |= XFS_TRANS_ABORT;
  error_return:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
  std_return:
 	return error;
 }
@@ -1555,7 +1503,6 @@ xfs_itruncate_extents(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp = *tpp;
-	struct xfs_trans	*ntp;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	xfs_fileoff_t		first_unmap_block;
@@ -1613,29 +1560,7 @@ xfs_itruncate_extents(
 		if (error)
 			goto out_bmap_cancel;
 
-		if (committed) {
-			/*
-			 * Mark the inode dirty so it will be logged and
-			 * moved forward in the log as part of every commit.
-			 */
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		}
-
-		ntp = xfs_trans_dup(tp);
-		error = xfs_trans_commit(tp, 0);
-		tp = ntp;
-
-		xfs_trans_ijoin(tp, ip, 0);
-
-		if (error)
-			goto out;
-
-		/*
-		 * Transaction commit worked ok so we can drop the extra ticket
-		 * reference that we gained in xfs_trans_dup()
-		 */
-		xfs_log_ticket_put(tp->t_ticket);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+		error = xfs_trans_roll(&tp, ip);
 		if (error)
 			goto out;
 	}
@@ -1756,7 +1681,7 @@ xfs_inactive_truncate(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -1777,7 +1702,7 @@ xfs_inactive_truncate(
 
 	ASSERT(ip->i_d.di_nextents == 0);
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto error_unlock;
 
@@ -1785,7 +1710,7 @@ xfs_inactive_truncate(
 	return 0;
 
 error_trans_cancel:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 error_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
@@ -1835,7 +1760,7 @@ xfs_inactive_ifree(
 		} else {
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
 		}
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -1855,7 +1780,7 @@ xfs_inactive_ifree(
 				__func__, error);
 			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+		xfs_trans_cancel(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		return error;
 	}
@@ -1874,7 +1799,7 @@ xfs_inactive_ifree(
 	if (error)
 		xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
 			__func__, error);
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
 			__func__, error);
@@ -2235,28 +2160,42 @@ xfs_iunlink_remove(
  */
 STATIC int
 xfs_ifree_cluster(
-	xfs_inode_t	*free_ip,
-	xfs_trans_t	*tp,
-	xfs_ino_t	inum)
+	xfs_inode_t		*free_ip,
+	xfs_trans_t		*tp,
+	struct xfs_icluster	*xic)
 {
 	xfs_mount_t		*mp = free_ip->i_mount;
 	int			blks_per_cluster;
 	int			inodes_per_cluster;
 	int			nbufs;
 	int			i, j;
+	int			ioffset;
 	xfs_daddr_t		blkno;
 	xfs_buf_t		*bp;
 	xfs_inode_t		*ip;
 	xfs_inode_log_item_t	*iip;
 	xfs_log_item_t		*lip;
 	struct xfs_perag	*pag;
+	xfs_ino_t		inum;
 
+	inum = xic->first_ino;
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 	blks_per_cluster = xfs_icluster_size_fsb(mp);
 	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
 	nbufs = mp->m_ialloc_blks / blks_per_cluster;
 
 	for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+		/*
+		 * The allocation bitmap tells us which inodes of the chunk were
+		 * physically allocated. Skip the cluster if an inode falls into
+		 * a sparse region.
+		 */
+		ioffset = inum - xic->first_ino;
+		if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+			ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
+			continue;
+		}
+
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
@@ -2414,8 +2353,7 @@ xfs_ifree(
 	xfs_bmap_free_t	*flist)
 {
 	int			error;
-	int			delete;
-	xfs_ino_t		first_ino;
+	struct xfs_icluster	xic = { 0 };
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(ip->i_d.di_nlink == 0);
@@ -2431,7 +2369,7 @@ xfs_ifree(
 	if (error)
 		return error;
 
-	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+	error = xfs_difree(tp, ip->i_ino, flist, &xic);
 	if (error)
 		return error;
 
@@ -2448,8 +2386,8 @@ xfs_ifree(
 	ip->i_d.di_gen++;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	if (delete)
-		error = xfs_ifree_cluster(ip, tp, first_ino);
+	if (xic.deleted)
+		error = xfs_ifree_cluster(ip, tp, &xic);
 
 	return error;
 }
@@ -2536,7 +2474,6 @@ xfs_remove(
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
-	int			cancel_flags;
 	int			committed;
 	uint			resblks;
 
@@ -2557,7 +2494,6 @@ xfs_remove(
 		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
 	else
 		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 
 	/*
 	 * We try to get the real space reservation first,
@@ -2576,7 +2512,6 @@ xfs_remove(
 	}
 	if (error) {
 		ASSERT(error != -ENOSPC);
-		cancel_flags = 0;
 		goto out_trans_cancel;
 	}
 
@@ -2588,7 +2523,6 @@ xfs_remove(
 	/*
 	 * If we're removing a directory perform some additional validation.
 	 */
-	cancel_flags |= XFS_TRANS_ABORT;
 	if (is_dir) {
 		ASSERT(ip->i_d.di_nlink >= 2);
 		if (ip->i_d.di_nlink != 2) {
@@ -2644,7 +2578,7 @@ xfs_remove(
 	if (error)
 		goto out_bmap_cancel;
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto std_return;
 
@@ -2656,7 +2590,7 @@ xfs_remove(
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
  out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
  std_return:
 	return error;
 }
@@ -2730,11 +2664,11 @@ xfs_finish_rename(
 	error = xfs_bmap_finish(&tp, free_list, &committed);
 	if (error) {
 		xfs_bmap_cancel(free_list);
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
-	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	return xfs_trans_commit(tp);
 }
 
 /*
@@ -2855,7 +2789,7 @@ xfs_cross_rename(
 
 out_trans_abort:
 	xfs_bmap_cancel(free_list);
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	return error;
 }
 
@@ -2915,7 +2849,6 @@ xfs_rename(
 	int			num_inodes = __XFS_SORT_INODES;
 	bool			new_parent = (src_dp != target_dp);
 	bool			src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
-	int			cancel_flags = 0;
 	int			spaceres;
 	int			error;
 
@@ -2951,7 +2884,6 @@ xfs_rename(
 	}
 	if (error)
 		goto out_trans_cancel;
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 
 	/*
 	 * Attach the dquots to the inodes
@@ -3022,10 +2954,8 @@ xfs_rename(
 		error = xfs_dir_createname(tp, target_dp, target_name,
 						src_ip->i_ino, &first_block,
 						&free_list, spaceres);
-		if (error == -ENOSPC)
-			goto out_bmap_cancel;
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 
 		xfs_trans_ichgtime(tp, target_dp,
 					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3033,7 +2963,7 @@ xfs_rename(
 		if (new_parent && src_is_directory) {
 			error = xfs_bumplink(tp, target_dp);
 			if (error)
-				goto out_trans_abort;
+				goto out_bmap_cancel;
 		}
 	} else { /* target_ip != NULL */
 		/*
@@ -3065,7 +2995,7 @@ xfs_rename(
 					src_ip->i_ino,
 					&first_block, &free_list, spaceres);
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 
 		xfs_trans_ichgtime(tp, target_dp,
 					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3076,7 +3006,7 @@ xfs_rename(
 		 */
 		error = xfs_droplink(tp, target_ip);
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 
 		if (src_is_directory) {
 			/*
@@ -3084,7 +3014,7 @@ xfs_rename(
 			 */
 			error = xfs_droplink(tp, target_ip);
 			if (error)
-				goto out_trans_abort;
+				goto out_bmap_cancel;
 		}
 	} /* target_ip != NULL */
 
@@ -3101,7 +3031,7 @@ xfs_rename(
 					&first_block, &free_list, spaceres);
 		ASSERT(error != -EEXIST);
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 	}
 
 	/*
@@ -3127,7 +3057,7 @@ xfs_rename(
 		 */
 		error = xfs_droplink(tp, src_dp);
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 	}
 
 	/*
@@ -3142,7 +3072,7 @@ xfs_rename(
 		error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
 					   &first_block, &free_list, spaceres);
 	if (error)
-		goto out_trans_abort;
+		goto out_bmap_cancel;
 
 	/*
 	 * For whiteouts, we need to bump the link count on the whiteout inode.
@@ -3156,10 +3086,10 @@ xfs_rename(
 		ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
 		error = xfs_bumplink(tp, wip);
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 		error = xfs_iunlink_remove(tp, wip);
 		if (error)
-			goto out_trans_abort;
+			goto out_bmap_cancel;
 		xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
 
 		/*
@@ -3180,12 +3110,10 @@ xfs_rename(
 		IRELE(wip);
 	return error;
 
-out_trans_abort:
-	cancel_flags |= XFS_TRANS_ABORT;
 out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
 	if (wip)
 		IRELE(wip);
 	return error;
@@ -3464,7 +3392,7 @@ xfs_iflush_int(
 	ASSERT(ip->i_d.di_version > 1);
 
 	/* set *dip = inode's place in the buffer */
-	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
 	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 87f67c6b6..ea7d85af5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -336,7 +336,7 @@ xfs_set_dmattrs(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +346,7 @@ xfs_set_dmattrs(
 	ip->i_d.di_dmstate  = state;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 
 	return error;
 }
@@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans(
 	return tp;
 
 out_cancel:
-	xfs_trans_cancel(tp, 0);
+	xfs_trans_cancel(tp);
 	return ERR_PTR(error);
 }
 
@@ -1253,7 +1253,7 @@ xfs_ioctl_setattr(
 	else
 		ip->i_d.di_extsize = 0;
 
-	code = xfs_trans_commit(tp, 0);
+	code = xfs_trans_commit(tp);
 
 	/*
 	 * Release any dquot(s) the inode had kept before chown.
@@ -1265,7 +1265,7 @@ xfs_ioctl_setattr(
 	return code;
 
 error_trans_cancel:
-	xfs_trans_cancel(tp, 0);
+	xfs_trans_cancel(tp);
 error_free_dquots:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(pdqp);
@@ -1338,11 +1338,11 @@ xfs_ioc_setxflags(
 
 	error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		goto out_drop_write;
 	}
 
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 out_drop_write:
 	mnt_drop_write_file(filp);
 	return error;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 38e633bad..1f8603317 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -183,7 +183,7 @@ xfs_iomap_write_direct(
 	 * Check for running out of space, note: need lock to return
 	 */
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -213,7 +213,7 @@ xfs_iomap_write_direct(
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
 		goto out_bmap_cancel;
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_unlock;
 
@@ -236,7 +236,7 @@ out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
 out_trans_cancel:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	goto out_unlock;
 }
 
@@ -690,7 +690,7 @@ xfs_iomap_write_allocate(
 			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
 						  nres, 0);
 			if (error) {
-				xfs_trans_cancel(tp, 0);
+				xfs_trans_cancel(tp);
 				return error;
 			}
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -760,7 +760,7 @@ xfs_iomap_write_allocate(
 			if (error)
 				goto trans_cancel;
 
-			error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+			error = xfs_trans_commit(tp);
 			if (error)
 				goto error0;
 
@@ -791,7 +791,7 @@ xfs_iomap_write_allocate(
 
 trans_cancel:
 	xfs_bmap_cancel(&free_list);
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 error0:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
@@ -853,7 +853,7 @@ xfs_iomap_write_unwritten(
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
 					  resblks, 0);
 		if (error) {
-			xfs_trans_cancel(tp, 0);
+			xfs_trans_cancel(tp);
 			return error;
 		}
 
@@ -890,7 +890,7 @@ xfs_iomap_write_unwritten(
 		if (error)
 			goto error_on_bmapi_transaction;
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
 			return error;
@@ -914,7 +914,7 @@ xfs_iomap_write_unwritten(
 
 error_on_bmapi_transaction:
 	xfs_bmap_cancel(&free_list);
-	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+	xfs_trans_cancel(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f4cd7204e..766b23f86 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -41,7 +41,6 @@
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
-#include <linux/namei.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
 #include <linux/fiemap.h>
@@ -414,10 +413,10 @@ xfs_vn_rename(
  * we need to be very careful about how much stack we use.
  * uio is kmalloced for this reason...
  */
-STATIC void *
+STATIC const char *
 xfs_vn_follow_link(
 	struct dentry		*dentry,
-	struct nameidata	*nd)
+	void			**cookie)
 {
 	char			*link;
 	int			error = -ENOMEM;
@@ -430,14 +429,12 @@ xfs_vn_follow_link(
 	if (unlikely(error))
 		goto out_kfree;
 
-	nd_set_link(nd, link);
-	return NULL;
+	return *cookie = link;
 
  out_kfree:
 	kfree(link);
  out_err:
-	nd_set_link(nd, ERR_PTR(error));
-	return NULL;
+	return ERR_PTR(error);
 }
 
 STATIC int
@@ -702,7 +699,7 @@ xfs_setattr_nonsize(
 
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -733,7 +730,7 @@ xfs_setattr_nonsize(
 	return 0;
 
 out_trans_cancel:
-	xfs_trans_cancel(tp, 0);
+	xfs_trans_cancel(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_dqrele:
 	xfs_qm_dqrele(udqp);
@@ -755,7 +752,6 @@ xfs_setattr_size(
 	struct xfs_trans	*tp;
 	int			error;
 	uint			lock_flags = 0;
-	uint			commit_flags = 0;
 	bool			did_zeroing = false;
 
 	trace_xfs_setattr(ip);
@@ -851,7 +847,11 @@ xfs_setattr_size(
 	 * to hope that the caller sees ENOMEM and retries the truncate
 	 * operation.
 	 */
-	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+	if (IS_DAX(inode))
+		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+	else
+		error = block_truncate_page(inode->i_mapping, newsize,
+					    xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
@@ -861,7 +861,6 @@ xfs_setattr_size(
 	if (error)
 		goto out_trans_cancel;
 
-	commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 	lock_flags |= XFS_ILOCK_EXCL;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
@@ -901,7 +900,7 @@ xfs_setattr_size(
 	if (newsize <= oldsize) {
 		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
 		if (error)
-			goto out_trans_abort;
+			goto out_trans_cancel;
 
 		/*
 		 * Truncated "down", so we're removing references to old data
@@ -928,16 +927,14 @@ xfs_setattr_size(
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(tp);
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 out_unlock:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 	return error;
 
-out_trans_abort:
-	commit_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
-	xfs_trans_cancel(tp, commit_flags);
+	xfs_trans_cancel(tp);
 	goto out_unlock;
 }
 
@@ -984,7 +981,7 @@ xfs_vn_update_time(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -1006,7 +1003,7 @@ xfs_vn_update_time(
 	}
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
-	return xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp);
 }
 
 #define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1191,22 +1188,22 @@ xfs_diflags_to_iflags(
 	struct inode		*inode,
 	struct xfs_inode	*ip)
 {
-	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+	uint16_t		flags = ip->i_d.di_flags;
+
+	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+			    S_NOATIME | S_DAX);
+
+	if (flags & XFS_DIFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+	if (flags & XFS_DIFLAG_APPEND)
 		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+	if (flags & XFS_DIFLAG_SYNC)
 		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
+	/* XXX: Also needs an on-disk per inode flag! */
+	if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+		inode->i_flags |= S_DAX;
 }
 
 /*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 80429891d..f41b0c3fd 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
 		}
 
 		irec->ir_free |= xfs_inobt_maskn(0, idx);
-		*icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+		*icount = irec->ir_count - irec->ir_freecount;
 	}
 
 	return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
 				goto del_cursor;
 			if (icount) {
 				irbp->ir_startino = r.ir_startino;
+				irbp->ir_holemask = r.ir_holemask;
+				irbp->ir_count = r.ir_count;
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
 				irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
 			 * If this chunk has any allocated inodes, save it.
 			 * Also start read-ahead now for this chunk.
 			 */
-			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+			if (r.ir_freecount < r.ir_count) {
 				xfs_bulkstat_ichunk_ra(mp, agno, &r);
 				irbp->ir_startino = r.ir_startino;
+				irbp->ir_holemask = r.ir_holemask;
+				irbp->ir_count = r.ir_count;
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
 				irbp++;
-				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+				icount += r.ir_count - r.ir_freecount;
 			}
 			error = xfs_btree_increment(cur, 0, &stat);
 			if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
 		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
 		buffer[bufidx].xi_startino =
 			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
-		buffer[bufidx].xi_alloccount =
-			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
 		buffer[bufidx].xi_allocmask = ~r.ir_free;
 		if (++bufidx == bcount) {
 			long	written;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 7c7842c85..85f883dd6 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,26 +32,12 @@ typedef unsigned int		__uint32_t;
 typedef signed long long int	__int64_t;
 typedef unsigned long long int	__uint64_t;
 
-typedef __uint32_t		inst_t;		/* an instruction */
-
 typedef __s64			xfs_off_t;	/* <file offset> type */
 typedef unsigned long long	xfs_ino_t;	/* <inode> type */
 typedef __s64			xfs_daddr_t;	/* <disk address> type */
-typedef char *			xfs_caddr_t;	/* <core address> type */
 typedef __u32			xfs_dev_t;
 typedef __u32			xfs_nlink_t;
 
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
 #include "xfs_types.h"
 
 #include "kmem.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bcc7cfabb..08d4fe46f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -109,7 +109,7 @@ xlog_ungrant_log_space(
 STATIC void
 xlog_verify_dest_ptr(
 	struct xlog		*log,
-	char			*ptr);
+	void			*ptr);
 STATIC void
 xlog_verify_grant_tail(
 	struct xlog *log);
@@ -513,7 +513,7 @@ xfs_log_done(
 	struct xfs_mount	*mp,
 	struct xlog_ticket	*ticket,
 	struct xlog_in_core	**iclog,
-	uint			flags)
+	bool			regrant)
 {
 	struct xlog		*log = mp->m_log;
 	xfs_lsn_t		lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
 	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
 	     (xlog_commit_record(log, ticket, iclog, &lsn)))) {
 		lsn = (xfs_lsn_t) -1;
-		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
-			flags |= XFS_LOG_REL_PERM_RESERV;
-		}
+		regrant = false;
 	}
 
 
-	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
-	    (flags & XFS_LOG_REL_PERM_RESERV)) {
+	if (!regrant) {
 		trace_xfs_log_done_nonperm(log, ticket);
 
 		/*
@@ -541,7 +538,6 @@ xfs_log_done(
 		 * request has been made to release a permanent reservation.
 		 */
 		xlog_ungrant_log_space(log, ticket);
-		xfs_log_ticket_put(ticket);
 	} else {
 		trace_xfs_log_done_perm(log, ticket);
 
@@ -553,6 +549,7 @@ xfs_log_done(
 		ticket->t_flags |= XLOG_TIC_INITED;
 	}
 
+	xfs_log_ticket_put(ticket);
 	return lsn;
 }
 
@@ -1447,7 +1444,7 @@ xlog_alloc_log(
 		iclog->ic_bp = bp;
 		iclog->ic_data = bp->b_addr;
 #ifdef DEBUG
-		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+		log->l_iclog_bak[i] = &iclog->ic_header;
 #endif
 		head = &iclog->ic_header;
 		memset(head, 0, sizeof(xlog_rec_header_t));
@@ -1602,7 +1599,7 @@ xlog_pack_data(
 	int			i, j, k;
 	int			size = iclog->ic_offset + roundoff;
 	__be32			cycle_lsn;
-	xfs_caddr_t		dp;
+	char			*dp;
 
 	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
 
@@ -3664,7 +3661,7 @@ xlog_ticket_alloc(
 void
 xlog_verify_dest_ptr(
 	struct xlog	*log,
-	char		*ptr)
+	void		*ptr)
 {
 	int i;
 	int good_ptr = 0;
@@ -3767,9 +3764,8 @@ xlog_verify_iclog(
 	xlog_op_header_t	*ophead;
 	xlog_in_core_t		*icptr;
 	xlog_in_core_2_t	*xhdr;
-	xfs_caddr_t		ptr;
-	xfs_caddr_t		base_ptr;
-	__psint_t		field_offset;
+	void			*base_ptr, *ptr, *p;
+	ptrdiff_t		field_offset;
 	__uint8_t		clientid;
 	int			len, i, j, k, op_len;
 	int			idx;
@@ -3788,9 +3784,9 @@ xlog_verify_iclog(
 	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 		xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
 
-	ptr = (xfs_caddr_t) &iclog->ic_header;
-	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
-	     ptr += BBSIZE) {
+	base_ptr = ptr = &iclog->ic_header;
+	p = &iclog->ic_header;
+	for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
 		if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 			xfs_emerg(log->l_mp, "%s: unexpected magic num",
 				__func__);
@@ -3798,20 +3794,19 @@ xlog_verify_iclog(
 
 	/* check fields */
 	len = be32_to_cpu(iclog->ic_header.h_num_logops);
-	ptr = iclog->ic_datap;
-	base_ptr = ptr;
-	ophead = (xlog_op_header_t *)ptr;
+	base_ptr = ptr = iclog->ic_datap;
+	ophead = ptr;
 	xhdr = iclog->ic_data;
 	for (i = 0; i < len; i++) {
-		ophead = (xlog_op_header_t *)ptr;
+		ophead = ptr;
 
 		/* clientid is only 1 byte */
-		field_offset = (__psint_t)
-			       ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+		p = &ophead->oh_clientid;
+		field_offset = p - base_ptr;
 		if (!syncing || (field_offset & 0x1ff)) {
 			clientid = ophead->oh_clientid;
 		} else {
-			idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+			idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
 			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
 				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
 				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,13 +3824,13 @@ xlog_verify_iclog(
 				(unsigned long)field_offset);
 
 		/* check length */
-		field_offset = (__psint_t)
-			       ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+		p = &ophead->oh_len;
+		field_offset = p - base_ptr;
 		if (!syncing || (field_offset & 0x1ff)) {
 			op_len = be32_to_cpu(ophead->oh_len);
 		} else {
-			idx = BTOBBT((__psint_t)&ophead->oh_len -
-				    (__psint_t)iclog->ic_datap);
+			idx = BTOBBT((uintptr_t)&ophead->oh_len -
+				    (uintptr_t)iclog->ic_datap);
 			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
 				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
 				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e0deb95..fa27aaec7 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -111,15 +111,6 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define	XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
 
 /*
- * Macros, structures, prototypes for interface to the log manager.
- */
-
-/*
- * Flags to xfs_log_done()
- */
-#define XFS_LOG_REL_PERM_RESERV	0x1
-
-/*
  * Flags to xfs_log_force()
  *
  *	XFS_LOG_SYNC:	Synchronous force in-core log to disk
@@ -138,7 +129,7 @@ struct xfs_log_callback;
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       struct xlog_ticket *ticket,
 		       struct xlog_in_core **iclog,
-		       uint		flags);
+		       bool regrant);
 int	  _xfs_log_force(struct xfs_mount *mp,
 			 uint		flags,
 			 int		*log_forced);
@@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
 void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
-				xfs_lsn_t *commit_lsn, int flags);
+				xfs_lsn_t *commit_lsn, bool regrant);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void	xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 45cc0ce18..abc2ccbff 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -624,7 +624,7 @@ restart:
 	spin_unlock(&cil->xc_push_lock);
 
 	/* xfs_log_done always frees the ticket on error. */
-	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
 	if (commit_lsn == -1)
 		goto out_abort;
 
@@ -773,14 +773,10 @@ xfs_log_commit_cil(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
 	xfs_lsn_t		*commit_lsn,
-	int			flags)
+	bool			regrant)
 {
 	struct xlog		*log = mp->m_log;
 	struct xfs_cil		*cil = log->l_cilp;
-	int			log_flags = 0;
-
-	if (flags & XFS_TRANS_RELEASE_LOG_RES)
-		log_flags = XFS_LOG_REL_PERM_RESERV;
 
 	/* lock out background commit */
 	down_read(&cil->xc_ctx_lock);
@@ -795,7 +791,7 @@ xfs_log_commit_cil(
 	if (commit_lsn)
 		*commit_lsn = tp->t_commit_lsn;
 
-	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_log_done(mp, tp->t_ticket, NULL, regrant);
 	xfs_trans_unreserve_and_mod_sb(tp);
 
 	/*
@@ -809,7 +805,7 @@ xfs_log_commit_cil(
 	 * the log items. This affects (at least) processing of stale buffers,
 	 * inodes and EFIs.
 	 */
-	xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+	xfs_trans_free_items(tp, tp->t_commit_lsn, false);
 
 	xlog_cil_push_background(log);
 
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index db7cbdeb2..1c87c8abf 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -409,7 +409,7 @@ struct xlog {
 
 	/* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
-	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
+	void			*l_iclog_bak[XLOG_MAX_ICLOGS];
 #endif
 
 };
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a5d03396d..480ebba84 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_put_bp(
  * Return the address of the start of the given block number's data
  * in a log buffer.  The buffer covers a log sector-aligned region.
  */
-STATIC xfs_caddr_t
+STATIC char *
 xlog_align(
 	struct xlog	*log,
 	xfs_daddr_t	blk_no,
@@ -203,7 +203,7 @@ xlog_bread(
 	xfs_daddr_t	blk_no,
 	int		nbblks,
 	struct xfs_buf	*bp,
-	xfs_caddr_t	*offset)
+	char		**offset)
 {
 	int		error;
 
@@ -225,9 +225,9 @@ xlog_bread_offset(
 	xfs_daddr_t	blk_no,		/* block to read from */
 	int		nbblks,		/* blocks to read */
 	struct xfs_buf	*bp,
-	xfs_caddr_t	offset)
+	char		*offset)
 {
-	xfs_caddr_t	orig_offset = bp->b_addr;
+	char		*orig_offset = bp->b_addr;
 	int		orig_len = BBTOB(bp->b_length);
 	int		error, error2;
 
@@ -396,7 +396,7 @@ xlog_find_cycle_start(
 	xfs_daddr_t	*last_blk,
 	uint		cycle)
 {
-	xfs_caddr_t	offset;
+	char		*offset;
 	xfs_daddr_t	mid_blk;
 	xfs_daddr_t	end_blk;
 	uint		mid_cycle;
@@ -443,7 +443,7 @@ xlog_find_verify_cycle(
 	uint		cycle;
 	xfs_buf_t	*bp;
 	xfs_daddr_t	bufblks;
-	xfs_caddr_t	buf = NULL;
+	char		*buf = NULL;
 	int		error = 0;
 
 	/*
@@ -509,7 +509,7 @@ xlog_find_verify_log_record(
 {
 	xfs_daddr_t		i;
 	xfs_buf_t		*bp;
-	xfs_caddr_t		offset = NULL;
+	char			*offset = NULL;
 	xlog_rec_header_t	*head = NULL;
 	int			error = 0;
 	int			smallmem = 0;
@@ -616,7 +616,7 @@ xlog_find_head(
 	xfs_daddr_t	*return_head_blk)
 {
 	xfs_buf_t	*bp;
-	xfs_caddr_t	offset;
+	char		*offset;
 	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
 	int		num_scan_bblks;
 	uint		first_half_cycle, last_half_cycle;
@@ -891,7 +891,7 @@ xlog_find_tail(
 {
 	xlog_rec_header_t	*rhead;
 	xlog_op_header_t	*op_head;
-	xfs_caddr_t		offset = NULL;
+	char			*offset = NULL;
 	xfs_buf_t		*bp;
 	int			error, i, found;
 	xfs_daddr_t		umount_data_blk;
@@ -1099,7 +1099,7 @@ xlog_find_zeroed(
 	xfs_daddr_t	*blk_no)
 {
 	xfs_buf_t	*bp;
-	xfs_caddr_t	offset;
+	char		*offset;
 	uint	        first_cycle, last_cycle;
 	xfs_daddr_t	new_blk, last_blk, start_blk;
 	xfs_daddr_t     num_scan_bblks;
@@ -1199,7 +1199,7 @@ bp_err:
 STATIC void
 xlog_add_record(
 	struct xlog		*log,
-	xfs_caddr_t		buf,
+	char			*buf,
 	int			cycle,
 	int			block,
 	int			tail_cycle,
@@ -1227,7 +1227,7 @@ xlog_write_log_records(
 	int		tail_cycle,
 	int		tail_block)
 {
-	xfs_caddr_t	offset;
+	char		*offset;
 	xfs_buf_t	*bp;
 	int		balign, ealign;
 	int		sectbb = log->l_sectBBsize;
@@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer(
 			return -EFSCORRUPTED;
 		}
 
-		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
-					      next_unlinked_offset);
+		buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
 		*buffer_nextp = *logged_nextp;
 
 		/*
@@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer(
 		 * have to leave the inode in a consistent state for whoever
 		 * reads it next....
 		 */
-		xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+		xfs_dinode_calc_crc(mp,
 				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
 
 	}
@@ -2508,8 +2507,8 @@ xlog_recover_inode_pass2(
 	xfs_buf_t		*bp;
 	xfs_dinode_t		*dip;
 	int			len;
-	xfs_caddr_t		src;
-	xfs_caddr_t		dest;
+	char			*src;
+	char			*dest;
 	int			error;
 	int			attr_index;
 	uint			fields;
@@ -2551,7 +2550,7 @@ xlog_recover_inode_pass2(
 		goto out_release;
 	}
 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
-	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+	dip = xfs_buf_offset(bp, in_f->ilf_boffset);
 
 	/*
 	 * Make sure the place we're flushing out to really looks
@@ -2890,7 +2889,7 @@ xlog_recover_dquot_pass2(
 		return error;
 
 	ASSERT(bp);
-	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+	ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
 
 	/*
 	 * If the dquot has an LSN in it, recover the dquot only if it's less
@@ -3073,12 +3072,22 @@ xlog_recover_do_icreate_pass2(
 		return -EINVAL;
 	}
 
-	/* existing allocation is fixed value */
-	ASSERT(count == mp->m_ialloc_inos);
-	ASSERT(length == mp->m_ialloc_blks);
-	if (count != mp->m_ialloc_inos ||
-	     length != mp->m_ialloc_blks) {
-		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+	/*
+	 * The inode chunk is either full or sparse and we only support
+	 * m_ialloc_min_blks sized sparse allocations at this time.
+	 */
+	if (length != mp->m_ialloc_blks &&
+	    length != mp->m_ialloc_min_blks) {
+		xfs_warn(log->l_mp,
+			 "%s: unsupported chunk length", __FUNCTION__);
+		return -EINVAL;
+	}
+
+	/* verify inode count is consistent with extent length */
+	if ((count >> mp->m_sb.sb_inopblog) != length) {
+		xfs_warn(log->l_mp,
+			 "%s: inconsistent inode count and chunk length",
+			 __FUNCTION__);
 		return -EINVAL;
 	}
 
@@ -3096,8 +3105,8 @@ xlog_recover_do_icreate_pass2(
 			XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
 		return 0;
 
-	xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
-					be32_to_cpu(icl->icl_gen));
+	xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
+			      be32_to_cpu(icl->icl_gen));
 	return 0;
 }
 
@@ -3369,17 +3378,17 @@ STATIC int
 xlog_recover_add_to_cont_trans(
 	struct xlog		*log,
 	struct xlog_recover	*trans,
-	xfs_caddr_t		dp,
+	char			*dp,
 	int			len)
 {
 	xlog_recover_item_t	*item;
-	xfs_caddr_t		ptr, old_ptr;
+	char			*ptr, *old_ptr;
 	int			old_len;
 
 	if (list_empty(&trans->r_itemq)) {
 		/* finish copying rest of trans header */
 		xlog_recover_add_item(&trans->r_itemq);
-		ptr = (xfs_caddr_t) &trans->r_theader +
+		ptr = (char *)&trans->r_theader +
 				sizeof(xfs_trans_header_t) - len;
 		memcpy(ptr, dp, len);
 		return 0;
@@ -3415,12 +3424,12 @@ STATIC int
 xlog_recover_add_to_trans(
 	struct xlog		*log,
 	struct xlog_recover	*trans,
-	xfs_caddr_t		dp,
+	char			*dp,
 	int			len)
 {
 	xfs_inode_log_format_t	*in_f;			/* any will do */
 	xlog_recover_item_t	*item;
-	xfs_caddr_t		ptr;
+	char			*ptr;
 
 	if (!len)
 		return 0;
@@ -3509,7 +3518,7 @@ STATIC int
 xlog_recovery_process_trans(
 	struct xlog		*log,
 	struct xlog_recover	*trans,
-	xfs_caddr_t		dp,
+	char			*dp,
 	unsigned int		len,
 	unsigned int		flags,
 	int			pass)
@@ -3616,8 +3625,8 @@ xlog_recover_process_ophdr(
 	struct hlist_head	rhash[],
 	struct xlog_rec_header	*rhead,
 	struct xlog_op_header	*ohead,
-	xfs_caddr_t		dp,
-	xfs_caddr_t		end,
+	char			*dp,
+	char			*end,
 	int			pass)
 {
 	struct xlog_recover	*trans;
@@ -3666,11 +3675,11 @@ xlog_recover_process_data(
 	struct xlog		*log,
 	struct hlist_head	rhash[],
 	struct xlog_rec_header	*rhead,
-	xfs_caddr_t		dp,
+	char			*dp,
 	int			pass)
 {
 	struct xlog_op_header	*ohead;
-	xfs_caddr_t		end;
+	char			*end;
 	int			num_logops;
 	int			error;
 
@@ -3756,11 +3765,11 @@ xlog_recover_process_efi(
 	}
 
 	set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 	return error;
 
 abort_error:
-	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 	return error;
 }
 
@@ -3862,13 +3871,13 @@ xlog_recover_clear_agi_bucket(
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_error;
 	return;
 
 out_abort:
-	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 out_error:
 	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
 	return;
@@ -4015,7 +4024,7 @@ xlog_recover_process_iunlinks(
 STATIC int
 xlog_unpack_data_crc(
 	struct xlog_rec_header	*rhead,
-	xfs_caddr_t		dp,
+	char			*dp,
 	struct xlog		*log)
 {
 	__le32			crc;
@@ -4045,7 +4054,7 @@ xlog_unpack_data_crc(
 STATIC int
 xlog_unpack_data(
 	struct xlog_rec_header	*rhead,
-	xfs_caddr_t		dp,
+	char			*dp,
 	struct xlog		*log)
 {
 	int			i, j, k;
@@ -4127,7 +4136,7 @@ xlog_do_recovery_pass(
 {
 	xlog_rec_header_t	*rhead;
 	xfs_daddr_t		blk_no;
-	xfs_caddr_t		offset;
+	char			*offset;
 	xfs_buf_t		*hbp, *dbp;
 	int			error = 0, h_size;
 	int			bblks, split_bblks;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6f23fbdfb..461e791ef 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -725,6 +725,22 @@ xfs_mountfs(
 	}
 
 	/*
+	 * If enabled, sparse inode chunk alignment is expected to match the
+	 * cluster size. Full inode chunk alignment must match the chunk size,
+	 * but that is checked on sb read verification...
+	 */
+	if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+	    mp->m_sb.sb_spino_align !=
+			XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+		xfs_warn(mp,
+	"Sparse inode block alignment (%u) must match cluster size (%llu).",
+			 mp->m_sb.sb_spino_align,
+			 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+		error = -EINVAL;
+		goto out_remove_uuid;
+	}
+
+	/*
 	 * Set inode alignment fields
 	 */
 	xfs_set_inoalignment(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8c995a2cc..7999e91cd 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
 	__uint64_t		m_flags;	/* global mount flags */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
+	int			m_ialloc_min_blks;/* min blocks in sparse inode
+						   * allocation */
 	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
 	uint			m_qflags;	/* quota status flags */
 	struct xfs_trans_resv	m_resv;		/* precomputed res values */
@@ -179,6 +181,8 @@ typedef struct xfs_mount {
 						   allocator */
 #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
 
+#define XFS_MOUNT_DAX		(1ULL << 62)	/* TEST ONLY! */
+
 
 /*
  * Default minimum read and write sizes.
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 981a657ec..ab4a6066f 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -306,7 +306,7 @@ xfs_fs_commit_blocks(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		goto out_drop_iolock;
 	}
 
@@ -321,7 +321,7 @@ xfs_fs_commit_blocks(
 	}
 
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 
 out_drop_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5538468c7..eac9549ef 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -756,7 +756,7 @@ xfs_qm_qino_alloc(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
 				  XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -764,8 +764,7 @@ xfs_qm_qino_alloc(
 		error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
 								&committed);
 		if (error) {
-			xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-					 XFS_TRANS_ABORT);
+			xfs_trans_cancel(tp);
 			return error;
 		}
 	}
@@ -796,7 +795,7 @@ xfs_qm_qino_alloc(
 	spin_unlock(&mp->m_sb_lock);
 	xfs_log_sb(tp);
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
 		xfs_alert(mp, "%s failed (error %d)!", __func__, error);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9a25c9275..3640c6e89 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 		goto out_put;
 	}
@@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
 
 	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-				     XFS_TRANS_ABORT);
+		xfs_trans_cancel(tp);
 		goto out_unlock;
 	}
 
 	ASSERT(ip->i_d.di_nextents == 0);
 
 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -437,7 +436,7 @@ xfs_qm_scall_setqlim(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		goto out_rele;
 	}
 
@@ -548,7 +547,7 @@ xfs_qm_scall_setqlim(
 	dqp->dq_flags |= XFS_DQ_DIRTY;
 	xfs_trans_log_dquot(tp, dqp);
 
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 
 out_rele:
 	xfs_qm_dqrele(dqp);
@@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end(
 
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end(
 	 * We don't care about quotoff's performance.
 	 */
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-	return error;
+	return xfs_trans_commit(tp);
 }
 
 
@@ -605,7 +603,7 @@ xfs_qm_log_quotaoff(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		goto out;
 	}
 
@@ -624,7 +622,7 @@ xfs_qm_log_quotaoff(
 	 * We don't care about quotoff's performance.
 	 */
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto out;
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 5376dd406..ce6506ada 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -55,7 +55,6 @@ struct xfs_trans;
 typedef struct xfs_dqtrx {
 	struct xfs_dquot *qt_dquot;	  /* the dquot this refers to */
 	ulong		qt_blk_res;	  /* blks reserved on a dquot */
-	ulong		qt_blk_res_used;  /* blks used from the reservation */
 	ulong		qt_ino_res;	  /* inode reserved on a dquot */
 	ulong		qt_ino_res_used;  /* inodes used from the reservation */
 	long		qt_bcount_delta;  /* dquot blk count changes */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f2079b691..f4e8c06ee 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,7 +780,6 @@ xfs_growfs_rt_alloc(
 	 * Allocate space to the file, as necessary.
 	 */
 	while (oblocks < nblocks) {
-		int		cancelflags = 0;
 		xfs_trans_t	*tp;
 
 		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
@@ -792,7 +791,6 @@ xfs_growfs_rt_alloc(
 					  resblks, 0);
 		if (error)
 			goto error_cancel;
-		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
 		/*
 		 * Lock the inode.
 		 */
@@ -804,7 +802,6 @@ xfs_growfs_rt_alloc(
 		 * Allocate blocks to the bitmap file.
 		 */
 		nmap = 1;
-		cancelflags |= XFS_TRANS_ABORT;
 		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
 					XFS_BMAPI_METADATA, &firstblock,
 					resblks, &map, &nmap, &flist);
@@ -818,14 +815,13 @@ xfs_growfs_rt_alloc(
 		error = xfs_bmap_finish(&tp, &flist, &committed);
 		if (error)
 			goto error_cancel;
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_trans_commit(tp);
 		if (error)
 			goto error;
 		/*
 		 * Now we need to clear the allocated blocks.
 		 * Do this one block per transaction, to keep it simple.
 		 */
-		cancelflags = 0;
 		for (bno = map.br_startoff, fsbno = map.br_startblock;
 		     bno < map.br_startoff + map.br_blockcount;
 		     bno++, fsbno++) {
@@ -851,7 +847,7 @@ xfs_growfs_rt_alloc(
 			if (bp == NULL) {
 				error = -EIO;
 error_cancel:
-				xfs_trans_cancel(tp, cancelflags);
+				xfs_trans_cancel(tp);
 				goto error;
 			}
 			memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
@@ -859,7 +855,7 @@ error_cancel:
 			/*
 			 * Commit the transaction.
 			 */
-			error = xfs_trans_commit(tp, 0);
+			error = xfs_trans_commit(tp);
 			if (error)
 				goto error;
 		}
@@ -973,7 +969,6 @@ xfs_growfs_rt(
 	     bmbno < nrbmblocks;
 	     bmbno++) {
 		xfs_trans_t	*tp;
-		int		cancelflags = 0;
 
 		*nmp = *mp;
 		nsbp = &nmp->m_sb;
@@ -1015,7 +1010,6 @@ xfs_growfs_rt(
 		mp->m_rbmip->i_d.di_size =
 			nsbp->sb_rbmblocks * nsbp->sb_blocksize;
 		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-		cancelflags |= XFS_TRANS_ABORT;
 		/*
 		 * Get the summary inode into the transaction.
 		 */
@@ -1062,7 +1056,7 @@ xfs_growfs_rt(
 			nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
 		if (error) {
 error_cancel:
-			xfs_trans_cancel(tp, cancelflags);
+			xfs_trans_cancel(tp);
 			break;
 		}
 		/*
@@ -1076,7 +1070,7 @@ error_cancel:
 		mp->m_rsumlevels = nrsumlevels;
 		mp->m_rsumsize = nrsumsize;
 
-		error = xfs_trans_commit(tp, 0);
+		error = xfs_trans_commit(tp);
 		if (error)
 			break;
 	}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 858e1e62b..1fb16562c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
 #define MNTOPT_DISCARD	   "discard"	/* Discard unused blocks */
 #define MNTOPT_NODISCARD   "nodiscard"	/* Do not discard unused blocks */
 
+#define MNTOPT_DAX	"dax"		/* Enable direct access to bdev pages */
+
 /*
  * Table driven mount option parser.
  *
@@ -363,6 +365,10 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
 			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+		} else if (!strcmp(this_char, MNTOPT_DAX)) {
+			mp->m_flags |= XFS_MOUNT_DAX;
+#endif
 		} else {
 			xfs_warn(mp, "unknown mount option [%s].", this_char);
 			return -EINVAL;
@@ -452,8 +458,8 @@ done:
 }
 
 struct proc_xfs_info {
-	int	flag;
-	char	*str;
+	uint64_t	flag;
+	char		*str;
 };
 
 STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
 		{ XFS_MOUNT_DISCARD,		"," MNTOPT_DISCARD },
 		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_32BITINODE },
+		{ XFS_MOUNT_DAX,		"," MNTOPT_DAX },
 		{ 0, NULL }
 	};
 	static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
 	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
 		sb->s_flags |= MS_I_VERSION;
 
+	if (mp->m_flags & XFS_MOUNT_DAX) {
+		xfs_warn(mp,
+	"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		if (sb->s_blocksize != PAGE_SIZE) {
+			xfs_alert(mp,
+		"Filesystem block size invalid for DAX Turning DAX off.");
+			mp->m_flags &= ~XFS_MOUNT_DAX;
+		} else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+			xfs_alert(mp,
+		"Block device does not support DAX Turning DAX off.");
+			mp->m_flags &= ~XFS_MOUNT_DAX;
+		}
+	}
+
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 40c076523..4be27b021 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
 	struct xfs_bmap_free	free_list;
 	xfs_fsblock_t		first_block;
 	bool                    unlock_dp_on_error = false;
-	uint			cancel_flags;
 	int			committed;
 	xfs_fileoff_t		first_fsb;
 	xfs_filblks_t		fs_blocks;
@@ -224,7 +223,6 @@ xfs_symlink(
 		return error;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	/*
 	 * The symlink will fit into the inode data fork?
 	 * There can't be any attributes so we get the whole variable part.
@@ -239,10 +237,8 @@ xfs_symlink(
 		resblks = 0;
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
 	}
-	if (error) {
-		cancel_flags = 0;
+	if (error)
 		goto out_trans_cancel;
-	}
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
@@ -394,7 +390,7 @@ xfs_symlink(
 	if (error)
 		goto out_bmap_cancel;
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error)
 		goto out_release_inode;
 
@@ -407,9 +403,8 @@ xfs_symlink(
 
 out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
+	xfs_trans_cancel(tp);
 out_release_inode:
 	/*
 	 * Wait until after the current transaction is aborted to finish the
@@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt(
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt(
 	/*
 	 * Commit the transaction containing extent freeing and EFDs.
 	 */
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp);
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
 		goto error_unlock;
@@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt(
 error_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 error_trans_cancel:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_trans_cancel(tp);
 error_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 615781bf4..8d916d33d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
 		  __entry->blocks, __entry->shift, __entry->writeio_blocks)
 )
 
+TRACE_EVENT(xfs_irec_merge_pre,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+	TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(uint16_t, holemask)
+		__field(xfs_agino_t, nagino)
+		__field(uint16_t, nholemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->holemask = holemask;
+		__entry->nagino = nagino;
+		__entry->nholemask = holemask;
+	),
+	TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->agino, __entry->holemask, __entry->nagino,
+		  __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 uint16_t holemask),
+	TP_ARGS(mp, agno, agino, holemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(uint16_t, holemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->holemask = holemask;
+	),
+	TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->agno, __entry->agino,
+		  __entry->holemask)
+)
+
 #define DEFINE_IREF_EVENT(name) \
 DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 220ef2c90..0582a2710 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -113,7 +113,7 @@ xfs_trans_free(
  * blocks.  Locks and log items, however, are no inherited.  They must
  * be added to the new transaction explicitly.
  */
-xfs_trans_t *
+STATIC xfs_trans_t *
 xfs_trans_dup(
 	xfs_trans_t	*tp)
 {
@@ -251,14 +251,7 @@ xfs_trans_reserve(
 	 */
 undo_log:
 	if (resp->tr_logres > 0) {
-		int		log_flags;
-
-		if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
-			log_flags = XFS_LOG_REL_PERM_RESERV;
-		} else {
-			log_flags = 0;
-		}
-		xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+		xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
 		tp->t_ticket = NULL;
 		tp->t_log_res = 0;
 		tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -744,7 +737,7 @@ void
 xfs_trans_free_items(
 	struct xfs_trans	*tp,
 	xfs_lsn_t		commit_lsn,
-	int			flags)
+	bool			abort)
 {
 	struct xfs_log_item_desc *lidp, *next;
 
@@ -755,7 +748,7 @@ xfs_trans_free_items(
 
 		if (commit_lsn != NULLCOMMITLSN)
 			lip->li_ops->iop_committing(lip, commit_lsn);
-		if (flags & XFS_TRANS_ABORT)
+		if (abort)
 			lip->li_flags |= XFS_LI_ABORTED;
 		lip->li_ops->iop_unlock(lip);
 
@@ -892,27 +885,17 @@ xfs_trans_committed_bulk(
  * have already been unlocked as if the commit had succeeded.
  * Do not reference the transaction structure after this call.
  */
-int
-xfs_trans_commit(
+static int
+__xfs_trans_commit(
 	struct xfs_trans	*tp,
-	uint			flags)
+	bool			regrant)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	xfs_lsn_t		commit_lsn = -1;
 	int			error = 0;
-	int			log_flags = 0;
 	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
 	/*
-	 * Determine whether this commit is releasing a permanent
-	 * log reservation or not.
-	 */
-	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-		log_flags = XFS_LOG_REL_PERM_RESERV;
-	}
-
-	/*
 	 * If there is nothing to be logged by the transaction,
 	 * then unlock all of the items associated with the
 	 * transaction and free the transaction structure.
@@ -936,7 +919,7 @@ xfs_trans_commit(
 		xfs_trans_apply_sb_deltas(tp);
 	xfs_trans_apply_dquot_deltas(tp);
 
-	xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+	xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);
@@ -964,18 +947,25 @@ out_unreserve:
 	 */
 	xfs_trans_unreserve_and_mod_dquots(tp);
 	if (tp->t_ticket) {
-		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
 		if (commit_lsn == -1 && !error)
 			error = -EIO;
 	}
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
 	xfs_trans_free(tp);
 
 	XFS_STATS_INC(xs_trans_empty);
 	return error;
 }
 
+int
+xfs_trans_commit(
+	struct xfs_trans	*tp)
+{
+	return __xfs_trans_commit(tp, false);
+}
+
 /*
  * Unlock all of the transaction's items and free the transaction.
  * The transaction must not have modified any of its items, because
@@ -986,29 +976,22 @@ out_unreserve:
  */
 void
 xfs_trans_cancel(
-	xfs_trans_t		*tp,
-	int			flags)
+	struct xfs_trans	*tp)
 {
-	int			log_flags;
-	xfs_mount_t		*mp = tp->t_mountp;
+	struct xfs_mount	*mp = tp->t_mountp;
+	bool			dirty = (tp->t_flags & XFS_TRANS_DIRTY);
 
 	/*
-	 * See if the caller is being too lazy to figure out if
-	 * the transaction really needs an abort.
-	 */
-	if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
-		flags &= ~XFS_TRANS_ABORT;
-	/*
 	 * See if the caller is relying on us to shut down the
 	 * filesystem.  This happens in paths where we detect
 	 * corruption and decide to give up.
 	 */
-	if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+	if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
 		XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
-	if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+	if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
 		struct xfs_log_item_desc *lidp;
 
 		list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -1018,27 +1001,20 @@ xfs_trans_cancel(
 	xfs_trans_unreserve_and_mod_sb(tp);
 	xfs_trans_unreserve_and_mod_dquots(tp);
 
-	if (tp->t_ticket) {
-		if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-			ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-			log_flags = XFS_LOG_REL_PERM_RESERV;
-		} else {
-			log_flags = 0;
-		}
-		xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-	}
+	if (tp->t_ticket)
+		xfs_log_done(mp, tp->t_ticket, NULL, false);
 
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
 	xfs_trans_free(tp);
 }
 
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
- * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * committed with xfs_trans_commit(), but we still want as soon
  * as possible to let chunks of it go to the log. So we commit the
  * chunk we've been working on and get a new transaction to continue.
  */
@@ -1055,7 +1031,8 @@ xfs_trans_roll(
 	 * Ensure that the inode is always logged.
 	 */
 	trans = *tpp;
-	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+	if (dp)
+		xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
 
 	/*
 	 * Copy the critical parameters from one trans to the next.
@@ -1071,20 +1048,13 @@ xfs_trans_roll(
 	 * is in progress. The caller takes the responsibility to cancel
 	 * the duplicate transaction that gets returned.
 	 */
-	error = xfs_trans_commit(trans, 0);
+	error = __xfs_trans_commit(trans, true);
 	if (error)
 		return error;
 
 	trans = *tpp;
 
 	/*
-	 * transaction commit worked ok so we can drop the extra ticket
-	 * reference that we gained in xfs_trans_dup()
-	 */
-	xfs_log_ticket_put(trans->t_ticket);
-
-
-	/*
 	 * Reserve space in the log for th next transaction.
 	 * This also pushes items in the "AIL", the list of logged items,
 	 * out to disk if they are taking up space at the tail of the log
@@ -1100,6 +1070,7 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
-	xfs_trans_ijoin(trans, dp, 0);
+	if (dp)
+		xfs_trans_ijoin(trans, dp, 0);
 	return 0;
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b5bc1ab3c..3b21b4e5e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
  */
-#define	xfs_trans_get_log_res(tp)	((tp)->t_log_res)
-#define	xfs_trans_get_log_count(tp)	((tp)->t_log_count)
 #define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
 #define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
 
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
  */
 xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
 int		xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
 				  uint, uint);
 void		xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -228,9 +225,9 @@ void		xfs_trans_log_efd_extent(xfs_trans_t *,
 					 struct xfs_efd_log_item *,
 					 xfs_fsblock_t,
 					 xfs_extlen_t);
-int		xfs_trans_commit(xfs_trans_t *, uint flags);
+int		xfs_trans_commit(struct xfs_trans *);
 int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-void		xfs_trans_cancel(xfs_trans_t *, int);
+void		xfs_trans_cancel(xfs_trans_t *);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 573aefb5a..1098cf490 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next(
 {
 	struct xfs_log_item	*lip = cur->item;
 
-	if ((__psint_t)lip & 1)
+	if ((uintptr_t)lip & 1)
 		lip = xfs_ail_min(ailp);
 	if (lip)
 		cur->item = xfs_ail_next(ailp, lip);
@@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear(
 	list_for_each_entry(cur, &ailp->xa_cursors, list) {
 		if (cur->item == lip)
 			cur->item = (struct xfs_log_item *)
-					((__psint_t)cur->item | 1);
+					((uintptr_t)cur->item | 1);
 	}
 }
 
@@ -287,7 +287,7 @@ xfs_ail_splice(
 	 * find the place in the AIL where the items belong.
 	 */
 	lip = cur ? cur->item : NULL;
-	if (!lip || (__psint_t) lip & 1)
+	if (!lip || (uintptr_t)lip & 1)
 		lip = __xfs_trans_ail_cursor_last(ailp, lsn);
 
 	/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 76a16df55..ce78534a0 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
 	xfs_trans_t	*ntp)
 {
 	xfs_dqtrx_t	*oq, *nq;
-	int		i,j;
+	int		i, j;
 	xfs_dqtrx_t	*oqa, *nqa;
+	ulong		blk_res_used;
 
 	if (!otp->t_dqinfo)
 		return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
 	 * Because the quota blk reservation is carried forward,
 	 * it is also necessary to carry forward the DQ_DIRTY flag.
 	 */
-	if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+	if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
 		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
 
 	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
 		oqa = otp->t_dqinfo->dqs[j];
 		nqa = ntp->t_dqinfo->dqs[j];
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			blk_res_used = 0;
+
 			if (oqa[i].qt_dquot == NULL)
 				break;
 			oq = &oqa[i];
 			nq = &nqa[i];
 
+			if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+				blk_res_used = oq->qt_bcount_delta;
+
 			nq->qt_dquot = oq->qt_dquot;
 			nq->qt_bcount_delta = nq->qt_icount_delta = 0;
 			nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
 			/*
 			 * Transfer whatever is left of the reservations.
 			 */
-			nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
-			oq->qt_blk_res = oq->qt_blk_res_used;
+			nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+			oq->qt_blk_res = blk_res_used;
 
 			nq->qt_rtblk_res = oq->qt_rtblk_res -
 				oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
 		 * disk blocks used.
 		 */
 	      case XFS_TRANS_DQ_BCOUNT:
-		if (qtrx->qt_blk_res && delta > 0) {
-			qtrx->qt_blk_res_used += (ulong)delta;
-			ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
-		}
 		qtrx->qt_bcount_delta += delta;
 		break;
 
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
 			 * reservation that a transaction structure knows of.
 			 */
 			if (qtrx->qt_blk_res != 0) {
-				if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
-					if (qtrx->qt_blk_res >
-					    qtrx->qt_blk_res_used)
+				ulong blk_res_used = 0;
+
+				if (qtrx->qt_bcount_delta > 0)
+					blk_res_used = qtrx->qt_bcount_delta;
+
+				if (qtrx->qt_blk_res != blk_res_used) {
+					if (qtrx->qt_blk_res > blk_res_used)
 						dqp->q_res_bcount -= (xfs_qcnt_t)
 							(qtrx->qt_blk_res -
-							 qtrx->qt_blk_res_used);
+							 blk_res_used);
 					else
 						dqp->q_res_bcount -= (xfs_qcnt_t)
-							(qtrx->qt_blk_res_used -
+							(blk_res_used -
 							 qtrx->qt_blk_res);
 				}
 			} else {
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd1281862..1b7362945 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -30,7 +30,7 @@ void	xfs_trans_init(struct xfs_mount *);
 void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void	xfs_trans_del_item(struct xfs_log_item *);
 void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
-				int flags);
+				bool abort);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 void	xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
author	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-09-08 01:01:14 -0300
committer	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-09-08 01:01:14 -0300
commit	e5fd91f1ef340da553f7a79da9540c3db711c937 (patch)
tree	b11842027dc6641da63f4bcc524f8678263304a3 /fs
parent	2a9b0348e685a63d97486f6749622b61e9e3292f (diff)